Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.cpp
21521 views
1
// File: basisu_astc_hdr_6x6_enc.cpp
2
#include "basisu_astc_hdr_6x6_enc.h"
3
#include "basisu_enc.h"
4
#include "basisu_astc_hdr_common.h"
5
#include "basisu_math.h"
6
#include "basisu_resampler.h"
7
#include "basisu_resampler_filters.h"
8
9
#define MINIZ_HEADER_FILE_ONLY
10
#define MINIZ_NO_ZLIB_COMPATIBLE_NAMES
11
#include "basisu_miniz.h"
12
13
#include "3rdparty/android_astc_decomp.h"
14
15
#include <array>
16
17
using namespace basisu;
18
using namespace buminiz;
19
using namespace basist::astc_6x6_hdr;
20
21
namespace astc_6x6_hdr
22
{
23
24
// Atomically raises atomic_var to at least new_value (a lock-free "store max").
// Only relaxed ordering is used: the variable itself is updated, nothing else is published through it.
static void atomic_max(std::atomic<uint32_t>& atomic_var, uint32_t new_value)
{
	uint32_t current = atomic_var.load(std::memory_order_relaxed);

	// Nothing to do once the stored value is already large enough, so no read-modify-write is
	// issued in that case. A failed compare-exchange reloads 'current' with the latest value.
	while (current < new_value)
	{
		if (atomic_var.compare_exchange_weak(current, new_value, std::memory_order_relaxed, std::memory_order_relaxed))
			break;
	}
}
34
35
void astc_hdr_6x6_global_config::set_user_level(int level)
36
{
37
level = basisu::clamp<int>(level, 0, ASTC_HDR_6X6_MAX_USER_COMP_LEVEL);
38
39
m_master_comp_level = 0;
40
m_highest_comp_level = 0;
41
m_num_reuse_xy_deltas = NUM_REUSE_XY_DELTAS;
42
m_extra_patterns_flag = false;
43
m_brute_force_partition_matching = false;
44
45
switch (level)
46
{
47
case 0:
48
{
49
// Both reduce compression a lot when lambda>0
50
m_favor_higher_compression = false;
51
m_num_reuse_xy_deltas = NUM_REUSE_XY_DELTAS / 2;
52
break;
53
}
54
case 1:
55
{
56
m_master_comp_level = 0;
57
m_highest_comp_level = 0;
58
break;
59
}
60
case 2:
61
{
62
m_master_comp_level = 0;
63
m_highest_comp_level = 1;
64
break;
65
}
66
case 3:
67
{
68
m_master_comp_level = 1;
69
m_highest_comp_level = 1;
70
break;
71
}
72
case 4:
73
{
74
m_master_comp_level = 1;
75
m_highest_comp_level = 2;
76
break;
77
}
78
case 5:
79
{
80
m_master_comp_level = 1;
81
m_highest_comp_level = 3;
82
break;
83
}
84
case 6:
85
{
86
m_master_comp_level = 1;
87
m_highest_comp_level = 4;
88
break;
89
}
90
case 7:
91
{
92
m_master_comp_level = 2;
93
m_highest_comp_level = 2;
94
break;
95
}
96
case 8:
97
{
98
m_master_comp_level = 2;
99
m_highest_comp_level = 3;
100
break;
101
}
102
case 9:
103
{
104
m_master_comp_level = 2;
105
m_highest_comp_level = 4;
106
break;
107
}
108
case 10:
109
{
110
m_master_comp_level = 3;
111
m_highest_comp_level = 3;
112
break;
113
}
114
case 11:
115
{
116
m_master_comp_level = 3;
117
m_highest_comp_level = 4;
118
break;
119
}
120
case 12:
121
default:
122
{
123
m_master_comp_level = 4;
124
m_highest_comp_level = 4;
125
m_extra_patterns_flag = true;
126
m_brute_force_partition_matching = true;
127
break;
128
}
129
}
130
}
131
132
// Constants of the PQ transfer function used by forwardPQ()/inversePQ().
// Each value is an exact ratio, as noted.
constexpr float m1 = 0.1593017578125f; // 2610 / 2^14
constexpr float m2 = 78.84375f; // 2523 / 32
constexpr float c1 = 0.8359375f; // 3424 / 2^12
constexpr float c2 = 18.8515625f; // 2413 / 128
constexpr float c3 = 18.6875f; // 2392 / 128
137
138
static float forwardPQ(float Y)
139
{
140
// 10,000 here is an absolute scale - it's in nits (cd per square meter)
141
float L = Y * (1.0f / 10000.0f);
142
143
float num = powf(L, m1);
144
float N = powf((c1 + c2 * num) / (1 + c3 * num), m2);
145
146
return N;
147
}
148
149
#if 0
150
static float inversePQ(float E)
151
{
152
float N = powf(E, 1.0f / m2);
153
154
float num = basisu::maximum<float>((N - c1), 0.0f) / (c2 - c3 * N);
155
float L = powf(num, 1.0f / m1);
156
157
return L * 10000.0f;
158
}
159
#endif
160
161
// PQ function approximation: convert input to bfloat16, look up in tables, linear interpolation between adjacent table entries.
162
// max_er: 0.000023007392883, max_rel_er: 0.000023472490284, avg_er: 0.000004330495689, 6-7x faster on x86
163
// Highest error is for values less than SMALLEST_PQ_VAL_IN.
164
//
165
// Approximation is round trip lossless for 10-12 bits at [0,10000] nits:
166
// for x [0,1023] (SCALE=1023) or for x [0,4095] (SCALE=4095):
167
// round(forwardPQTab(inversePQ(x / SCALE)) * SCALE) == x
168
//
169
// bfloat16 has enough precision to handle 8-bit sRGB to linear conversions:
170
// round(linear_to_srgb(bfloat16_to_float(float_to_bfloat16(srgb_to_linear(isRGB/255.0f))))*255.0) is lossless
171
172
// Exponent range covered by the PQ lookup table (both ends inclusive).
const int PQ_APPROX_MIN_EXP = -16;
const int PQ_APPROX_MAX_EXP = 16;
const int PQ_APPROX_EXP_RANGE = (PQ_APPROX_MAX_EXP - PQ_APPROX_MIN_EXP + 1);

// Inputs with an exponent below PQ_APPROX_MIN_EXP are approximated by a straight line from 0 to SMALLEST_PQ_VAL.
const float SMALLEST_PQ_VAL_IN = 0.000015258829080f;
const float SMALLEST_PQ_VAL = 0.000551903737f; // forwardPQ(SMALLEST_PQ_VAL_IN)

// Returned for inputs whose exponent is above PQ_APPROX_MAX_EXP.
const float LARGEST_PQ_VAL = 1.251312f;

// One row per exponent, one column per 7-bit mantissa value; filled in by init_pq_tables().
float g_pq_approx_tabs[PQ_APPROX_EXP_RANGE][128];
181
182
static void init_pq_tables()
183
{
184
for (int exp = PQ_APPROX_MIN_EXP; exp <= PQ_APPROX_MAX_EXP; exp++)
185
{
186
for (int mant = 0; mant < 128; mant++)
187
{
188
bfloat16 b = bfloat16_init(1, exp, mant);
189
float bf = bfloat16_to_float(b);
190
191
float pq = forwardPQ(bf);
192
193
g_pq_approx_tabs[exp - PQ_APPROX_MIN_EXP][mant] = pq;
194
}
195
}
196
197
//fmt_printf("{.15} {.15}\n", g_pq_approx_tabs[0][0], inversePQ(g_pq_approx_tabs[0][0]));
198
//fmt_printf("{.15}\n", forwardPQ(SMALLEST_PQ_VAL_IN));
199
}
200
201
static inline float forwardPQTab(float v)
202
{
203
assert(g_pq_approx_tabs[0][0]);
204
205
assert(v >= 0.0f);
206
if (v == 0.0f)
207
return 0.0f;
208
209
bfloat16 bf = float_to_bfloat16(v, false);
210
assert(v >= bfloat16_to_float(bf));
211
212
int exp = bfloat16_get_exp(bf);
213
214
if (exp < PQ_APPROX_MIN_EXP)
215
{
216
// not accurate but should be good enough for our uses
217
return lerp(0.0f, SMALLEST_PQ_VAL, minimum(1.0f, v / SMALLEST_PQ_VAL_IN));
218
}
219
else if (exp > PQ_APPROX_MAX_EXP)
220
return LARGEST_PQ_VAL;
221
222
int mant = bfloat16_get_mantissa(bf);
223
224
float a = g_pq_approx_tabs[exp - PQ_APPROX_MIN_EXP][mant];
225
float bf_f32 = bfloat16_to_float(bf);
226
227
int next_mant = mant + 1;
228
int next_exp = exp;
229
if (next_mant == 128)
230
{
231
next_mant = 0;
232
next_exp++;
233
if (next_exp > PQ_APPROX_MAX_EXP)
234
return a;
235
}
236
237
float b = g_pq_approx_tabs[next_exp - PQ_APPROX_MIN_EXP][next_mant];
238
239
bfloat16 next_bf = bfloat16_init(1, next_exp, next_mant);
240
float next_bf_f32 = bfloat16_to_float(next_bf);
241
assert(v <= next_bf_f32);
242
243
float lerp_factor = (v - bf_f32) / (next_bf_f32 - bf_f32);
244
assert((lerp_factor >= 0) && (lerp_factor <= 1.0f));
245
246
return lerp(a, b, lerp_factor);
247
}
248
249
// 100 nits = ~.5 i
250
// This converts absolute linear RGB light in either REC 709 or REC2020/BT2100 color gamut to ICtCp, a coding space where Ct is scaled by 2.
251
// To convert to perceptual ITP for error/distance calculations, multiply the result Ct by .5 (or set itp_flag to true).
252
// Assumes REC 709 input, or REC 2020/BT.2100 RGB input if rec2020_bt2100_color_gamut is true.
253
//
254
// ITP info:
255
// https://www.portrait.com/resource-center/ictcp-color-difference-metric/
256
// https://professional.dolby.com/siteassets/pdfs/measuringperceptualcolorvolume_v07.253.pdf (see scale to JND's)
257
// This also converts from a ICtCp coding space to threshold or perceptually uniform space ITP.
258
//
259
// Linear REC709 to REC2020/BT.2100 gamut conversion:
260
// rgb_2100[0] = rgb_in[0] * 0.6274f + rgb_in[1] * 0.3293f + rgb_in[2] * 0.0433f;
261
// rgb_2100[1] = rgb_in[0] * 0.0691f + rgb_in[1] * 0.9195f + rgb_in[2] * 0.0114f;
262
// rgb_2100[2] = rgb_in[0] * 0.0164f + rgb_in[1] * 0.0880f + rgb_in[2] * 0.8956f;
263
// const float S = 1.0f / 4096.0f;
264
// l = (1688.0f * S) * rgb_2100[0] + (2146.0f * S) * rgb_2100[1] + (262.0f * S) * rgb_2100[2];
265
// m = (683.0f * S) * rgb_2100[0] + (2951.0f * S) * rgb_2100[1] + (462.0f * S) * rgb_2100[2];
266
// s = (99.0f * S) * rgb_2100[0] + (309.0f * S) * rgb_2100[1] + (3688.0f * S) * rgb_2100[2];
267
static void linear_rgb_to_ictcp(const vec3F& rgb_in, vec3F& ictcp, bool itp_flag = false, bool rec2020_bt2100_color_gamut = false)
268
{
269
vec3F rgb_2100(rgb_in);
270
271
float l, m, s;
272
if (!rec2020_bt2100_color_gamut)
273
{
274
// Assume REC 709 input color gamut
275
// (REC2020_to_LMS * REC709_to_2020) * input_color
276
l = rgb_2100[0] * 0.2958097f + rgb_2100[1] * 0.6230863f + rgb_2100[2] * 0.0811040f;
277
m = rgb_2100[0] * 0.1562512f + rgb_2100[1] * 0.7272980f + rgb_2100[2] * 0.1164508f;
278
s = rgb_2100[0] * 0.0351435f + rgb_2100[1] * 0.1565601f + rgb_2100[2] * 0.8082964f;
279
}
280
else
281
{
282
// Assumes REC2020/BT.2100 input color gamut (this is from the spec)
283
l = 0.412109375f * rgb_2100[0] + 0.52392578125f * rgb_2100[1] + 0.06396484375f * rgb_2100[2];
284
m = 0.166748046875f * rgb_2100[0] + 0.720458984375f * rgb_2100[1] + 0.11279296875f * rgb_2100[2];
285
s = 0.024169921875f * rgb_2100[0] + 0.075439453125f * rgb_2100[1] + 0.900390625f * rgb_2100[2];
286
}
287
288
float ld = forwardPQTab(l);
289
float md = forwardPQTab(m);
290
float sd = forwardPQTab(s);
291
292
ictcp[0] = .5f * ld + .5f * md;
293
294
// if ITP scale Ct by .5 (the ICtCp spec scaled Ct to better exploit the full scaled output, which is not perceptually linear)
295
if (itp_flag)
296
ictcp[1] = 0.806884765625f * ld + -1.6617431640625f * md + 0.8548583984375f * sd;
297
else
298
ictcp[1] = 1.61376953125f * ld + -3.323486328125f * md + 1.709716796875f * sd;
299
300
ictcp[2] = 4.378173828125f * ld + -4.24560546875f * md + -0.132568359375f * sd;
301
}
302
303
static inline void linear_rgb_to_itp(const vec3F& rgb_in, vec3F& itp, const astc_hdr_6x6_global_config &cfg)
304
{
305
linear_rgb_to_ictcp(rgb_in, itp, true, cfg.m_rec2020_bt2100_color_gamut);
306
}
307
308
#if 0
309
// Outputs rec2020/bt2100 color gamut (i.e. this doesn't convert back to REC709 gamut).
310
static void ictcp_to_linear_rgb(const vec3F& ictcp, vec3F& rgb, bool itp_flag = false)
311
{
312
float ct = ictcp[1];
313
314
if (itp_flag)
315
ct *= 2.0f;
316
317
float ld = ictcp[0] + ct * 0.008609037037932726f + ictcp[2] * 0.11102962500302596f;
318
float md = ictcp[0] + ct * -0.008609037037932726f + ictcp[2] * -0.11102962500302596f;
319
float sd = ictcp[0] + ct * 0.5600313357106792f + ictcp[2] * -0.32062717498731885f;
320
321
float l = inversePQ(ld);
322
float m = inversePQ(md);
323
float s = inversePQ(sd);
324
325
rgb[0] = l * 3.436606694333079f + m * -2.5064521186562705f + s * 0.06984542432319149f;
326
rgb[1] = l * -0.7913295555989289f + m * 1.983600451792291f + s * -0.192270896193362f;
327
rgb[2] = l * -0.025949899690592672f + m * -0.09891371471172646f + s * 1.1248636144023192f;
328
}
329
#endif
330
331
struct half_vec3
332
{
333
basist::half_float m_vals[3];
334
335
inline half_vec3() { }
336
337
inline half_vec3(basist::half_float x, basist::half_float y, basist::half_float z)
338
{
339
m_vals[0] = x;
340
m_vals[1] = y;
341
m_vals[2] = z;
342
}
343
344
inline half_vec3(const half_vec3& other)
345
{
346
*this = other;
347
}
348
349
inline half_vec3& operator= (const half_vec3& rhs)
350
{
351
m_vals[0] = rhs.m_vals[0];
352
m_vals[1] = rhs.m_vals[1];
353
m_vals[2] = rhs.m_vals[2];
354
return *this;
355
}
356
357
inline void clear()
358
{
359
clear_obj(m_vals);
360
}
361
362
inline half_vec3 &set(basist::half_float x, basist::half_float y, basist::half_float z)
363
{
364
m_vals[0] = x;
365
m_vals[1] = y;
366
m_vals[2] = z;
367
return *this;
368
}
369
370
inline half_vec3& set(float x, float y, float z)
371
{
372
m_vals[0] = basist::float_to_half(x);
373
m_vals[1] = basist::float_to_half(y);
374
m_vals[2] = basist::float_to_half(z);
375
return *this;
376
}
377
378
template<typename T>
379
inline half_vec3& set_vec(const T& vec)
380
{
381
m_vals[0] = basist::float_to_half(vec[0]);
382
m_vals[1] = basist::float_to_half(vec[1]);
383
m_vals[2] = basist::float_to_half(vec[2]);
384
return *this;
385
}
386
387
template<typename T>
388
inline T get_vec() const
389
{
390
return T(basist::half_to_float(m_vals[0]), basist::half_to_float(m_vals[1]), basist::half_to_float(m_vals[2]));
391
}
392
393
inline basist::half_float operator[] (uint32_t c) const { assert(c < 3); return m_vals[c]; }
394
inline basist::half_float& operator[] (uint32_t c) { assert(c < 3); return m_vals[c]; }
395
396
float get_float_comp(uint32_t c) const
397
{
398
assert(c < 3);
399
return basist::half_to_float(m_vals[c]);
400
}
401
402
half_vec3& set_float_comp(uint32_t c, float v)
403
{
404
assert(c < 3);
405
m_vals[c] = basist::float_to_half(v);
406
return *this;
407
}
408
};
409
410
struct half_vec4
411
{
412
basist::half_float m_vals[4];
413
414
inline half_vec4() { }
415
416
inline half_vec4(basist::half_float x, basist::half_float y, basist::half_float z, basist::half_float w)
417
{
418
m_vals[0] = x;
419
m_vals[1] = y;
420
m_vals[2] = z;
421
m_vals[3] = w;
422
}
423
424
inline half_vec4(const half_vec4& other)
425
{
426
*this = other;
427
}
428
429
inline half_vec4& operator= (const half_vec4& rhs)
430
{
431
m_vals[0] = rhs.m_vals[0];
432
m_vals[1] = rhs.m_vals[1];
433
m_vals[2] = rhs.m_vals[2];
434
m_vals[3] = rhs.m_vals[3];
435
return *this;
436
}
437
438
inline void clear()
439
{
440
clear_obj(m_vals);
441
}
442
443
inline half_vec4& set(basist::half_float x, basist::half_float y, basist::half_float z, basist::half_float w)
444
{
445
m_vals[0] = x;
446
m_vals[1] = y;
447
m_vals[2] = z;
448
m_vals[3] = w;
449
return *this;
450
}
451
452
inline half_vec4& set(float x, float y, float z, float w)
453
{
454
m_vals[0] = basist::float_to_half(x);
455
m_vals[1] = basist::float_to_half(y);
456
m_vals[2] = basist::float_to_half(z);
457
m_vals[3] = basist::float_to_half(w);
458
return *this;
459
}
460
461
template<typename T>
462
inline half_vec4& set_vec(const T& vec)
463
{
464
m_vals[0] = basist::float_to_half(vec[0]);
465
m_vals[1] = basist::float_to_half(vec[1]);
466
m_vals[2] = basist::float_to_half(vec[2]);
467
m_vals[3] = basist::float_to_half(vec[3]);
468
return *this;
469
}
470
471
template<typename T>
472
inline T get_vec() const
473
{
474
return T(basist::half_to_float(m_vals[0]), basist::half_to_float(m_vals[1]), basist::half_to_float(m_vals[2]), basist::half_to_float(m_vals[3]));
475
}
476
477
inline basist::half_float operator[] (uint32_t c) const { assert(c < 4); return m_vals[c]; }
478
inline basist::half_float &operator[] (uint32_t c) { assert(c < 4); return m_vals[c]; }
479
480
float get_float_comp(uint32_t c) const
481
{
482
assert(c < 4);
483
return basist::half_to_float(m_vals[c]);
484
}
485
486
half_vec4& set_float_comp(uint32_t c, float v)
487
{
488
assert(c < 4);
489
m_vals[c] = basist::float_to_half(v);
490
return *this;
491
}
492
};
493
494
// Largest block footprint, in texels, handled by this encoder.
const uint32_t MAX_BLOCK_W = 6;
const uint32_t MAX_BLOCK_H = 6;
495
496
struct trial_result
497
{
498
astc_helpers::log_astc_block m_log_blk;
499
double m_err;
500
bool m_valid;
501
};
502
503
//----------------------------------------------------------
504
505
// All 3! = 6 ways of relabeling the subsets of a 3-subset partition.
// g_part3_mapping[permutation][old_label] gives the new label.
const uint32_t NUM_PART3_MAPPINGS = 6;
static uint8_t g_part3_mapping[NUM_PART3_MAPPINGS][3] =
{
	// identity and its two rotations
	{ 0, 1, 2 }, { 1, 2, 0 }, { 2, 0, 1 },
	// the three single swaps
	{ 0, 2, 1 }, { 1, 0, 2 }, { 2, 1, 0 }
};
515
516
struct partition_pattern_vec
517
{
518
uint8_t m_parts[6 * 6];
519
520
partition_pattern_vec()
521
{
522
clear();
523
}
524
525
partition_pattern_vec(const partition_pattern_vec& other)
526
{
527
*this = other;
528
}
529
530
void clear()
531
{
532
memset(m_parts, 0, sizeof(m_parts));
533
}
534
535
partition_pattern_vec& operator= (const partition_pattern_vec& rhs)
536
{
537
if (this == &rhs)
538
return *this;
539
memcpy(m_parts, rhs.m_parts, 36);
540
return *this;
541
}
542
543
uint8_t operator[] (uint32_t i) const { assert(i < 36); return m_parts[i]; }
544
uint8_t& operator[] (uint32_t i) { assert(i < 36); return m_parts[i]; }
545
546
uint8_t operator() (uint32_t x, uint32_t y) const { assert((x < 6) && (y < 6)); return m_parts[x + y * 6]; }
547
uint8_t& operator() (uint32_t x, uint32_t y) { assert((x < 6) && (y < 6)); return m_parts[x + y * 6]; }
548
549
int get_squared_distance(const partition_pattern_vec& other) const
550
{
551
int total_dist = 0;
552
for (uint32_t i = 0; i < 36; i++)
553
total_dist += iabs((int)m_parts[i] - (int)other.m_parts[i]);
554
return total_dist;
555
}
556
557
float get_distance(const partition_pattern_vec& other) const
558
{
559
return sqrtf((float)get_squared_distance(other));
560
}
561
562
partition_pattern_vec get_permuted2(uint32_t permute_index) const
563
{
564
assert(permute_index <= 1);
565
566
partition_pattern_vec res;
567
for (uint32_t i = 0; i < 36; i++)
568
{
569
assert(m_parts[i] <= 1);
570
res.m_parts[i] = (uint8_t)(m_parts[i] ^ permute_index);
571
}
572
573
return res;
574
}
575
576
partition_pattern_vec get_permuted3(uint32_t permute_index) const
577
{
578
assert(permute_index <= 5);
579
580
partition_pattern_vec res;
581
for (uint32_t i = 0; i < 36; i++)
582
{
583
assert(m_parts[i] <= 2);
584
res.m_parts[i] = g_part3_mapping[permute_index][m_parts[i]];
585
}
586
587
return res;
588
}
589
590
partition_pattern_vec get_canonicalized() const
591
{
592
partition_pattern_vec res;
593
594
int new_labels[3] = { -1, -1, -1 };
595
uint32_t next_index = 0;
596
for (uint32_t i = 0; i < 36; i++)
597
{
598
uint32_t p = m_parts[i];
599
if (new_labels[p] == -1)
600
new_labels[p] = next_index++;
601
602
res.m_parts[i] = (uint8_t)new_labels[p];
603
}
604
605
return res;
606
}
607
608
bool operator== (const partition_pattern_vec& rhs) const
609
{
610
return memcmp(m_parts, rhs.m_parts, sizeof(m_parts)) == 0;
611
}
612
613
operator size_t() const
614
{
615
return basisu::hash_hsieh(m_parts, sizeof(m_parts));
616
}
617
};
618
619
struct vp_tree_node
620
{
621
partition_pattern_vec m_vantage_point;
622
uint32_t m_point_index;
623
float m_dist;
624
625
int m_inner_node, m_outer_node;
626
};
627
628
#define BRUTE_FORCE_PART_SEARCH (0)
629
630
class vp_tree
631
{
632
public:
633
vp_tree()
634
{
635
}
636
637
void clear()
638
{
639
m_nodes.clear();
640
}
641
642
// This requires no redundant patterns, i.e. all must be unique.
643
bool init(uint32_t n, const partition_pattern_vec* pUnique_pats)
644
{
645
clear();
646
647
uint_vec pat_indices(n);
648
for (uint32_t i = 0; i < n; i++)
649
pat_indices[i] = i;
650
651
std::pair<int, float> root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices);
652
653
if (root_idx.first == -1)
654
return false;
655
656
m_nodes.resize(1);
657
m_nodes[0].m_vantage_point = pUnique_pats[root_idx.first];
658
m_nodes[0].m_point_index = root_idx.first;
659
m_nodes[0].m_dist = root_idx.second;
660
m_nodes[0].m_inner_node = -1;
661
m_nodes[0].m_outer_node = -1;
662
663
uint_vec inner_list, outer_list;
664
665
inner_list.reserve(n / 2);
666
outer_list.reserve(n / 2);
667
668
for (uint32_t pat_index = 0; pat_index < n; pat_index++)
669
{
670
if ((int)pat_index == root_idx.first)
671
continue;
672
673
const float dist = m_nodes[0].m_vantage_point.get_distance(pUnique_pats[pat_index]);
674
675
if (dist <= root_idx.second)
676
inner_list.push_back(pat_index);
677
else
678
outer_list.push_back(pat_index);
679
}
680
681
if (inner_list.size())
682
{
683
m_nodes[0].m_inner_node = create_node(n, pUnique_pats, inner_list);
684
if (m_nodes[0].m_inner_node < 0)
685
return false;
686
}
687
688
if (outer_list.size())
689
{
690
m_nodes[0].m_outer_node = create_node(n, pUnique_pats, outer_list);
691
if (m_nodes[0].m_outer_node < 0)
692
return false;
693
}
694
695
return true;
696
}
697
698
struct result
699
{
700
uint32_t m_pat_index;
701
uint32_t m_mapping_index;
702
float m_dist;
703
704
bool operator< (const result& rhs) const { return m_dist < rhs.m_dist; }
705
bool operator> (const result& rhs) const { return m_dist > rhs.m_dist; }
706
};
707
708
class result_queue
709
{
710
enum { MaxSupportedSize = 256 + 1 };
711
712
public:
713
result_queue() :
714
m_cur_size(0)
715
{
716
}
717
718
size_t get_size() const
719
{
720
return m_cur_size;
721
}
722
723
bool empty() const
724
{
725
return !m_cur_size;
726
}
727
728
typedef std::array<result, MaxSupportedSize + 1> result_array_type;
729
730
const result_array_type& get_elements() const { return m_elements; }
731
result_array_type& get_elements() { return m_elements; }
732
733
void clear()
734
{
735
m_cur_size = 0;
736
}
737
738
void reserve(uint32_t n)
739
{
740
BASISU_NOTE_UNUSED(n);
741
}
742
743
const result& top() const
744
{
745
assert(m_cur_size);
746
return m_elements[1];
747
}
748
749
bool insert(const result& val, uint32_t max_size)
750
{
751
assert(max_size < MaxSupportedSize);
752
753
if (m_cur_size >= MaxSupportedSize)
754
return false;
755
756
m_elements[++m_cur_size] = val;
757
up_heap(m_cur_size);
758
759
if (m_cur_size > max_size)
760
pop();
761
762
return true;
763
}
764
765
bool pop()
766
{
767
if (m_cur_size == 0)
768
return false;
769
770
m_elements[1] = m_elements[m_cur_size--];
771
down_heap(1);
772
return true;
773
}
774
775
float get_highest_dist() const
776
{
777
if (!m_cur_size)
778
return 0.0f;
779
780
return top().m_dist;
781
}
782
783
private:
784
result_array_type m_elements;
785
size_t m_cur_size;
786
787
void up_heap(size_t index)
788
{
789
while ((index > 1) && (m_elements[index] > m_elements[index >> 1]))
790
{
791
std::swap(m_elements[index], m_elements[index >> 1]);
792
index >>= 1;
793
}
794
}
795
796
void down_heap(size_t index)
797
{
798
for ( ; ; )
799
{
800
size_t largest = index, left_child = 2 * index, right_child = 2 * index + 1;
801
802
if ((left_child <= m_cur_size) && (m_elements[left_child] > m_elements[largest]))
803
largest = left_child;
804
805
if ((right_child <= m_cur_size) && (m_elements[right_child] > m_elements[largest]))
806
largest = right_child;
807
808
if (largest == index)
809
break;
810
811
std::swap(m_elements[index], m_elements[largest]);
812
index = largest;
813
}
814
}
815
};
816
817
void find_nearest(uint32_t num_subsets, const partition_pattern_vec& desired_pat, result_queue& results, uint32_t max_results)
818
{
819
assert((num_subsets >= 2) && (num_subsets <= 3));
820
821
results.clear();
822
823
if (!m_nodes.size())
824
return;
825
826
uint32_t num_desired_pats;
827
partition_pattern_vec desired_pats[NUM_PART3_MAPPINGS];
828
829
if (num_subsets == 2)
830
{
831
num_desired_pats = 2;
832
for (uint32_t i = 0; i < 2; i++)
833
desired_pats[i] = desired_pat.get_permuted2(i);
834
}
835
else
836
{
837
num_desired_pats = NUM_PART3_MAPPINGS;
838
for (uint32_t i = 0; i < NUM_PART3_MAPPINGS; i++)
839
desired_pats[i] = desired_pat.get_permuted3(i);
840
}
841
842
#if 0
843
find_nearest_at_node(0, num_desired_pats, desired_pats, results, max_results);
844
#else
845
find_nearest_at_node_non_recursive(0, num_desired_pats, desired_pats, results, max_results);
846
#endif
847
}
848
849
private:
850
basisu::vector<vp_tree_node> m_nodes;
851
852
void find_nearest_at_node(int node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results)
853
{
854
float best_dist_to_vantage = BIG_FLOAT_VAL;
855
uint32_t best_mapping = 0;
856
for (uint32_t i = 0; i < num_desired_pats; i++)
857
{
858
float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point);
859
if (dist < best_dist_to_vantage)
860
{
861
best_dist_to_vantage = dist;
862
best_mapping = i;
863
}
864
}
865
866
result r;
867
r.m_dist = best_dist_to_vantage;
868
r.m_mapping_index = best_mapping;
869
r.m_pat_index = m_nodes[node_index].m_point_index;
870
871
results.insert(r, max_results);
872
873
if (best_dist_to_vantage <= m_nodes[node_index].m_dist)
874
{
875
// inner first
876
if (m_nodes[node_index].m_inner_node >= 0)
877
find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results);
878
879
if (m_nodes[node_index].m_outer_node >= 0)
880
{
881
if ( (results.get_size() < max_results) ||
882
((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist())
883
)
884
{
885
find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results);
886
}
887
}
888
}
889
else
890
{
891
// outer first
892
if (m_nodes[node_index].m_outer_node >= 0)
893
find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results);
894
895
if (m_nodes[node_index].m_inner_node >= 0)
896
{
897
if ( (results.get_size() < max_results) ||
898
((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist())
899
)
900
{
901
find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results);
902
}
903
}
904
}
905
}
906
907
void find_nearest_at_node_non_recursive(int init_node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results)
908
{
909
uint_vec node_stack;
910
node_stack.reserve(16);
911
node_stack.push_back(init_node_index);
912
913
do
914
{
915
const uint32_t node_index = node_stack.back();
916
node_stack.pop_back();
917
918
float best_dist_to_vantage = BIG_FLOAT_VAL;
919
uint32_t best_mapping = 0;
920
for (uint32_t i = 0; i < num_desired_pats; i++)
921
{
922
float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point);
923
if (dist < best_dist_to_vantage)
924
{
925
best_dist_to_vantage = dist;
926
best_mapping = i;
927
}
928
}
929
930
result r;
931
r.m_dist = best_dist_to_vantage;
932
r.m_mapping_index = best_mapping;
933
r.m_pat_index = m_nodes[node_index].m_point_index;
934
935
results.insert(r, max_results);
936
937
if (best_dist_to_vantage <= m_nodes[node_index].m_dist)
938
{
939
if (m_nodes[node_index].m_outer_node >= 0)
940
{
941
if ((results.get_size() < max_results) ||
942
((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist())
943
)
944
{
945
node_stack.push_back(m_nodes[node_index].m_outer_node);
946
}
947
}
948
949
// inner first
950
if (m_nodes[node_index].m_inner_node >= 0)
951
{
952
node_stack.push_back(m_nodes[node_index].m_inner_node);
953
}
954
}
955
else
956
{
957
if (m_nodes[node_index].m_inner_node >= 0)
958
{
959
if ((results.get_size() < max_results) ||
960
((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist())
961
)
962
{
963
node_stack.push_back(m_nodes[node_index].m_inner_node);
964
}
965
}
966
967
// outer first
968
if (m_nodes[node_index].m_outer_node >= 0)
969
{
970
node_stack.push_back(m_nodes[node_index].m_outer_node);
971
}
972
}
973
974
} while (!node_stack.empty());
975
}
976
977
// returns the index of the new node, or -1 on error
978
int create_node(uint32_t n, const partition_pattern_vec* pUnique_pats, const uint_vec& pat_indices)
979
{
980
std::pair<int, float> root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices);
981
982
if (root_idx.first < 0)
983
return -1;
984
985
m_nodes.resize(m_nodes.size() + 1);
986
const uint32_t new_node_index = m_nodes.size_u32() - 1;
987
988
m_nodes[new_node_index].m_vantage_point = pUnique_pats[root_idx.first];
989
m_nodes[new_node_index].m_point_index = root_idx.first;
990
m_nodes[new_node_index].m_dist = root_idx.second;
991
m_nodes[new_node_index].m_inner_node = -1;
992
m_nodes[new_node_index].m_outer_node = -1;
993
994
uint_vec inner_list, outer_list;
995
996
inner_list.reserve(pat_indices.size_u32() / 2);
997
outer_list.reserve(pat_indices.size_u32() / 2);
998
999
for (uint32_t pat_indices_iter = 0; pat_indices_iter < pat_indices.size(); pat_indices_iter++)
1000
{
1001
const uint32_t pat_index = pat_indices[pat_indices_iter];
1002
1003
if ((int)pat_index == root_idx.first)
1004
continue;
1005
1006
const float dist = m_nodes[new_node_index].m_vantage_point.get_distance(pUnique_pats[pat_index]);
1007
1008
if (dist <= root_idx.second)
1009
inner_list.push_back(pat_index);
1010
else
1011
outer_list.push_back(pat_index);
1012
}
1013
1014
if (inner_list.size())
1015
m_nodes[new_node_index].m_inner_node = create_node(n, pUnique_pats, inner_list);
1016
1017
if (outer_list.size())
1018
m_nodes[new_node_index].m_outer_node = create_node(n, pUnique_pats, outer_list);
1019
1020
return new_node_index;
1021
}
1022
1023
// returns the pattern index of the vantage point (-1 on error), and the optimal split distance
1024
std::pair<int, float> find_best_vantage_point(uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, const uint_vec &pat_indices)
1025
{
1026
BASISU_NOTE_UNUSED(num_unique_pats);
1027
1028
const uint32_t n = pat_indices.size_u32();
1029
1030
assert(n);
1031
if (n == 1)
1032
return std::pair(pat_indices[0], 0.0f);
1033
1034
float best_split_metric = -1.0f;
1035
int best_split_pat = -1;
1036
float best_split_dist = 0.0f;
1037
float best_split_var = 0.0f;
1038
1039
basisu::vector< std::pair<float, uint32_t> > dists;
1040
dists.reserve(n);
1041
1042
float_vec float_dists;
1043
float_dists.reserve(n);
1044
1045
for (uint32_t pat_indices_iter = 0; pat_indices_iter < n; pat_indices_iter++)
1046
{
1047
const uint32_t split_pat_index = pat_indices[pat_indices_iter];
1048
assert(split_pat_index < num_unique_pats);
1049
1050
const partition_pattern_vec& trial_vantage = pUnique_pats[split_pat_index];
1051
1052
dists.resize(0);
1053
float_dists.resize(0);
1054
1055
for (uint32_t j = 0; j < n; j++)
1056
{
1057
const uint32_t pat_index = pat_indices[j];
1058
assert(pat_index < num_unique_pats);
1059
1060
if (pat_index == split_pat_index)
1061
continue;
1062
1063
float dist = trial_vantage.get_distance(pUnique_pats[pat_index]);
1064
dists.emplace_back(std::pair(dist, pat_index));
1065
1066
float_dists.push_back(dist);
1067
}
1068
1069
stats<double> s;
1070
s.calc(float_dists.size_u32(), float_dists.data());
1071
1072
std::sort(dists.begin(), dists.end(), [](const auto &a, const auto &b) {
1073
return a.first < b.first;
1074
});
1075
1076
const uint32_t num_dists = dists.size_u32();
1077
float split_dist = dists[num_dists / 2].first;
1078
if ((num_dists & 1) == 0)
1079
split_dist = (split_dist + dists[(num_dists / 2) - 1].first) * .5f;
1080
1081
uint32_t total_inner = 0, total_outer = 0;
1082
1083
for (uint32_t j = 0; j < n; j++)
1084
{
1085
const uint32_t pat_index = pat_indices[j];
1086
if (pat_index == split_pat_index)
1087
continue;
1088
1089
float dist = trial_vantage.get_distance(pUnique_pats[pat_index]);
1090
1091
if (dist <= split_dist)
1092
total_inner++;
1093
else
1094
total_outer++;
1095
}
1096
1097
float split_metric = (float)minimum(total_inner, total_outer) / (float)maximum(total_inner, total_outer);
1098
1099
if ( (split_metric > best_split_metric) ||
1100
((split_metric == best_split_metric) && (s.m_var > best_split_var)) )
1101
{
1102
best_split_metric = split_metric;
1103
best_split_dist = split_dist;
1104
best_split_pat = split_pat_index;
1105
best_split_var = (float)s.m_var;
1106
}
1107
}
1108
1109
return std::pair(best_split_pat, best_split_dist);
1110
}
1111
};
1112
1113
struct partition
1114
{
1115
uint64_t m_p;
1116
1117
inline partition() :
1118
m_p(0)
1119
{
1120
}
1121
1122
inline partition(uint64_t p) :
1123
m_p(p)
1124
{
1125
assert(p < (1ULL << 36));
1126
}
1127
1128
inline partition& operator=(uint64_t p)
1129
{
1130
assert(p < (1ULL << 36));
1131
m_p = p;
1132
return *this;
1133
}
1134
1135
inline bool operator< (const partition& p) const
1136
{
1137
return m_p < p.m_p;
1138
}
1139
1140
inline bool operator== (const partition& p) const
1141
{
1142
return m_p == p.m_p;
1143
}
1144
1145
inline operator size_t() const
1146
{
1147
return hash_hsieh((const uint8_t *)&m_p, sizeof(m_p));
1148
}
1149
};
1150
1151
partition_pattern_vec g_partitions2[NUM_UNIQUE_PARTITIONS2];
1152
int g_part2_seed_to_unique_index[1024];
1153
vp_tree g_part2_vp_tree;
1154
1155
// Returns axis scaled by bu_math::inv_sqrt(axis.norm()), or the fixed vector with all components
// 0.577350269 (about 1/sqrt(3)) when the norm is too small to divide by safely.
// NOTE(review): this only normalizes if norm() returns the SQUARED length - confirm in the vec class.
static inline vec3F vec3F_norm_approx(vec3F axis)
{
	const float l = axis.norm();

	if (fabs(l) >= SMALL_FLOAT_VAL)
		return axis * bu_math::inv_sqrt(l);

	return vec3F(0.577350269f);
}
1161
1162
static void init_partitions2_6x6()
1163
{
1164
#if 0
1165
// makes pattern bits to the 10-bit ASTC seed index
1166
typedef basisu::hash_map<uint64_t, uint32_t> partition2_hash_map;
1167
partition2_hash_map phash;
1168
phash.reserve(1024);
1169
1170
for (uint32_t i = 0; i < 1024; i++)
1171
{
1172
uint64_t p_bits = 0;
1173
uint64_t p_bits_inv = 0;
1174
1175
for (uint32_t y = 0; y < 6; y++)
1176
{
1177
for (uint32_t x = 0; x < 6; x++)
1178
{
1179
uint64_t p = astc_helpers::compute_texel_partition(i, x, y, 0, 2, false);
1180
assert(p < 2);
1181
1182
p_bits |= (p << (x + y * 6));
1183
p_bits_inv |= ((1 - p) << (x + y * 6));
1184
}
1185
}
1186
1187
if (!p_bits)
1188
continue;
1189
if (p_bits == ((1ULL << 36) - 1))
1190
continue;
1191
1192
assert(p_bits < (1ULL << 36));
1193
assert(p_bits_inv < (1ULL << 36));
1194
1195
if (phash.contains(p_bits))
1196
{
1197
}
1198
else if (phash.contains(p_bits_inv))
1199
{
1200
}
1201
else
1202
{
1203
auto res = phash.insert(p_bits, i);
1204
assert(res.second);
1205
BASISU_NOTE_UNUSED(res);
1206
}
1207
}
1208
1209
uint32_t num_unique_partitions2 = 0;
1210
1211
for (const auto& r : phash)
1212
{
1213
assert(r.second < 1024);
1214
1215
const uint32_t unique_index = num_unique_partitions2;
1216
assert(unique_index < NUM_UNIQUE_PARTITIONS2);
1217
1218
partition_pattern_vec pat_vec;
1219
for (uint32_t i = 0; i < 36; i++)
1220
pat_vec[i] = (uint8_t)((r.first >> i) & 1);
1221
1222
g_partitions2[unique_index] = pat_vec;
1223
1224
assert(g_part2_unique_index_to_seed[unique_index] == r.second);
1225
g_part2_seed_to_unique_index[r.second] = unique_index;
1226
1227
num_unique_partitions2++;
1228
}
1229
assert(num_unique_partitions2 == NUM_UNIQUE_PARTITIONS2);
1230
#else
1231
for (uint32_t unique_index = 0; unique_index < NUM_UNIQUE_PARTITIONS2; unique_index++)
1232
{
1233
const uint32_t seed_index = g_part2_unique_index_to_seed[unique_index];
1234
assert(seed_index < 1024);
1235
1236
assert(g_part2_seed_to_unique_index[seed_index] == 0);
1237
g_part2_seed_to_unique_index[seed_index] = unique_index;
1238
1239
partition_pattern_vec& pat_vec = g_partitions2[unique_index];
1240
1241
for (uint32_t y = 0; y < 6; y++)
1242
{
1243
for (uint32_t x = 0; x < 6; x++)
1244
{
1245
uint8_t p = (uint8_t)astc_helpers::compute_texel_partition(seed_index, x, y, 0, 2, false);
1246
assert(p < 2);
1247
1248
pat_vec[x + y * 6] = p;
1249
}
1250
}
1251
}
1252
#endif
1253
1254
g_part2_vp_tree.init(NUM_UNIQUE_PARTITIONS2, g_partitions2);
1255
}
1256
1257
static bool estimate_partition2_6x6(
1258
const basist::half_float pBlock_pixels_half[][3],
1259
int* pBest_parts, uint32_t num_best_parts)
1260
{
1261
const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = BLOCK_W * BLOCK_H;
1262
1263
vec3F training_vecs[BLOCK_T], mean(0.0f);
1264
1265
for (uint32_t i = 0; i < BLOCK_T; i++)
1266
{
1267
vec3F& v = training_vecs[i];
1268
1269
v[0] = (float)pBlock_pixels_half[i][0];
1270
v[1] = (float)pBlock_pixels_half[i][1];
1271
v[2] = (float)pBlock_pixels_half[i][2];
1272
1273
mean += v;
1274
}
1275
mean *= (1.0f / (float)BLOCK_T);
1276
1277
vec3F max_vals(-BIG_FLOAT_VAL);
1278
1279
for (uint32_t i = 0; i < BLOCK_T; i++)
1280
{
1281
vec3F& v = training_vecs[i];
1282
max_vals = vec3F::component_max(max_vals, v);
1283
}
1284
1285
// Initialize principle axis approximation
1286
vec3F axis(max_vals - mean);
1287
1288
// Incremental approx. PCA - only viable if we have a reasonably fast approximation for 1.0/sqrt(x).
1289
for (uint32_t i = 0; i < BLOCK_T; i++)
1290
{
1291
axis = vec3F_norm_approx(axis);
1292
1293
vec3F color(training_vecs[i] - mean);
1294
1295
float d = color.dot(axis);
1296
1297
axis += color * d;
1298
}
1299
1300
if (axis.norm() < SMALL_FLOAT_VAL)
1301
axis.set(0.57735027f);
1302
else
1303
axis.normalize_in_place();
1304
1305
#if BRUTE_FORCE_PART_SEARCH
1306
int desired_parts[BLOCK_H][BLOCK_W]; // [y][x]
1307
for (uint32_t i = 0; i < BLOCK_T; i++)
1308
{
1309
float proj = (training_vecs[i] - mean).dot(axis);
1310
1311
desired_parts[i / BLOCK_W][i % BLOCK_W] = proj < 0.0f;
1312
}
1313
#else
1314
partition_pattern_vec desired_part;
1315
1316
for (uint32_t i = 0; i < BLOCK_T; i++)
1317
{
1318
float proj = (training_vecs[i] - mean).dot(axis);
1319
1320
desired_part.m_parts[i] = proj < 0.0f;
1321
}
1322
#endif
1323
1324
//interval_timer tm;
1325
//tm.start();
1326
1327
#if BRUTE_FORCE_PART_SEARCH
1328
uint32_t part_similarity[NUM_UNIQUE_PARTITIONS2];
1329
1330
for (uint32_t part_index = 0; part_index < NUM_UNIQUE_PARTITIONS2; part_index++)
1331
{
1332
const partition_pattern_vec &pat_vec = g_partitions2[part_index];
1333
1334
int total_sim_non_inv = 0;
1335
int total_sim_inv = 0;
1336
1337
for (uint32_t y = 0; y < BLOCK_H; y++)
1338
{
1339
for (uint32_t x = 0; x < BLOCK_W; x++)
1340
{
1341
int part = pat_vec[x + y * 6];
1342
1343
if (part == desired_parts[y][x])
1344
total_sim_non_inv++;
1345
1346
if ((part ^ 1) == desired_parts[y][x])
1347
total_sim_inv++;
1348
}
1349
}
1350
1351
int total_sim = maximum(total_sim_non_inv, total_sim_inv);
1352
1353
part_similarity[part_index] = (total_sim << 16) | part_index;
1354
1355
} // part_index;
1356
1357
std::sort(part_similarity, part_similarity + NUM_UNIQUE_PARTITIONS2);
1358
1359
for (uint32_t i = 0; i < num_best_parts; i++)
1360
pBest_parts[i] = part_similarity[(NUM_UNIQUE_PARTITIONS2 - 1) - i] & 0xFFFF;
1361
#else
1362
vp_tree::result_queue results;
1363
results.reserve(num_best_parts);
1364
g_part2_vp_tree.find_nearest(2, desired_part, results, num_best_parts);
1365
1366
assert(results.get_size() == num_best_parts);
1367
1368
const auto& elements = results.get_elements();
1369
1370
for (uint32_t i = 0; i < results.get_size(); i++)
1371
pBest_parts[i] = elements[1 + i].m_pat_index;
1372
#endif
1373
1374
//fmt_printf("{} ", tm.get_elapsed_ms());
1375
1376
return true;
1377
}
1378
1379
// Lowest comp_level at which encode_block_2_subsets()/encode_block_3_subsets() run endpoint
// refinement on downsampled weight grids. With a value of 0 the unsigned ">=" test always passes.
const uint32_t MIN_REFINE_LEVEL = 0;
1380
1381
static bool encode_block_2_subsets(
1382
trial_result res[2],
1383
uint32_t grid_w, uint32_t grid_h,
1384
uint32_t cem,
1385
uint32_t weights_ise_range, uint32_t endpoints_ise_range,
1386
const half_vec3* pBlock_pixels_half, const vec4F* pBlock_pixels_q16,
1387
astc_hdr_codec_base_options& coptions,
1388
bool uber_mode_flag,
1389
int unique_pat_index,
1390
uint32_t comp_level,
1391
opt_mode_t mode11_opt_mode,
1392
bool refine_endpoints_flag)
1393
{
1394
const uint32_t num_endpoint_vals = (cem == 11) ? basist::NUM_MODE11_ENDPOINTS : basist::NUM_MODE7_ENDPOINTS;
1395
1396
res[0].m_valid = false;
1397
res[1].m_valid = false;
1398
1399
const uint32_t BLOCK_W = 6, BLOCK_H = 6;
1400
1401
astc_helpers::log_astc_block best_log_blk;
1402
clear_obj(best_log_blk);
1403
1404
best_log_blk.m_num_partitions = 2;
1405
best_log_blk.m_color_endpoint_modes[0] = (uint8_t)cem;
1406
best_log_blk.m_color_endpoint_modes[1] = (uint8_t)cem;
1407
best_log_blk.m_grid_width = (uint8_t)grid_w;
1408
best_log_blk.m_grid_height = (uint8_t)grid_h;
1409
1410
best_log_blk.m_weight_ise_range = (uint8_t)weights_ise_range;
1411
best_log_blk.m_endpoint_ise_range = (uint8_t)endpoints_ise_range;
1412
1413
partition_pattern_vec* pPat = &g_partitions2[unique_pat_index];
1414
const uint32_t p_seed = g_part2_unique_index_to_seed[unique_pat_index];
1415
1416
vec4F part_pixels_q16[2][64];
1417
half_vec3 part_half_pixels[2][64];
1418
uint8_t part_pixel_index[2][64];
1419
uint32_t part_total_pixels[2] = { 0 };
1420
1421
for (uint32_t y = 0; y < BLOCK_H; y++)
1422
{
1423
for (uint32_t x = 0; x < BLOCK_W; x++)
1424
{
1425
uint32_t part_index = (*pPat)[x + y * BLOCK_W];
1426
1427
uint32_t l = part_total_pixels[part_index];
1428
1429
part_pixels_q16[part_index][l] = pBlock_pixels_q16[x + y * BLOCK_W];
1430
part_half_pixels[part_index][l] = pBlock_pixels_half[x + y * BLOCK_W];
1431
part_pixel_index[part_index][l] = (uint8_t)(x + y * BLOCK_W);
1432
1433
part_total_pixels[part_index] = l + 1;
1434
} // x
1435
} // y
1436
1437
uint8_t blk_endpoints[2][basist::NUM_MODE11_ENDPOINTS];
1438
uint8_t blk_weights[2][BLOCK_W * BLOCK_H];
1439
uint32_t best_submode[2];
1440
1441
for (uint32_t part_iter = 0; part_iter < 2; part_iter++)
1442
{
1443
assert(part_total_pixels[part_iter]);
1444
1445
double e;
1446
if (cem == 7)
1447
{
1448
e = encode_astc_hdr_block_mode_7(
1449
part_total_pixels[part_iter],
1450
(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
1451
best_log_blk.m_weight_ise_range,
1452
best_submode[part_iter],
1453
BIG_FLOAT_VAL,
1454
blk_endpoints[part_iter],
1455
blk_weights[part_iter],
1456
coptions,
1457
best_log_blk.m_endpoint_ise_range);
1458
}
1459
else
1460
{
1461
assert(cem == 11);
1462
1463
e = encode_astc_hdr_block_mode_11(
1464
part_total_pixels[part_iter],
1465
(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
1466
best_log_blk.m_weight_ise_range,
1467
best_submode[part_iter],
1468
BIG_FLOAT_VAL,
1469
blk_endpoints[part_iter],
1470
blk_weights[part_iter],
1471
coptions,
1472
false,
1473
best_log_blk.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false,
1474
mode11_opt_mode);
1475
}
1476
1477
if (e == BIG_FLOAT_VAL)
1478
return false;
1479
1480
} // part_iter
1481
1482
uint8_t ise_weights[BLOCK_W * BLOCK_H];
1483
1484
uint32_t src_pixel_index[2] = { 0, 0 };
1485
for (uint32_t y = 0; y < BLOCK_H; y++)
1486
{
1487
for (uint32_t x = 0; x < BLOCK_W; x++)
1488
{
1489
uint32_t part_index = (*pPat)[x + y * BLOCK_W];
1490
ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];
1491
src_pixel_index[part_index]++;
1492
} // x
1493
} // y
1494
1495
if ((grid_w == BLOCK_W) && (grid_h == BLOCK_H))
1496
{
1497
best_log_blk.m_partition_id = (uint16_t)p_seed;
1498
1499
memcpy(best_log_blk.m_endpoints, blk_endpoints[0], num_endpoint_vals);
1500
memcpy(best_log_blk.m_endpoints + num_endpoint_vals, blk_endpoints[1], num_endpoint_vals);
1501
memcpy(best_log_blk.m_weights, ise_weights, BLOCK_W * BLOCK_H);
1502
1503
res[0].m_valid = true;
1504
res[0].m_log_blk = best_log_blk;
1505
}
1506
else
1507
{
1508
uint8_t desired_weights[BLOCK_H * BLOCK_W];
1509
1510
const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_ISE_to_val;
1511
1512
for (uint32_t by = 0; by < BLOCK_H; by++)
1513
for (uint32_t bx = 0; bx < BLOCK_W; bx++)
1514
desired_weights[bx + by * BLOCK_W] = dequant_tab[ise_weights[bx + by * BLOCK_W]];
1515
1516
uint8_t downsampled_weights[BLOCK_H * BLOCK_W];
1517
1518
const float* pDownsample_matrix = get_6x6_downsample_matrix(grid_w, grid_h);
1519
if (!pDownsample_matrix)
1520
{
1521
assert(0);
1522
return false;
1523
}
1524
1525
downsample_weight_grid(
1526
pDownsample_matrix,
1527
BLOCK_W, BLOCK_H, // source/from dimension (block size)
1528
grid_w, grid_h, // dest/to dimension (grid size)
1529
desired_weights, // these are dequantized weights, NOT ISE symbols, [by][bx]
1530
downsampled_weights); // [wy][wx]
1531
1532
best_log_blk.m_partition_id = (uint16_t)p_seed;
1533
memcpy(best_log_blk.m_endpoints, blk_endpoints[0], num_endpoint_vals);
1534
memcpy(best_log_blk.m_endpoints + num_endpoint_vals, blk_endpoints[1], num_endpoint_vals);
1535
1536
const auto& weight_to_ise = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_val_to_ise;
1537
1538
for (uint32_t gy = 0; gy < grid_h; gy++)
1539
for (uint32_t gx = 0; gx < grid_w; gx++)
1540
best_log_blk.m_weights[gx + gy * grid_w] = weight_to_ise[downsampled_weights[gx + gy * grid_w]];
1541
1542
res[0].m_valid = true;
1543
res[0].m_log_blk = best_log_blk;
1544
1545
if ((refine_endpoints_flag) && (comp_level >= MIN_REFINE_LEVEL) && ((grid_w < 6) || (grid_h < 6)))
1546
{
1547
bool any_refined = false;
1548
1549
for (uint32_t part_iter = 0; part_iter < 2; part_iter++)
1550
{
1551
bool refine_status = refine_endpoints(
1552
cem,
1553
endpoints_ise_range,
1554
best_log_blk.m_endpoints + part_iter * num_endpoint_vals, // the endpoints to optimize
1555
BLOCK_W, BLOCK_H, // block dimensions
1556
grid_w, grid_h, best_log_blk.m_weights, weights_ise_range, // weight grid
1557
part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
1558
&part_pixel_index[part_iter][0], // maps this subset's pixels to block offsets
1559
coptions, mode11_opt_mode);
1560
1561
if (refine_status)
1562
any_refined = true;
1563
}
1564
1565
if (any_refined)
1566
{
1567
res[1].m_valid = true;
1568
res[1].m_log_blk = best_log_blk;
1569
}
1570
}
1571
}
1572
1573
return true;
1574
}
1575
1576
// Hash map keyed by a 3-subset pattern; init_partitions3_6x6() stores (seed index, unique index) pairs in it.
typedef basisu::hash_map<partition_pattern_vec, std::pair<uint32_t, uint32_t > > partition3_hash_map;
1577
1578
// The unique 3-subset 6x6 partition patterns, indexed by unique pattern index (filled in by init_partitions3_6x6()).
partition_pattern_vec g_partitions3[NUM_UNIQUE_PARTITIONS3];
1579
// Maps a partition seed to its unique pattern index; init_partitions3_6x6() leaves -1 for seeds with no unique pattern.
int g_part3_seed_to_unique_index[1024];
1580
// Search structure initialized over g_partitions3 by init_partitions3_6x6() and queried by estimate_partition3_6x6().
vp_tree g_part3_vp_tree;
1581
1582
static void init_partitions3_6x6()
1583
{
1584
uint32_t t = 0;
1585
1586
for (uint32_t i = 0; i < 1024; i++)
1587
g_part3_seed_to_unique_index[i] = -1;
1588
1589
partition3_hash_map part3_hash;
1590
part3_hash.reserve(512);
1591
1592
for (uint32_t seed_index = 0; seed_index < 1024; seed_index++)
1593
{
1594
partition_pattern_vec p3;
1595
uint32_t part_hist[3] = { 0 };
1596
1597
for (uint32_t y = 0; y < 6; y++)
1598
{
1599
for (uint32_t x = 0; x < 6; x++)
1600
{
1601
uint64_t p = astc_helpers::compute_texel_partition(seed_index, x, y, 0, 3, false);
1602
assert(p < 3);
1603
1604
p3.m_parts[x + y * 6] = (uint8_t)p;
1605
part_hist[p]++;
1606
}
1607
}
1608
1609
if (!part_hist[0] || !part_hist[1] || !part_hist[2])
1610
continue;
1611
1612
uint32_t j;
1613
for (j = 0; j < NUM_PART3_MAPPINGS; j++)
1614
{
1615
partition_pattern_vec temp_part3(p3.get_permuted3(j));
1616
1617
if (part3_hash.contains(temp_part3))
1618
break;
1619
}
1620
if (j < NUM_PART3_MAPPINGS)
1621
continue;
1622
1623
part3_hash.insert(p3, std::make_pair(seed_index, t) );
1624
1625
assert(g_part3_unique_index_to_seed[t] == seed_index);
1626
g_part3_seed_to_unique_index[seed_index] = t;
1627
g_partitions3[t] = p3;
1628
1629
t++;
1630
}
1631
1632
g_part3_vp_tree.init(NUM_UNIQUE_PARTITIONS3, g_partitions3);
1633
}
1634
1635
static bool estimate_partition3_6x6(
1636
const basist::half_float pBlock_pixels_half[][3],
1637
int* pBest_parts, uint32_t num_best_parts)
1638
{
1639
const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = BLOCK_W * BLOCK_H, NUM_SUBSETS = 3;
1640
1641
assert(num_best_parts && (num_best_parts <= NUM_UNIQUE_PARTITIONS3));
1642
1643
vec3F training_vecs[BLOCK_T], mean(0.0f);
1644
1645
float brightest_inten = 0.0f, darkest_inten = BIG_FLOAT_VAL;
1646
vec3F cluster_centroids[NUM_SUBSETS];
1647
1648
for (uint32_t i = 0; i < BLOCK_T; i++)
1649
{
1650
vec3F& v = training_vecs[i];
1651
1652
v.set((float)pBlock_pixels_half[i][0], (float)pBlock_pixels_half[i][1], (float)pBlock_pixels_half[i][2]);
1653
1654
float inten = v.dot(vec3F(1.0f));
1655
if (inten < darkest_inten)
1656
{
1657
darkest_inten = inten;
1658
cluster_centroids[0] = v;
1659
}
1660
1661
if (inten > brightest_inten)
1662
{
1663
brightest_inten = inten;
1664
cluster_centroids[1] = v;
1665
}
1666
}
1667
1668
if (cluster_centroids[0] == cluster_centroids[1])
1669
return false;
1670
1671
float furthest_dist2 = 0.0f;
1672
for (uint32_t i = 0; i < BLOCK_T; i++)
1673
{
1674
vec3F& v = training_vecs[i];
1675
1676
float dist_a = v.squared_distance(cluster_centroids[0]);
1677
if (dist_a == 0.0f)
1678
continue;
1679
1680
float dist_b = v.squared_distance(cluster_centroids[1]);
1681
if (dist_b == 0.0f)
1682
continue;
1683
1684
float dist2 = dist_a + dist_b;
1685
if (dist2 > furthest_dist2)
1686
{
1687
furthest_dist2 = dist2;
1688
cluster_centroids[2] = v;
1689
}
1690
}
1691
1692
if ((cluster_centroids[0] == cluster_centroids[2]) || (cluster_centroids[1] == cluster_centroids[2]))
1693
return false;
1694
1695
uint32_t cluster_pixels[NUM_SUBSETS][BLOCK_T];
1696
uint32_t num_cluster_pixels[NUM_SUBSETS];
1697
vec3F new_cluster_means[NUM_SUBSETS];
1698
1699
const uint32_t NUM_ITERS = 4;
1700
1701
for (uint32_t s = 0; s < NUM_ITERS; s++)
1702
{
1703
memset(num_cluster_pixels, 0, sizeof(num_cluster_pixels));
1704
memset(new_cluster_means, 0, sizeof(new_cluster_means));
1705
1706
for (uint32_t i = 0; i < BLOCK_T; i++)
1707
{
1708
float d[NUM_SUBSETS] = {
1709
training_vecs[i].squared_distance(cluster_centroids[0]),
1710
training_vecs[i].squared_distance(cluster_centroids[1]),
1711
training_vecs[i].squared_distance(cluster_centroids[2]) };
1712
1713
float min_d = d[0];
1714
uint32_t min_idx = 0;
1715
for (uint32_t j = 1; j < NUM_SUBSETS; j++)
1716
{
1717
if (d[j] < min_d)
1718
{
1719
min_d = d[j];
1720
min_idx = j;
1721
}
1722
}
1723
1724
cluster_pixels[min_idx][num_cluster_pixels[min_idx]] = i;
1725
new_cluster_means[min_idx] += training_vecs[i];
1726
num_cluster_pixels[min_idx]++;
1727
} // i
1728
1729
for (uint32_t j = 0; j < NUM_SUBSETS; j++)
1730
{
1731
if (!num_cluster_pixels[j])
1732
return false;
1733
1734
cluster_centroids[j] = new_cluster_means[j] / (float)num_cluster_pixels[j];
1735
}
1736
} // s
1737
1738
partition_pattern_vec desired_part;
1739
for (uint32_t p = 0; p < NUM_SUBSETS; p++)
1740
{
1741
for (uint32_t i = 0; i < num_cluster_pixels[p]; i++)
1742
{
1743
const uint32_t pix_index = cluster_pixels[p][i];
1744
desired_part[pix_index] = (uint8_t)p;
1745
}
1746
}
1747
1748
#if BRUTE_FORCE_PART_SEARCH
1749
partition_pattern_vec desired_parts[NUM_PART3_MAPPINGS];
1750
for (uint32_t j = 0; j < NUM_PART3_MAPPINGS; j++)
1751
desired_parts[j] = desired_part.get_permuted3(j);
1752
1753
uint32_t part_similarity[NUM_UNIQUE_PARTITIONS3];
1754
1755
for (uint32_t part_index = 0; part_index < NUM_UNIQUE_PARTITIONS3; part_index++)
1756
{
1757
const partition_pattern_vec& pat = g_partitions3[part_index];
1758
1759
uint32_t lowest_pat_dist = UINT32_MAX;
1760
for (uint32_t p = 0; p < NUM_PART3_MAPPINGS; p++)
1761
{
1762
uint32_t dist = pat.get_squared_distance(desired_parts[p]);
1763
if (dist < lowest_pat_dist)
1764
lowest_pat_dist = dist;
1765
}
1766
1767
part_similarity[part_index] = (lowest_pat_dist << 16) | part_index;
1768
1769
} // part_index;
1770
1771
std::sort(part_similarity, part_similarity + NUM_UNIQUE_PARTITIONS3);
1772
1773
for (uint32_t i = 0; i < num_best_parts; i++)
1774
pBest_parts[i] = part_similarity[i] & 0xFFFF;
1775
#else
1776
vp_tree::result_queue results;
1777
results.reserve(num_best_parts);
1778
g_part3_vp_tree.find_nearest(3, desired_part, results, num_best_parts);
1779
1780
assert(results.get_size() == num_best_parts);
1781
1782
const auto& elements = results.get_elements();
1783
1784
for (uint32_t i = 0; i < results.get_size(); i++)
1785
pBest_parts[i] = elements[1 + i].m_pat_index;
1786
#endif
1787
1788
return true;
1789
}
1790
1791
static bool encode_block_3_subsets(
1792
trial_result& res,
1793
uint32_t cem,
1794
uint32_t grid_w, uint32_t grid_h,
1795
uint32_t weights_ise_range, uint32_t endpoints_ise_range,
1796
const half_vec3* pBlock_pixels_half, const vec4F* pBlock_pixels_q16,
1797
astc_hdr_codec_base_options& coptions,
1798
bool uber_mode_flag,
1799
const int* pEst_patterns, int num_est_patterns,
1800
uint32_t comp_level,
1801
opt_mode_t mode11_opt_mode)
1802
{
1803
BASISU_NOTE_UNUSED(uber_mode_flag);
1804
const uint32_t BLOCK_W = 6, BLOCK_H = 6, NUM_SUBSETS = 3;
1805
const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem);
1806
1807
res.m_valid = false;
1808
1809
double best_e = BIG_FLOAT_VAL;
1810
1811
astc_helpers::log_astc_block best_log_blk;
1812
clear_obj(best_log_blk);
1813
1814
best_log_blk.m_num_partitions = NUM_SUBSETS;
1815
best_log_blk.m_color_endpoint_modes[0] = (uint8_t)cem;
1816
best_log_blk.m_color_endpoint_modes[1] = (uint8_t)cem;
1817
best_log_blk.m_color_endpoint_modes[2] = (uint8_t)cem;
1818
best_log_blk.m_grid_width = (uint8_t)grid_w;
1819
best_log_blk.m_grid_height = (uint8_t)grid_h;
1820
1821
best_log_blk.m_weight_ise_range = (uint8_t)weights_ise_range;
1822
best_log_blk.m_endpoint_ise_range = (uint8_t)endpoints_ise_range;
1823
1824
const uint32_t n = num_est_patterns ? num_est_patterns : NUM_UNIQUE_PARTITIONS3;
1825
1826
for (uint32_t unique_p_iter = 0; unique_p_iter < n; unique_p_iter++)
1827
{
1828
const uint32_t unique_part_index = num_est_patterns ? pEst_patterns[unique_p_iter] : unique_p_iter;
1829
assert(unique_part_index < NUM_UNIQUE_PARTITIONS3);
1830
const partition_pattern_vec*pPart = &g_partitions3[unique_part_index];
1831
1832
vec4F part_pixels_q16[NUM_SUBSETS][64];
1833
half_vec3 part_half_pixels[NUM_SUBSETS][64];
1834
uint8_t part_pixel_index[NUM_SUBSETS][64];
1835
uint32_t part_total_pixels[NUM_SUBSETS] = { 0 };
1836
1837
for (uint32_t y = 0; y < BLOCK_H; y++)
1838
{
1839
for (uint32_t x = 0; x < BLOCK_W; x++)
1840
{
1841
const uint32_t part_index = pPart->m_parts[x + y * BLOCK_W];
1842
1843
uint32_t l = part_total_pixels[part_index];
1844
1845
part_pixels_q16[part_index][l] = pBlock_pixels_q16[x + y * BLOCK_W];
1846
part_half_pixels[part_index][l] = pBlock_pixels_half[x + y * BLOCK_W];
1847
part_pixel_index[part_index][l] = (uint8_t)(x + y * BLOCK_W);
1848
1849
part_total_pixels[part_index] = l + 1;
1850
} // x
1851
} // y
1852
1853
uint8_t blk_endpoints[NUM_SUBSETS][basist::NUM_MODE11_ENDPOINTS];
1854
uint8_t blk_weights[NUM_SUBSETS][BLOCK_W * BLOCK_H];
1855
uint32_t best_submode[NUM_SUBSETS];
1856
1857
double e = 0.0f;
1858
for (uint32_t part_iter = 0; part_iter < NUM_SUBSETS; part_iter++)
1859
{
1860
assert(part_total_pixels[part_iter]);
1861
1862
if (cem == 7)
1863
{
1864
e += encode_astc_hdr_block_mode_7(
1865
part_total_pixels[part_iter],
1866
(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
1867
best_log_blk.m_weight_ise_range,
1868
best_submode[part_iter],
1869
BIG_FLOAT_VAL,
1870
blk_endpoints[part_iter],
1871
blk_weights[part_iter],
1872
coptions,
1873
best_log_blk.m_endpoint_ise_range);
1874
}
1875
else
1876
{
1877
assert(cem == 11);
1878
1879
e += encode_astc_hdr_block_mode_11(
1880
part_total_pixels[part_iter],
1881
(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
1882
best_log_blk.m_weight_ise_range,
1883
best_submode[part_iter],
1884
BIG_FLOAT_VAL,
1885
blk_endpoints[part_iter],
1886
blk_weights[part_iter],
1887
coptions,
1888
false, best_log_blk.m_endpoint_ise_range, uber_mode_flag, false,
1889
FIRST_MODE11_SUBMODE_INDEX, MAX_MODE11_SUBMODE_INDEX, false, mode11_opt_mode);
1890
}
1891
1892
} // part_iter
1893
1894
uint8_t ise_weights[BLOCK_W * BLOCK_H];
1895
1896
uint32_t src_pixel_index[NUM_SUBSETS] = { 0 };
1897
for (uint32_t y = 0; y < BLOCK_H; y++)
1898
{
1899
for (uint32_t x = 0; x < BLOCK_W; x++)
1900
{
1901
const uint32_t part_index = pPart->m_parts[x + y * BLOCK_W];
1902
1903
ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];
1904
src_pixel_index[part_index]++;
1905
} // x
1906
} // y
1907
1908
if ((grid_w == BLOCK_W) && (grid_h == BLOCK_H))
1909
{
1910
if (e < best_e)
1911
{
1912
best_e = e;
1913
best_log_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_part_index];
1914
1915
for (uint32_t p = 0; p < NUM_SUBSETS; p++)
1916
memcpy(best_log_blk.m_endpoints + num_endpoint_vals * p, blk_endpoints[p], num_endpoint_vals);
1917
1918
memcpy(best_log_blk.m_weights, ise_weights, BLOCK_W * BLOCK_H);
1919
}
1920
}
1921
else
1922
{
1923
uint8_t desired_weights[BLOCK_H * BLOCK_W];
1924
1925
const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_ISE_to_val;
1926
1927
for (uint32_t by = 0; by < BLOCK_H; by++)
1928
for (uint32_t bx = 0; bx < BLOCK_W; bx++)
1929
desired_weights[bx + by * BLOCK_W] = dequant_tab[ise_weights[bx + by * BLOCK_W]];
1930
1931
uint8_t downsampled_weights[BLOCK_H * BLOCK_W];
1932
1933
const float* pDownsample_matrix = get_6x6_downsample_matrix(grid_w, grid_h);
1934
if (!pDownsample_matrix)
1935
{
1936
assert(0);
1937
return false;
1938
}
1939
1940
downsample_weight_grid(
1941
pDownsample_matrix,
1942
BLOCK_W, BLOCK_H, // source/from dimension (block size)
1943
grid_w, grid_h, // dest/to dimension (grid size)
1944
desired_weights, // these are dequantized weights, NOT ISE symbols, [by][bx]
1945
downsampled_weights); // [wy][wx]
1946
1947
astc_helpers::log_astc_block trial_blk(best_log_blk);
1948
1949
trial_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_part_index];
1950
1951
for (uint32_t p = 0; p < NUM_SUBSETS; p++)
1952
memcpy(trial_blk.m_endpoints + num_endpoint_vals * p, blk_endpoints[p], num_endpoint_vals);
1953
1954
const auto& weight_to_ise = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_val_to_ise;
1955
1956
for (uint32_t gy = 0; gy < grid_h; gy++)
1957
for (uint32_t gx = 0; gx < grid_w; gx++)
1958
trial_blk.m_weights[gx + gy * grid_w] = weight_to_ise[downsampled_weights[gx + gy * grid_w]];
1959
1960
if ((comp_level >= MIN_REFINE_LEVEL) && ((grid_w < 6) || (grid_h < 6)))
1961
{
1962
for (uint32_t part_iter = 0; part_iter < NUM_SUBSETS; part_iter++)
1963
{
1964
bool refine_status = refine_endpoints(
1965
cem,
1966
endpoints_ise_range,
1967
trial_blk.m_endpoints + part_iter * num_endpoint_vals, // the endpoints to optimize
1968
BLOCK_W, BLOCK_H, // block dimensions
1969
grid_w, grid_h, trial_blk.m_weights, weights_ise_range, // weight grid
1970
part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
1971
&part_pixel_index[part_iter][0], // maps this subset's pixels to block offsets
1972
coptions, mode11_opt_mode);
1973
1974
BASISU_NOTE_UNUSED(refine_status);
1975
}
1976
}
1977
1978
half_vec4 decoded_pixels_half4[BLOCK_H][BLOCK_W]; // [y][x]
1979
bool status = astc_helpers::decode_block(trial_blk, decoded_pixels_half4, BLOCK_W, BLOCK_H, astc_helpers::cDecodeModeHDR16);
1980
assert(status);
1981
if (!status)
1982
return false;
1983
1984
half_vec3 decoded_pixels_half3[BLOCK_H][BLOCK_W];
1985
for (uint32_t y = 0; y < BLOCK_H; y++)
1986
for (uint32_t x = 0; x < BLOCK_W; x++)
1987
decoded_pixels_half3[y][x].set(decoded_pixels_half4[y][x][0], decoded_pixels_half4[y][x][1], decoded_pixels_half4[y][x][2]);
1988
1989
double trial_err = compute_block_error(BLOCK_W * BLOCK_H, (const basist::half_float*)pBlock_pixels_half, (const basist::half_float*)decoded_pixels_half3, coptions);
1990
if (trial_err < best_e)
1991
{
1992
best_e = trial_err;
1993
best_log_blk = trial_blk;
1994
}
1995
}
1996
1997
} // unique_p_iter
1998
1999
if (best_e < BIG_FLOAT_VAL)
2000
{
2001
res.m_log_blk = best_log_blk;
2002
res.m_valid = true;
2003
res.m_err = best_e;
2004
}
2005
else
2006
{
2007
res.m_valid = false;
2008
}
2009
2010
return res.m_valid;
2011
}
2012
2013
// Writes total_values symbols from pVals to coder and returns the number of bits written.
//
// Each symbol is split into a low part of ep_bits plain bits and a high trit or quint digit,
// as described by astc_helpers::g_ise_range_table[endpoint_range] ([0]=bits, [1]=trits,
// [2]=quints). Output order:
//   1. every complete digit group (5 trits in 8 bits, or 3 quints in 7 bits),
//   2. the trailing incomplete group, if any, in the fewest bits that can hold it,
//   3. the plain bits of every symbol, in order.
// total_values must be in [1, 64].
static uint32_t encode_values(bitwise_coder &coder, uint32_t total_values, const uint8_t *pVals, uint32_t endpoint_range)
{
	const uint32_t MAX_VALS = 64;
	uint32_t packed_groups[(MAX_VALS + 2) / 3];
	uint32_t num_packed_groups = 0, group_accum = 0, group_mul = 1;

	assert((total_values) && (total_values <= MAX_VALS));

	const uint32_t ep_bits = astc_helpers::g_ise_range_table[endpoint_range][0];
	const uint32_t ep_trits = astc_helpers::g_ise_range_table[endpoint_range][1];
	const uint32_t ep_quints = astc_helpers::g_ise_range_table[endpoint_range][2];

	// Digit base and the multiplier value that marks a full group (3^5 or 5^3).
	// A radix of 0 means the range carries plain bits only.
	const uint32_t radix = ep_trits ? 3 : (ep_quints ? 5 : 0);
	const uint32_t full_group_mul = ep_trits ? 243 : 125;

	if (radix)
	{
		for (uint32_t i = 0; i < total_values; i++)
		{
			const uint32_t digit = (uint32_t)pVals[i] >> ep_bits;
			assert(digit < radix);

			group_accum += digit * group_mul;
			group_mul *= radix;

			if (group_mul == full_group_mul)
			{
				assert(num_packed_groups < BASISU_ARRAY_SIZE(packed_groups));
				packed_groups[num_packed_groups++] = group_accum;
				group_accum = 0;
				group_mul = 1;
			}
		}
	}

	uint32_t total_bits_output = 0;

	// Complete groups.
	const uint32_t full_group_bits = ep_trits ? 8 : 7;
	for (uint32_t i = 0; i < num_packed_groups; i++)
	{
		coder.put_bits(packed_groups[i], full_group_bits);
		total_bits_output += full_group_bits;
	}

	// Trailing incomplete group.
	if (group_mul > 1)
	{
		uint32_t num_bits;
		if (ep_trits)
			num_bits = (group_mul == 3) ? 2 : (group_mul == 9) ? 4 : (group_mul == 27) ? 5 : 7; // last case is 81
		else
			num_bits = (group_mul == 5) ? 3 : 5; // last case is 25

		coder.put_bits(group_accum, num_bits);
		total_bits_output += num_bits;
	}

	// Plain low bits of every symbol.
	const uint32_t low_mask = (1 << ep_bits) - 1;
	for (uint32_t i = 0; i < total_values; i++)
	{
		coder.put_bits((uint32_t)pVals[i] & low_mask, ep_bits);
		total_bits_output += ep_bits;
	}

	return total_bits_output;
}
2104
2105
static inline uint32_t get_num_endpoint_vals(uint32_t cem)
2106
{
2107
assert((cem == 7) || (cem == 11));
2108
return (cem == 11) ? basist::NUM_MODE11_ENDPOINTS : basist::NUM_MODE7_ENDPOINTS;
2109
}
2110
2111
static void code_block(bitwise_coder& coder,
2112
const astc_helpers::log_astc_block& log_blk,
2113
block_mode block_mode_index,
2114
endpoint_mode em, const uint8_t *pEP_deltas)
2115
{
2116
coder.put_truncated_binary((uint32_t)block_mode_index, (uint32_t)block_mode::cBMTotalModes);
2117
coder.put_truncated_binary((uint32_t)em, (uint32_t)endpoint_mode::cTotal);
2118
2119
const uint32_t num_endpoint_vals = get_num_endpoint_vals(log_blk.m_color_endpoint_modes[0]);
2120
2121
if ((em == endpoint_mode::cUseLeftDelta) || (em == endpoint_mode::cUseUpperDelta))
2122
{
2123
assert(log_blk.m_num_partitions == 1);
2124
2125
for (uint32_t i = 0; i < num_endpoint_vals; i++)
2126
coder.put_bits(pEP_deltas[i], NUM_ENDPOINT_DELTA_BITS);
2127
}
2128
else if (em == endpoint_mode::cRaw)
2129
{
2130
if (log_blk.m_num_partitions == 2)
2131
{
2132
const int unique_partition_index = g_part2_seed_to_unique_index[log_blk.m_partition_id];
2133
assert(unique_partition_index != -1);
2134
2135
coder.put_truncated_binary(unique_partition_index, NUM_UNIQUE_PARTITIONS2);
2136
}
2137
else if (log_blk.m_num_partitions == 3)
2138
{
2139
const int unique_partition_index = g_part3_seed_to_unique_index[log_blk.m_partition_id];
2140
assert(unique_partition_index != -1);
2141
2142
coder.put_truncated_binary(unique_partition_index, NUM_UNIQUE_PARTITIONS3);
2143
}
2144
2145
encode_values(coder, num_endpoint_vals * log_blk.m_num_partitions, log_blk.m_endpoints, log_blk.m_endpoint_ise_range);
2146
}
2147
2148
encode_values(coder, log_blk.m_grid_width * log_blk.m_grid_height * (log_blk.m_dual_plane ? 2 : 1), log_blk.m_weights, log_blk.m_weight_ise_range);
2149
}
2150
2151
// Tunable settings for the smooth-region map. Each region size pairs a maximum standard
// deviation with a maximum MSE scale. Defaults are given by the member initializers below
// and can be restored at any time with clear().
struct smooth_map_params
{
	bool m_no_mse_scaling = false;

	// 3x3 region
	float m_max_smooth_std_dev = 100.0f;
	float m_smooth_max_mse_scale = 13000.0f;

	// 7x7 region
	float m_max_med_smooth_std_dev = 9.0f;
	float m_med_smooth_max_mse_scale = 15000.0f;

	// 11x11 region
	float m_max_ultra_smooth_std_dev = 4.0f;
	// Earlier values tried for this scale: 4500, 10000, 50000, 100000, 400000, 800000.
	float m_ultra_smooth_max_mse_scale = 2000000.0f;

	bool m_debug_images = true;

	smooth_map_params()
	{
	}

	// Resets every field to its default value.
	void clear()
	{
		*this = smooth_map_params();
	}
};
2196
2197
// Resampler contributor lists indexed by destination width; init_contrib_lists() fills entries 1..6, entry 0 is unused.
Resampler::Contrib_List* g_contrib_lists[7]; // 1-6
2198
2199
static void init_contrib_lists()
2200
{
2201
for (uint32_t dst_width = 1; dst_width <= 6; dst_width++)
2202
//g_contrib_lists[dst_width] = Resampler::make_clist(6, 6, basisu::Resampler::BOUNDARY_CLAMP, gaussian_filter, BASISU_GAUSSIAN_FILTER_SUPPORT, 6.0f / (float)dst_width, 0.0f);
2203
g_contrib_lists[dst_width] = Resampler::make_clist(6, 6, basisu::Resampler::BOUNDARY_CLAMP, gaussian_filter, BASISU_BELL_FILTER_SUPPORT, 6.0f / (float)dst_width, 0.0f);
2204
}
2205
2206
#if 0
// Disabled variant of filter_block(): filters a 6x6 block through the grid_x/grid_y contributor lists
// and writes each texel both as half floats and as qlog16 values (stored as floats, 4th component zeroed).
static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec3F* pSrc_block, half_vec3 *pDst_block_half3, vec4F *pDst_block_q16)
{
	vec3F row_pass[6][6]; // [y][x]

	// Horizontal pass into row_pass (a straight copy when grid_x is 6).
	if (grid_x != 6)
	{
		Resampler::Contrib_List* pH_lists = g_contrib_lists[grid_x];

		for (uint32_t row = 0; row < 6; row++)
		{
			for (uint32_t col = 0; col < 6; col++)
			{
				const Resampler::Contrib_List& taps = pH_lists[col];

				vec3F acc(0.0f);
				for (uint32_t t = 0; t < taps.n; t++)
					acc += pSrc_block[row * 6 + taps.p[t].pixel] * taps.p[t].weight;

				acc.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);

				row_pass[row][col] = acc;
			}
		}
	}
	else
	{
		memcpy(row_pass, pSrc_block, sizeof(vec3F) * 6 * 6);
	}

	// Writes one filtered texel in both output representations.
	const auto store_texel = [&](uint32_t dst_index, const vec3F& v)
	{
		for (uint32_t c = 0; c < 3; c++)
		{
			const basist::half_float h = basist::float_to_half(v[c]);

			pDst_block_half3[dst_index][c] = h;
			pDst_block_q16[dst_index][c] = (float)half_to_qlog16(h);
		}

		pDst_block_q16[dst_index][3] = 0.0f;
	};

	// Vertical pass (skipped when grid_y is 6: row_pass is converted as-is).
	if (grid_y != 6)
	{
		Resampler::Contrib_List* pV_lists = g_contrib_lists[grid_y];

		for (uint32_t col = 0; col < 6; col++)
		{
			for (uint32_t row = 0; row < 6; row++)
			{
				const Resampler::Contrib_List& taps = pV_lists[row];

				vec3F acc(0.0f);
				for (uint32_t t = 0; t < taps.n; t++)
					acc += row_pass[taps.p[t].pixel][col] * taps.p[t].weight;

				acc.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);

				store_texel(col + row * 6, acc);
			}
		}
	}
	else
	{
		for (uint32_t row = 0; row < 6; row++)
			for (uint32_t col = 0; col < 6; col++)
				store_texel(col + row * 6, row_pass[row][col]);
	}
}
#endif
2285
2286
static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec4F* pSrc_block, vec4F* pDst_block)
2287
{
2288
vec4F temp_block[6][6]; // [y][x]
2289
2290
// first filter rows to temp_block
2291
if (grid_x == 6)
2292
{
2293
memcpy(temp_block, pSrc_block, sizeof(vec4F) * 6 * 6);
2294
}
2295
else
2296
{
2297
Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];
2298
2299
for (uint32_t y = 0; y < 6; y++)
2300
{
2301
for (uint32_t x = 0; x < 6; x++)
2302
{
2303
vec3F p(0.0f);
2304
2305
for (uint32_t i = 0; i < pRow_lists[x].n; i++)
2306
p += vec3F(pSrc_block[y * 6 + pRow_lists[x].p[i].pixel]) * pRow_lists[x].p[i].weight;
2307
2308
p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);
2309
2310
temp_block[y][x] = p;
2311
} // x
2312
} // y
2313
}
2314
2315
// filter columns
2316
if (grid_y == 6)
2317
{
2318
for (uint32_t y = 0; y < 6; y++)
2319
{
2320
for (uint32_t x = 0; x < 6; x++)
2321
{
2322
for (uint32_t c = 0; c < 3; c++)
2323
pDst_block[x + y * 6][c] = temp_block[y][x][c];
2324
} // x
2325
} // y
2326
}
2327
else
2328
{
2329
Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y];
2330
2331
for (uint32_t x = 0; x < 6; x++)
2332
{
2333
for (uint32_t y = 0; y < 6; y++)
2334
{
2335
vec3F p(0.0f);
2336
2337
for (uint32_t i = 0; i < pCol_lists[y].n; i++)
2338
p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight;
2339
2340
p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);
2341
2342
pDst_block[x + y * 6] = p;
2343
2344
} // x
2345
} // y
2346
}
2347
}
2348
2349
static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec3F* pSrc_block, vec3F* pDst_block)
2350
{
2351
vec3F temp_block[6][6]; // [y][x]
2352
2353
// first filter rows to temp_block
2354
if (grid_x == 6)
2355
{
2356
memcpy(temp_block, pSrc_block, sizeof(vec3F) * 6 * 6);
2357
}
2358
else
2359
{
2360
Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];
2361
2362
for (uint32_t y = 0; y < 6; y++)
2363
{
2364
for (uint32_t x = 0; x < 6; x++)
2365
{
2366
vec3F p(0.0f);
2367
2368
for (uint32_t i = 0; i < pRow_lists[x].n; i++)
2369
p += vec3F(pSrc_block[y * 6 + pRow_lists[x].p[i].pixel]) * pRow_lists[x].p[i].weight;
2370
2371
temp_block[y][x] = p;
2372
} // x
2373
} // y
2374
}
2375
2376
// filter columns
2377
if (grid_y == 6)
2378
{
2379
memcpy((void *)pDst_block, temp_block, sizeof(vec3F) * 6 * 6);
2380
}
2381
else
2382
{
2383
Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y];
2384
2385
for (uint32_t x = 0; x < 6; x++)
2386
{
2387
for (uint32_t y = 0; y < 6; y++)
2388
{
2389
vec3F& p = pDst_block[x + y * 6];
2390
p.set(0.0f);
2391
2392
for (uint32_t i = 0; i < pCol_lists[y].n; i++)
2393
p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight;
2394
} // x
2395
} // y
2396
}
2397
}
2398
2399
static float diff_blocks(const vec4F* pA, const vec4F* pB)
2400
{
2401
const uint32_t BLOCK_T = 36;
2402
2403
float diff = 0.0f;
2404
for (uint32_t i = 0; i < BLOCK_T; i++)
2405
diff += square(pA[i][0] - pB[i][0]) + square(pA[i][1] - pB[i][1]) + square(pA[i][2] - pB[i][2]);
2406
2407
return diff * (1.0f / (float)BLOCK_T);
2408
}
2409
2410
static float sub_and_compute_std_dev(const vec3F* pA, const vec3F* pB)
2411
{
2412
const uint32_t BLOCK_T = 36;
2413
2414
vec3F mean(0.0f);
2415
2416
for (uint32_t i = 0; i < BLOCK_T; i++)
2417
{
2418
vec3F diff(pA[i] - pB[i]);
2419
mean += diff;
2420
}
2421
2422
mean *= (1.0f / (float)BLOCK_T);
2423
2424
vec3F diff_sum(0.0f);
2425
for (uint32_t i = 0; i < BLOCK_T; i++)
2426
{
2427
vec3F diff(pA[i] - pB[i]);
2428
diff -= mean;
2429
diff_sum += vec3F::component_mul(diff, diff);
2430
}
2431
2432
vec3F var(diff_sum * (1.0f / (float)BLOCK_T));
2433
2434
vec3F std_dev(sqrtf(var[0]), sqrtf(var[1]), sqrtf(var[2]));
2435
2436
return maximum(std_dev[0], std_dev[1], std_dev[2]);
2437
}
2438
2439
static void create_smooth_maps2(
2440
vector2D<float>& smooth_block_mse_scales,
2441
const image& orig_img,
2442
smooth_map_params& params, image* pUltra_smooth_img = nullptr)
2443
{
2444
const uint32_t width = orig_img.get_width();
2445
const uint32_t height = orig_img.get_height();
2446
//const uint32_t total_pixels = orig_img.get_total_pixels();
2447
const uint32_t num_comps = 3;
2448
2449
if (params.m_no_mse_scaling)
2450
{
2451
smooth_block_mse_scales.set_all(1.0f);
2452
return;
2453
}
2454
2455
smooth_block_mse_scales.resize(width, height);
2456
2457
image smooth_vis, med_smooth_vis, ultra_smooth_vis;
2458
2459
if (params.m_debug_images)
2460
{
2461
smooth_vis.resize(width, height);
2462
med_smooth_vis.resize(width, height);
2463
ultra_smooth_vis.resize(width, height);
2464
}
2465
2466
for (uint32_t y = 0; y < height; y++)
2467
{
2468
for (uint32_t x = 0; x < width; x++)
2469
{
2470
{
2471
tracked_stat_dbl comp_stats[4];
2472
for (int yd = -1; yd <= 1; yd++)
2473
{
2474
for (int xd = -1; xd <= 1; xd++)
2475
{
2476
const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);
2477
2478
comp_stats[0].update((float)p[0]);
2479
comp_stats[1].update((float)p[1]);
2480
comp_stats[2].update((float)p[2]);
2481
}
2482
}
2483
2484
float max_std_dev = 0.0f;
2485
for (uint32_t i = 0; i < num_comps; i++)
2486
max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev());
2487
2488
float yl = clampf(max_std_dev / params.m_max_smooth_std_dev, 0.0f, 1.0f);
2489
//yl = powf(yl, 2.0f);
2490
yl = powf(yl, 1.0f / 2.0f); // substantially less bits
2491
2492
smooth_block_mse_scales(x, y) = lerp(params.m_smooth_max_mse_scale, 1.0f, yl);
2493
2494
if (params.m_debug_images)
2495
{
2496
//smooth_vis(x, y).set(clamp((int)((smooth_block_mse_scales(x, y) - 1.0f) / (params.m_smooth_max_mse_scale - 1.0f) * 255.0f + .5f), 0, 255));
2497
// white=high local activity (edges/detail)
2498
// black=low local activity (smooth - error is amplified)
2499
smooth_vis(x, y).set(clamp((int)((yl * 255.0f) + .5f), 0, 255));
2500
}
2501
}
2502
2503
{
2504
tracked_stat_dbl comp_stats[4];
2505
2506
const int S = 3;
2507
for (int yd = -S; yd < S; yd++)
2508
{
2509
for (int xd = -S; xd < S; xd++)
2510
{
2511
const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);
2512
2513
comp_stats[0].update((float)p[0]);
2514
comp_stats[1].update((float)p[1]);
2515
comp_stats[2].update((float)p[2]);
2516
}
2517
}
2518
2519
float max_std_dev = 0.0f;
2520
for (uint32_t i = 0; i < num_comps; i++)
2521
max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev());
2522
2523
float yl = clampf(max_std_dev / params.m_max_med_smooth_std_dev, 0.0f, 1.0f);
2524
//yl = powf(yl, 2.0f);
2525
2526
smooth_block_mse_scales(x, y) = lerp(params.m_med_smooth_max_mse_scale, smooth_block_mse_scales(x, y), yl);
2527
2528
if (params.m_debug_images)
2529
med_smooth_vis(x, y).set((int)std::round(yl * 255.0f));
2530
}
2531
2532
{
2533
tracked_stat_dbl comp_stats[4];
2534
2535
const int S = 5;
2536
for (int yd = -S; yd < S; yd++)
2537
{
2538
for (int xd = -S; xd < S; xd++)
2539
{
2540
const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);
2541
2542
comp_stats[0].update((float)p[0]);
2543
comp_stats[1].update((float)p[1]);
2544
comp_stats[2].update((float)p[2]);
2545
}
2546
}
2547
2548
float max_std_dev = 0.0f;
2549
for (uint32_t i = 0; i < num_comps; i++)
2550
max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev());
2551
2552
float yl = clampf(max_std_dev / params.m_max_ultra_smooth_std_dev, 0.0f, 1.0f);
2553
yl = powf(yl, 2.0f);
2554
2555
smooth_block_mse_scales(x, y) = lerp(params.m_ultra_smooth_max_mse_scale, smooth_block_mse_scales(x, y), yl);
2556
2557
if (params.m_debug_images)
2558
ultra_smooth_vis(x, y).set((int)std::round(yl * 255.0f));
2559
}
2560
2561
}
2562
}
2563
2564
if (params.m_debug_images)
2565
{
2566
save_png("dbg_smooth_vis.png", smooth_vis);
2567
save_png("dbg_med_smooth_vis.png", med_smooth_vis);
2568
save_png("dbg_ultra_smooth_vis.png", ultra_smooth_vis);
2569
2570
image vis_img(width, height);
2571
2572
float max_scale = 0.0f;
2573
for (uint32_t y = 0; y < height; y++)
2574
for (uint32_t x = 0; x < width; x++)
2575
max_scale = basisu::maximumf(max_scale, smooth_block_mse_scales(x, y));
2576
2577
for (uint32_t y = 0; y < height; y++)
2578
for (uint32_t x = 0; x < width; x++)
2579
vis_img(x, y).set((int)std::round(smooth_block_mse_scales(x, y) * 255.0f / max_scale));
2580
2581
save_png("scale_vis.png", vis_img);
2582
}
2583
2584
if (pUltra_smooth_img)
2585
*pUltra_smooth_img = ultra_smooth_vis;
2586
}
2587
2588
// Dark-pixel error amplification used by compute_pixel_mse_itp() and compute_pixel_delta_itp().
// The I component of the original pixel is smoothstepped over [0, REALLY_DARK_I_THRESHOLD]; at I = 0
// the error is multiplied by the full scale below, and at or above the threshold it is left unchanged.
constexpr float REALLY_DARK_I_THRESHOLD = 0.0625f;
constexpr float REALLY_DARK_MSE_ERR_SCALE = 128.0f;      // multiplier for squared ITP error
constexpr float REALLY_DARK_DELTA_ITP_JND_SCALE = 5.0f;  // multiplier for delta ITP
2591
2592
static float compute_pixel_mse_itp(const vec3F& orig_pixel_itp, const vec3F& comp_pixel_itp, bool delta_itp_dark_adjustment)
2593
{
2594
float delta_i = orig_pixel_itp[0] - comp_pixel_itp[0];
2595
float delta_t = orig_pixel_itp[1] - comp_pixel_itp[1];
2596
float delta_p = orig_pixel_itp[2] - comp_pixel_itp[2];
2597
2598
float err = (delta_i * delta_i) + (delta_t * delta_t) + (delta_p * delta_p);
2599
2600
if (delta_itp_dark_adjustment)
2601
{
2602
// We have to process a large range of inputs, including extremely dark inputs.
2603
// Artifically amplify MSE on very dark pixels - otherwise they'll be overly compressed at higher lambdas.
2604
// This is to better handle very dark signals which could be explictly overexposed.
2605
float s = bu_math::smoothstep(0.0f, REALLY_DARK_I_THRESHOLD, orig_pixel_itp[0]);
2606
s = lerp(REALLY_DARK_MSE_ERR_SCALE, 1.0f, s);
2607
err *= s;
2608
}
2609
2610
return err;
2611
}
2612
2613
static float compute_block_mse_itp(uint32_t block_w, uint32_t block_h, const vec3F* pOrig_pixels_itp, const vec3F* pComp_pixels_itp, bool delta_itp_dark_adjustment)
2614
{
2615
float total_mse = 0.0f;
2616
2617
for (uint32_t y = 0; y < block_h; y++)
2618
{
2619
for (uint32_t x = 0; x < block_w; x++)
2620
{
2621
total_mse += compute_pixel_mse_itp(pOrig_pixels_itp[x + y * block_w], pComp_pixels_itp[x + y * block_w], delta_itp_dark_adjustment);
2622
} // x
2623
} // y
2624
2625
return total_mse * (1.0f / (float)(block_w * block_h));
2626
}
2627
2628
static float compute_block_ssim_itp(uint32_t block_w, uint32_t block_h, const vec3F* pOrig_pixels_itp, const vec3F* pComp_pixels_itp)
2629
{
2630
const uint32_t n = block_w * block_h;
2631
assert(n <= 36);
2632
2633
stats<float> x_stats[3], y_stats[3];
2634
comparative_stats<float> xy_cov[3];
2635
2636
for (uint32_t c = 0; c < 3; c++)
2637
{
2638
x_stats[c].calc_simplified(n, &pOrig_pixels_itp[0][c], 3);
2639
y_stats[c].calc_simplified(n, &pComp_pixels_itp[0][c], 3);
2640
}
2641
2642
for (uint32_t c = 0; c < 3; c++)
2643
xy_cov[c].calc_cov(n, &pOrig_pixels_itp[0][c], &pComp_pixels_itp[0][c], 3, 3, &x_stats[c], &y_stats[c]);
2644
2645
float ssim[3];
2646
const double d = 1.0f, k1 = .01f, k2 = .03f;
2647
2648
// weight mean error more highly to reduce blocking
2649
float ap = 1.5f, bp = 1.0f, cp = 1.0f;
2650
2651
const double s_c1 = square(k1 * d), s_c2 = square(k2 * d);
2652
const double s_c3(s_c2 * .5f);
2653
2654
for (uint32_t c = 0; c < 3; c++)
2655
{
2656
float lum = (float)((2.0f * x_stats[c].m_avg * y_stats[c].m_avg + s_c1) / (square(x_stats[c].m_avg) + square(y_stats[c].m_avg) + s_c1));
2657
lum = saturate(lum);
2658
2659
float con = (float)((2.0f * x_stats[c].m_std_dev * y_stats[c].m_std_dev + s_c2) / (x_stats[c].m_var + y_stats[c].m_var + s_c2));
2660
con = saturate(con);
2661
2662
float str = (float)((xy_cov[c].m_cov + s_c3) / (x_stats[c].m_std_dev * y_stats[c].m_std_dev + s_c3));
2663
str = saturate(str);
2664
2665
ssim[c] = powf(lum, ap) * powf(con, bp) * powf(str, cp);
2666
}
2667
2668
#if 0
2669
float final_ssim = (ssim[0] * .4f + ssim[1] * .3f + ssim[2] * .3f);
2670
#elif 1
2671
float final_ssim = ssim[0] * ssim[1] * ssim[2];
2672
#else
2673
const float LP = .75f;
2674
float final_ssim = ssim[0] * powf((ssim[1] + ssim[2]) * .5f, LP);
2675
#endif
2676
2677
return final_ssim;
2678
}
2679
2680
// delta ITP, 1.0 is JND (Rec. ITU-R BT.2124), modified for higher error at low light
2681
static float compute_pixel_delta_itp(const vec3F& a, const vec3F& b, const vec3F& orig, bool delta_itp_dark_adjustment)
2682
{
2683
float delta_i = a[0] - b[0];
2684
float delta_t = a[1] - b[1];
2685
float delta_p = a[2] - b[2];
2686
2687
float err = 720.0f * sqrtf((delta_i * delta_i) + (delta_t * delta_t) + (delta_p * delta_p));
2688
2689
float s = bu_math::smoothstep(0.0f, REALLY_DARK_I_THRESHOLD, orig[0]);
2690
2691
if (delta_itp_dark_adjustment)
2692
{
2693
// This is to better handle very dark signals which could be explictly overexposed.
2694
s = lerp(REALLY_DARK_DELTA_ITP_JND_SCALE, 1.0f, s);
2695
err *= s;
2696
}
2697
2698
return err;
2699
}
2700
2701
struct candidate_encoding
2702
{
2703
encoding_type m_encoding_type;
2704
2705
basist::half_float m_solid_color[3];
2706
2707
uint32_t m_run_len;
2708
2709
vec3F m_comp_pixels[MAX_BLOCK_H][MAX_BLOCK_W]; // [y][x]
2710
vec3F m_comp_pixels_itp[MAX_BLOCK_H][MAX_BLOCK_W]; // [y][x]
2711
2712
endpoint_mode m_endpoint_mode;
2713
block_mode m_block_mode;
2714
2715
bitwise_coder m_coder;
2716
2717
// The block to code, which may not be valid ASTC. This may have to be transcoded (by requantizing the weights/endpoints) before it's valid ASTC.
2718
// Note the endpoints may be coded endpoints OR transcoded endpoints, depending on the encoding type.
2719
astc_helpers::log_astc_block m_coded_log_blk;
2720
2721
// The block the decoder outputs.
2722
astc_helpers::log_astc_block m_decomp_log_blk;
2723
2724
int m_reuse_delta_index;
2725
2726
float m_t, m_d, m_bits;
2727
2728
candidate_encoding()
2729
{
2730
clear();
2731
}
2732
2733
candidate_encoding(const candidate_encoding &other)
2734
{
2735
*this = other;
2736
}
2737
2738
candidate_encoding(candidate_encoding&& other)
2739
{
2740
*this = std::move(other);
2741
}
2742
2743
candidate_encoding& operator=(const candidate_encoding& rhs)
2744
{
2745
if (this == &rhs)
2746
return *this;
2747
2748
m_encoding_type = rhs.m_encoding_type;
2749
memcpy(m_solid_color, rhs.m_solid_color, sizeof(m_solid_color));
2750
m_run_len = rhs.m_run_len;
2751
memcpy(m_comp_pixels, rhs.m_comp_pixels, sizeof(m_comp_pixels));
2752
m_endpoint_mode = rhs.m_endpoint_mode;
2753
m_block_mode = rhs.m_block_mode;
2754
m_coder = rhs.m_coder;
2755
m_coded_log_blk = rhs.m_coded_log_blk;
2756
m_decomp_log_blk = rhs.m_decomp_log_blk;
2757
m_reuse_delta_index = rhs.m_reuse_delta_index;
2758
2759
return *this;
2760
}
2761
2762
candidate_encoding& operator=(candidate_encoding&& rhs)
2763
{
2764
if (this == &rhs)
2765
return *this;
2766
2767
m_encoding_type = rhs.m_encoding_type;
2768
memcpy(m_solid_color, rhs.m_solid_color, sizeof(m_solid_color));
2769
m_run_len = rhs.m_run_len;
2770
memcpy(m_comp_pixels, rhs.m_comp_pixels, sizeof(m_comp_pixels));
2771
m_endpoint_mode = rhs.m_endpoint_mode;
2772
m_block_mode = rhs.m_block_mode;
2773
m_coder = std::move(rhs.m_coder);
2774
m_coded_log_blk = rhs.m_coded_log_blk;
2775
m_decomp_log_blk = rhs.m_decomp_log_blk;
2776
m_reuse_delta_index = rhs.m_reuse_delta_index;
2777
2778
return *this;
2779
}
2780
2781
void clear()
2782
{
2783
m_encoding_type = encoding_type::cInvalid;
2784
2785
clear_obj(m_solid_color);
2786
2787
m_run_len = 0;
2788
2789
clear_obj(m_comp_pixels);
2790
2791
m_endpoint_mode = endpoint_mode::cInvalid;
2792
m_block_mode = block_mode::cInvalid;
2793
2794
m_coder.restart();
2795
2796
m_coded_log_blk.clear();
2797
m_decomp_log_blk.clear();
2798
2799
m_t = 0;
2800
m_d = 0;
2801
m_bits = 0;
2802
2803
m_reuse_delta_index = 0;
2804
}
2805
};
2806
2807
bool decode_astc_block(uint32_t block_w, uint32_t block_h, astc_helpers::log_astc_block &log_blk, vec3F *pPixels)
2808
{
2809
assert((block_w <= 6) && (block_h <= 6));
2810
2811
half_vec4 decoded_pixels_half4[6 * 6]; // [y][x]
2812
bool status = astc_helpers::decode_block(log_blk, decoded_pixels_half4, block_w, block_h, astc_helpers::cDecodeModeHDR16);
2813
assert(status);
2814
2815
if (!status)
2816
return false;
2817
2818
for (uint32_t y = 0; y < block_h; y++)
2819
{
2820
for (uint32_t x = 0; x < block_w; x++)
2821
{
2822
pPixels[x + y * block_w].set(
2823
basist::half_to_float(decoded_pixels_half4[x + y * block_w][0]),
2824
basist::half_to_float(decoded_pixels_half4[x + y * block_w][1]),
2825
basist::half_to_float(decoded_pixels_half4[x + y * block_w][2]));
2826
} // x
2827
} //y
2828
2829
return true;
2830
}
2831
2832
// Returns true if the logical block can be packed into a physical ASTC block.
// The packed result itself is discarded.
static inline bool validate_log_blk(const astc_helpers::log_astc_block &decomp_blk)
{
	astc_helpers::astc_block scratch_phys_blk;

	const bool packable = astc_helpers::pack_astc_block(scratch_phys_blk, decomp_blk);
	return packable;
}
2837
2838
// Debug aid: when nonzero, decode_file() reads a 16-bit marker before each decode step and fails unless it equals 0xDEAD.
#define SYNC_MARKERS (0)
2839
2840
static bool decode_file(const uint8_vec& comp_data, vector2D<astc_helpers::astc_block>& decoded_blocks, uint32_t &width, uint32_t &height)
2841
{
2842
interval_timer tm;
2843
tm.start();
2844
2845
const uint32_t BLOCK_W = 6, BLOCK_H = 6;
2846
2847
width = 0;
2848
height = 0;
2849
2850
if (comp_data.size() <= 2*3)
2851
return false;
2852
2853
basist::bitwise_decoder decoder;
2854
if (!decoder.init(comp_data.data(), comp_data.size_u32()))
2855
return false;
2856
2857
if (decoder.get_bits(16) != 0xABCD)
2858
return false;
2859
2860
width = decoder.get_bits(16);
2861
height = decoder.get_bits(16);
2862
2863
if (!width || !height || (width > MAX_ASTC_HDR_6X6_DIM) || (height > MAX_ASTC_HDR_6X6_DIM))
2864
return false;
2865
2866
const uint32_t num_blocks_x = (width + BLOCK_W - 1) / BLOCK_W;
2867
const uint32_t num_blocks_y = (height + BLOCK_H - 1) / BLOCK_H;
2868
const uint32_t total_blocks = num_blocks_x * num_blocks_y;
2869
2870
decoded_blocks.resize(num_blocks_x, num_blocks_y);
2871
//memset(decoded_blocks.get_ptr(), 0, decoded_blocks.size_in_bytes());
2872
2873
vector2D<astc_helpers::log_astc_block> decoded_log_blocks(num_blocks_x, num_blocks_y);
2874
//memset(decoded_log_blocks.get_ptr(), 0, decoded_log_blocks.size_in_bytes());
2875
2876
uint32_t cur_bx = 0, cur_by = 0;
2877
uint32_t step_counter = 0;
2878
BASISU_NOTE_UNUSED(step_counter);
2879
2880
while (cur_by < num_blocks_y)
2881
{
2882
step_counter++;
2883
2884
//if ((cur_bx == 9) && (cur_by == 13))
2885
// printf("!");
2886
2887
#if SYNC_MARKERS
2888
uint32_t mk = decoder.get_bits(16);
2889
if (mk != 0xDEAD)
2890
{
2891
printf("!");
2892
assert(0);
2893
return false;
2894
}
2895
#endif
2896
if (decoder.get_bits_remaining() < 1)
2897
return false;
2898
2899
encoding_type et = encoding_type::cBlock;
2900
2901
uint32_t b0 = decoder.get_bits(1);
2902
if (!b0)
2903
{
2904
uint32_t b1 = decoder.get_bits(1);
2905
if (b1)
2906
et = encoding_type::cReuse;
2907
else
2908
{
2909
uint32_t b2 = decoder.get_bits(1);
2910
if (b2)
2911
et = encoding_type::cSolid;
2912
else
2913
et = encoding_type::cRun;
2914
}
2915
}
2916
2917
switch (et)
2918
{
2919
case encoding_type::cRun:
2920
{
2921
if (!cur_bx && !cur_by)
2922
return false;
2923
2924
const uint32_t run_len = decoder.decode_vlc(5) + 1;
2925
2926
uint32_t num_blocks_remaining = total_blocks - (cur_bx + cur_by * num_blocks_x);
2927
if (run_len > num_blocks_remaining)
2928
return false;
2929
2930
uint32_t prev_bx = cur_bx, prev_by = cur_by;
2931
2932
if (cur_bx)
2933
prev_bx--;
2934
else
2935
{
2936
prev_bx = num_blocks_x - 1;
2937
prev_by--;
2938
}
2939
2940
const astc_helpers::log_astc_block& prev_log_blk = decoded_log_blocks(prev_bx, prev_by);
2941
const astc_helpers::astc_block& prev_phys_blk = decoded_blocks(prev_bx, prev_by);
2942
2943
for (uint32_t i = 0; i < run_len; i++)
2944
{
2945
decoded_log_blocks(cur_bx, cur_by) = prev_log_blk;
2946
decoded_blocks(cur_bx, cur_by) = prev_phys_blk;
2947
2948
cur_bx++;
2949
if (cur_bx == num_blocks_x)
2950
{
2951
cur_bx = 0;
2952
cur_by++;
2953
}
2954
}
2955
2956
break;
2957
}
2958
case encoding_type::cSolid:
2959
{
2960
const basist::half_float rh = (basist::half_float)decoder.get_bits(15);
2961
const basist::half_float gh = (basist::half_float)decoder.get_bits(15);
2962
const basist::half_float bh = (basist::half_float)decoder.get_bits(15);
2963
2964
astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
2965
2966
log_blk.clear();
2967
log_blk.m_solid_color_flag_hdr = true;
2968
log_blk.m_solid_color[0] = rh;
2969
log_blk.m_solid_color[1] = gh;
2970
log_blk.m_solid_color[2] = bh;
2971
log_blk.m_solid_color[3] = basist::float_to_half(1.0f);
2972
2973
bool status = astc_helpers::pack_astc_block(decoded_blocks(cur_bx, cur_by), log_blk);
2974
if (!status)
2975
return false;
2976
2977
cur_bx++;
2978
if (cur_bx == num_blocks_x)
2979
{
2980
cur_bx = 0;
2981
cur_by++;
2982
}
2983
2984
break;
2985
}
2986
case encoding_type::cReuse:
2987
{
2988
if (!cur_bx && !cur_by)
2989
return false;
2990
2991
const uint32_t reuse_delta_index = decoder.get_bits(REUSE_XY_DELTA_BITS);
2992
2993
const int reuse_delta_x = g_reuse_xy_deltas[reuse_delta_index].m_x;
2994
const int reuse_delta_y = g_reuse_xy_deltas[reuse_delta_index].m_y;
2995
2996
const int prev_bx = cur_bx + reuse_delta_x, prev_by = cur_by + reuse_delta_y;
2997
if ((prev_bx < 0) || (prev_bx >= (int)num_blocks_x))
2998
return false;
2999
if (prev_by < 0)
3000
return false;
3001
3002
const astc_helpers::log_astc_block& prev_log_blk = decoded_log_blocks(prev_bx, prev_by);
3003
const astc_helpers::astc_block& prev_phys_blk = decoded_blocks(prev_bx, prev_by);
3004
3005
if (prev_log_blk.m_solid_color_flag_hdr)
3006
return false;
3007
3008
astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
3009
astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);
3010
3011
log_blk = prev_log_blk;
3012
3013
const uint32_t total_grid_weights = log_blk.m_grid_width * log_blk.m_grid_height * (log_blk.m_dual_plane ? 2 : 1);
3014
3015
bool status = basist::astc_6x6_hdr::decode_values(decoder, total_grid_weights, log_blk.m_weight_ise_range, log_blk.m_weights);
3016
if (!status)
3017
return false;
3018
3019
astc_helpers::log_astc_block decomp_blk;
3020
status = astc_helpers::unpack_block(&prev_phys_blk, decomp_blk, BLOCK_W, BLOCK_H);
3021
if (!status)
3022
return false;
3023
3024
uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
3025
basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, log_blk.m_weight_ise_range, transcode_weights, decomp_blk.m_weight_ise_range);
3026
3027
copy_weight_grid(log_blk.m_dual_plane, log_blk.m_grid_width, log_blk.m_grid_height, transcode_weights, decomp_blk);
3028
3029
status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);
3030
if (!status)
3031
return false;
3032
3033
cur_bx++;
3034
if (cur_bx == num_blocks_x)
3035
{
3036
cur_bx = 0;
3037
cur_by++;
3038
}
3039
3040
break;
3041
}
3042
case encoding_type::cBlock:
3043
{
3044
const block_mode bm = (block_mode)decoder.decode_truncated_binary((uint32_t)block_mode::cBMTotalModes);
3045
const endpoint_mode em = (endpoint_mode)decoder.decode_truncated_binary((uint32_t)endpoint_mode::cTotal);
3046
3047
switch (em)
3048
{
3049
case endpoint_mode::cUseLeft:
3050
case endpoint_mode::cUseUpper:
3051
{
3052
int neighbor_bx = cur_bx, neighbor_by = cur_by;
3053
3054
if (em == endpoint_mode::cUseLeft)
3055
neighbor_bx--;
3056
else
3057
neighbor_by--;
3058
3059
if ((neighbor_bx < 0) || (neighbor_by < 0))
3060
return false;
3061
3062
const astc_helpers::log_astc_block& neighbor_blk = decoded_log_blocks(neighbor_bx, neighbor_by);
3063
if (!neighbor_blk.m_color_endpoint_modes[0])
3064
return false;
3065
3066
const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm];
3067
const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem);
3068
3069
if (bmd.m_cem != neighbor_blk.m_color_endpoint_modes[0])
3070
return false;
3071
3072
astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
3073
astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);
3074
3075
log_blk.clear();
3076
log_blk.m_num_partitions = 1;
3077
log_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;
3078
log_blk.m_endpoint_ise_range = neighbor_blk.m_endpoint_ise_range;
3079
log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range;
3080
log_blk.m_grid_width = (uint8_t)bmd.m_grid_x;
3081
log_blk.m_grid_height = (uint8_t)bmd.m_grid_y;
3082
log_blk.m_dual_plane = (uint8_t)bmd.m_dp;
3083
log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
3084
3085
memcpy(log_blk.m_endpoints, neighbor_blk.m_endpoints, num_endpoint_values);
3086
3087
const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1);
3088
3089
bool status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights);
3090
if (!status)
3091
return false;
3092
3093
astc_helpers::log_astc_block decomp_blk;
3094
decomp_blk.clear();
3095
3096
decomp_blk.m_num_partitions = 1;
3097
decomp_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;
3098
decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range;
3099
decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range;
3100
decomp_blk.m_dual_plane = bmd.m_dp;
3101
decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
3102
3103
basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints);
3104
3105
uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
3106
basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range);
3107
3108
copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk);
3109
3110
status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);
3111
if (!status)
3112
return false;
3113
3114
cur_bx++;
3115
if (cur_bx == num_blocks_x)
3116
{
3117
cur_bx = 0;
3118
cur_by++;
3119
}
3120
3121
break;
3122
}
3123
case endpoint_mode::cUseLeftDelta:
3124
case endpoint_mode::cUseUpperDelta:
3125
{
3126
int neighbor_bx = cur_bx, neighbor_by = cur_by;
3127
3128
if (em == endpoint_mode::cUseLeftDelta)
3129
neighbor_bx--;
3130
else
3131
neighbor_by--;
3132
3133
if ((neighbor_bx < 0) || (neighbor_by < 0))
3134
return false;
3135
3136
const astc_helpers::log_astc_block& neighbor_blk = decoded_log_blocks(neighbor_bx, neighbor_by);
3137
if (!neighbor_blk.m_color_endpoint_modes[0])
3138
return false;
3139
3140
const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm];
3141
const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem);
3142
3143
if (bmd.m_cem != neighbor_blk.m_color_endpoint_modes[0])
3144
return false;
3145
3146
astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
3147
astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);
3148
3149
log_blk.clear();
3150
log_blk.m_num_partitions = 1;
3151
log_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;
3152
log_blk.m_dual_plane = bmd.m_dp;
3153
log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
3154
3155
log_blk.m_endpoint_ise_range = (uint8_t)bmd.m_endpoint_ise_range;
3156
basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, neighbor_blk.m_endpoint_ise_range, neighbor_blk.m_endpoints, bmd.m_endpoint_ise_range, log_blk.m_endpoints);
3157
3158
const int total_endpoint_delta_vals = 1 << NUM_ENDPOINT_DELTA_BITS;
3159
const int low_delta_limit = -(total_endpoint_delta_vals / 2); // high_delta_limit = (total_endpoint_delta_vals / 2) - 1;
3160
3161
const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range).m_ISE_to_rank;
3162
const auto& rank_to_ise = astc_helpers::g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range).m_rank_to_ISE;
3163
const int total_endpoint_levels = astc_helpers::get_ise_levels(log_blk.m_endpoint_ise_range);
3164
3165
for (uint32_t i = 0; i < num_endpoint_values; i++)
3166
{
3167
int cur_val = ise_to_rank[log_blk.m_endpoints[i]];
3168
3169
int delta = (int)decoder.get_bits(NUM_ENDPOINT_DELTA_BITS) + low_delta_limit;
3170
3171
cur_val += delta;
3172
if ((cur_val < 0) || (cur_val >= total_endpoint_levels))
3173
return false;
3174
3175
log_blk.m_endpoints[i] = rank_to_ise[cur_val];
3176
}
3177
3178
log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range;
3179
log_blk.m_grid_width = (uint8_t)bmd.m_grid_x;
3180
log_blk.m_grid_height = (uint8_t)bmd.m_grid_y;
3181
3182
const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1);
3183
3184
bool status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights);
3185
if (!status)
3186
return false;
3187
3188
astc_helpers::log_astc_block decomp_blk;
3189
decomp_blk.clear();
3190
3191
decomp_blk.m_num_partitions = 1;
3192
decomp_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;
3193
decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range;
3194
decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range;
3195
decomp_blk.m_dual_plane = (uint8_t)bmd.m_dp;
3196
decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
3197
3198
basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints);
3199
3200
uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
3201
basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range);
3202
3203
copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk);
3204
3205
status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);
3206
if (!status)
3207
return false;
3208
3209
cur_bx++;
3210
if (cur_bx == num_blocks_x)
3211
{
3212
cur_bx = 0;
3213
cur_by++;
3214
}
3215
3216
break;
3217
}
3218
case endpoint_mode::cRaw:
3219
{
3220
const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm];
3221
3222
const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem);
3223
3224
astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
3225
astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);
3226
3227
log_blk.clear();
3228
log_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions;
3229
3230
for (uint32_t p = 0; p < bmd.m_num_partitions; p++)
3231
log_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem;
3232
3233
log_blk.m_endpoint_ise_range = (uint8_t)bmd.m_endpoint_ise_range;
3234
log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range;
3235
3236
log_blk.m_grid_width = (uint8_t)bmd.m_grid_x;
3237
log_blk.m_grid_height = (uint8_t)bmd.m_grid_y;
3238
log_blk.m_dual_plane = (uint8_t)bmd.m_dp;
3239
log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
3240
3241
if (bmd.m_num_partitions == 2)
3242
{
3243
const uint32_t unique_partition_index = decoder.decode_truncated_binary(NUM_UNIQUE_PARTITIONS2);
3244
log_blk.m_partition_id = (uint16_t)g_part2_unique_index_to_seed[unique_partition_index];
3245
}
3246
else if (bmd.m_num_partitions == 3)
3247
{
3248
const uint32_t unique_partition_index = decoder.decode_truncated_binary(NUM_UNIQUE_PARTITIONS3);
3249
log_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_partition_index];
3250
}
3251
3252
bool status = decode_values(decoder, num_endpoint_values * bmd.m_num_partitions, bmd.m_endpoint_ise_range, log_blk.m_endpoints);
3253
if (!status)
3254
return false;
3255
3256
const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1);
3257
3258
status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights);
3259
if (!status)
3260
return false;
3261
3262
astc_helpers::log_astc_block decomp_blk;
3263
decomp_blk.clear();
3264
3265
decomp_blk.m_dual_plane = bmd.m_dp;
3266
decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
3267
decomp_blk.m_partition_id = log_blk.m_partition_id;
3268
3269
decomp_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions;
3270
3271
for (uint32_t p = 0; p < bmd.m_num_partitions; p++)
3272
decomp_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem;
3273
3274
decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range;
3275
decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range;
3276
3277
for (uint32_t p = 0; p < bmd.m_num_partitions; p++)
3278
basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, bmd.m_endpoint_ise_range, log_blk.m_endpoints + num_endpoint_values * p, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints + num_endpoint_values * p);
3279
3280
uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
3281
basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range);
3282
3283
copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk);
3284
3285
status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);
3286
if (!status)
3287
return false;
3288
3289
cur_bx++;
3290
if (cur_bx == num_blocks_x)
3291
{
3292
cur_bx = 0;
3293
cur_by++;
3294
}
3295
3296
break;
3297
}
3298
default:
3299
{
3300
assert(0);
3301
return false;
3302
}
3303
}
3304
3305
break;
3306
}
3307
default:
3308
{
3309
assert(0);
3310
return false;
3311
}
3312
}
3313
}
3314
3315
if (decoder.get_bits(16) != 0xA742)
3316
{
3317
fmt_error_printf("End marker not found!\n");
3318
return false;
3319
}
3320
3321
//fmt_printf("Total decode_file() time: {} secs\n", tm.get_elapsed_secs());
3322
3323
return true;
3324
}
3325
3326
static bool unpack_physical_astc_block(const void* pBlock, uint32_t block_width, uint32_t block_height, vec4F* pPixels)
3327
{
3328
astc_helpers::log_astc_block log_blk;
3329
if (!astc_helpers::unpack_block(pBlock, log_blk, block_width, block_height))
3330
return false;
3331
3332
basist::half_float half_block[MAX_BLOCK_W * MAX_BLOCK_H][4];
3333
if (!astc_helpers::decode_block(log_blk, half_block, block_width, block_height, astc_helpers::cDecodeModeHDR16))
3334
return false;
3335
3336
const uint32_t total_block_pixels = block_width * block_height;
3337
for (uint32_t p = 0; p < total_block_pixels; p++)
3338
{
3339
pPixels[p][0] = basist::half_to_float(half_block[p][0]);
3340
pPixels[p][1] = basist::half_to_float(half_block[p][1]);
3341
pPixels[p][2] = basist::half_to_float(half_block[p][2]);
3342
pPixels[p][3] = basist::half_to_float(half_block[p][3]);
3343
}
3344
3345
return true;
3346
}
3347
3348
static bool unpack_physical_astc_block_google(const void* pBlock, uint32_t block_width, uint32_t block_height, vec4F* pPixels)
3349
{
3350
return basisu_astc::astc::decompress_hdr((float *)pPixels, (uint8_t*)pBlock, block_width, block_height);
3351
}
3352
3353
static bool pack_bc6h_image(const imagef &src_img, vector2D<basist::bc6h_block> &bc6h_blocks, imagef *pPacked_bc6h_img, const fast_bc6h_params &enc_params)
3354
{
3355
const uint32_t width = src_img.get_width();
3356
const uint32_t height = src_img.get_height();
3357
3358
if (pPacked_bc6h_img)
3359
pPacked_bc6h_img->resize(width, height);
3360
3361
interval_timer tm;
3362
double total_enc_time = 0.0f;
3363
BASISU_NOTE_UNUSED(total_enc_time);
3364
3365
const uint32_t num_blocks_x = src_img.get_block_width(4);
3366
const uint32_t num_blocks_y = src_img.get_block_height(4);
3367
3368
bc6h_blocks.resize(num_blocks_x, num_blocks_y);
3369
3370
for (uint32_t by = 0; by < num_blocks_y; by++)
3371
{
3372
for (uint32_t bx = 0; bx < num_blocks_x; bx++)
3373
{
3374
// Extract source image block
3375
vec4F block_pixels[4][4]; // [y][x]
3376
src_img.extract_block_clamped(&block_pixels[0][0], bx * 4, by * 4, 4, 4);
3377
3378
basist::half_float half_pixels[16 * 3]; // [y][x]
3379
3380
for (uint32_t y = 0; y < 4; y++)
3381
{
3382
for (uint32_t x = 0; x < 4; x++)
3383
{
3384
for (uint32_t c = 0; c < 3; c++)
3385
{
3386
float v = block_pixels[y][x][c];
3387
3388
basist::half_float h = basist::float_to_half(v);
3389
3390
half_pixels[(x + y * 4) * 3 + c] = h;
3391
3392
} // c
3393
3394
} // x
3395
} // y
3396
3397
basist::bc6h_block& bc6h_blk = bc6h_blocks(bx, by);
3398
3399
tm.start();
3400
3401
basist::astc_6x6_hdr::fast_encode_bc6h(half_pixels, &bc6h_blk, enc_params);
3402
3403
total_enc_time += tm.get_elapsed_secs();
3404
3405
if (pPacked_bc6h_img)
3406
{
3407
basist::half_float unpacked_blk[16 * 3];
3408
bool status = unpack_bc6h(&bc6h_blk, unpacked_blk, false);
3409
assert(status);
3410
if (!status)
3411
{
3412
fmt_error_printf("unpack_bc6h() failed\n");
3413
return false;
3414
}
3415
3416
for (uint32_t y = 0; y < 4; y++)
3417
{
3418
for (uint32_t x = 0; x < 4; x++)
3419
{
3420
vec4F p;
3421
3422
for (uint32_t c = 0; c < 3; c++)
3423
{
3424
float v = basist::half_to_float(unpacked_blk[(x + y * 4) * 3 + c]);
3425
p[c] = v;
3426
3427
} // c
3428
3429
p[3] = 1.0f;
3430
3431
pPacked_bc6h_img->set_clipped(bx * 4 + x, by * 4 + y, p);
3432
} // x
3433
} // y
3434
}
3435
3436
} // bx
3437
} // by
3438
3439
//fmt_printf("Total BC6H encode time: {}\n", total_enc_time);
3440
3441
return true;
3442
}
3443
3444
static float dist_to_line_squared(const vec3F& p, const vec3F &line_org, const vec3F &line_dir)
3445
{
3446
vec3F q(p - line_org);
3447
vec3F v(q - q.dot(line_dir) * line_dir);
3448
return v.dot(v);
3449
}
3450
3451
static void estimate_partitions_mode7_and_11(
3452
uint32_t num_parts, // 2 or 3 partitions
3453
uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, // list of all unique, canonicalized patterns
3454
uint32_t num_pats_to_examine, const uint32_t* pUnique_pat_indices_to_examine, // indices of pats to examine
3455
const vec3F *pHalf_pixels_as_floats, // block's half pixel values casted to floats
3456
const astc_hdr_codec_base_options& coptions, // options
3457
uint32_t num_desired_pats,
3458
int *pDesired_pat_indices_mode11, int *pDesired_pat_indices_mode7) // output indices
3459
{
3460
BASISU_NOTE_UNUSED(coptions);
3461
BASISU_NOTE_UNUSED(num_unique_pats);
3462
3463
const uint32_t BLOCK_W = 6, BLOCK_H = 6, MAX_PARTS = 3; // BLOCK_T = 6 * 6
3464
assert(num_parts <= MAX_PARTS);
3465
3466
struct candidate_res
3467
{
3468
float m_total_sq_dist;
3469
uint32_t m_index;
3470
bool operator< (const candidate_res& rhs) const { return m_total_sq_dist < rhs.m_total_sq_dist; }
3471
};
3472
3473
const uint32_t MAX_CANDIDATES = 1024;
3474
assert(num_desired_pats && (num_desired_pats <= MAX_CANDIDATES));
3475
3476
candidate_res mode11_candidates[MAX_CANDIDATES];
3477
candidate_res mode7_candidates[MAX_CANDIDATES];
3478
3479
const vec3F grayscale_axis(0.5773502691f);
3480
3481
for (uint32_t examine_iter = 0; examine_iter < num_pats_to_examine; examine_iter++)
3482
{
3483
const uint32_t unique_part_index = pUnique_pat_indices_to_examine[examine_iter];
3484
assert(unique_part_index < num_unique_pats);
3485
3486
const partition_pattern_vec* pPat = &pUnique_pats[unique_part_index];
3487
3488
vec3F part_means[MAX_PARTS];
3489
uint32_t part_total_texels[MAX_PARTS] = { 0 };
3490
3491
for (uint32_t i = 0; i < num_parts; i++)
3492
part_means[i].clear();
3493
3494
for (uint32_t y = 0; y < BLOCK_H; y++)
3495
{
3496
for (uint32_t x = 0; x < BLOCK_W; x++)
3497
{
3498
const uint32_t part_index = (*pPat)(x, y);
3499
assert(part_index < num_parts);
3500
3501
part_means[part_index] += pHalf_pixels_as_floats[x + y * BLOCK_W];
3502
part_total_texels[part_index]++;
3503
3504
} // x
3505
} // y
3506
3507
for (uint32_t i = 0; i < num_parts; i++)
3508
{
3509
assert(part_total_texels[i]);
3510
part_means[i] /= (float)part_total_texels[i];
3511
}
3512
3513
float part_cov[MAX_PARTS][6];
3514
memset(part_cov, 0, sizeof(part_cov));
3515
3516
for (uint32_t y = 0; y < BLOCK_H; y++)
3517
{
3518
for (uint32_t x = 0; x < BLOCK_W; x++)
3519
{
3520
const uint32_t part_index = (*pPat)(x, y);
3521
assert(part_index < num_parts);
3522
3523
const vec3F p(pHalf_pixels_as_floats[x + y * BLOCK_W] - part_means[part_index]);
3524
3525
const float r = p[0], g = p[1], b = p[2];
3526
3527
part_cov[part_index][0] += r * r;
3528
part_cov[part_index][1] += r * g;
3529
part_cov[part_index][2] += r * b;
3530
part_cov[part_index][3] += g * g;
3531
part_cov[part_index][4] += g * b;
3532
part_cov[part_index][5] += b * b;
3533
3534
} // x
3535
} // y
3536
3537
// For each partition compute the total variance of all channels.
3538
float total_variance[MAX_PARTS];
3539
for (uint32_t part_index = 0; part_index < num_parts; part_index++)
3540
total_variance[part_index] = part_cov[part_index][0] + part_cov[part_index][3] + part_cov[part_index][5];
3541
3542
vec3F part_axis[MAX_PARTS];
3543
float mode11_eigenvalue_est[MAX_PARTS]; // For each partition, compute the variance along the principle axis
3544
float mode7_eigenvalue_est[MAX_PARTS]; // For each partition, compute the variance along the principle axis
3545
3546
for (uint32_t part_index = 0; part_index < num_parts; part_index++)
3547
{
3548
float* pCov = &part_cov[part_index][0];
3549
3550
float xr = .9f, xg = 1.0f, xb = .7f;
3551
3552
const uint32_t NUM_POWER_ITERS = 4;
3553
for (uint32_t iter = 0; iter < NUM_POWER_ITERS; iter++)
3554
{
3555
float r = xr * pCov[0] + xg * pCov[1] + xb * pCov[2];
3556
float g = xr * pCov[1] + xg * pCov[3] + xb * pCov[4];
3557
float b = xr * pCov[2] + xg * pCov[4] + xb * pCov[5];
3558
3559
float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b));
3560
3561
if (m >= 1e-10f)
3562
{
3563
m = 1.0f / m;
3564
3565
r *= m;
3566
g *= m;
3567
b *= m;
3568
}
3569
3570
xr = r;
3571
xg = g;
3572
xb = b;
3573
}
3574
3575
float len_sq = xr * xr + xg * xg + xb * xb;
3576
3577
if (len_sq < 1e-10f)
3578
{
3579
xr = grayscale_axis[0];
3580
xg = grayscale_axis[0];
3581
xb = grayscale_axis[0];
3582
}
3583
else
3584
{
3585
len_sq = 1.0f / sqrtf(len_sq);
3586
3587
xr *= len_sq;
3588
xg *= len_sq;
3589
xb *= len_sq;
3590
}
3591
3592
{
3593
// Transform the principle axis by the covariance matrix, which will scale the vector by its eigenvalue (the variance of the dataset projected onto the principle axis).
3594
float r = xr * pCov[0] + xg * pCov[1] + xb * pCov[2];
3595
float g = xr * pCov[1] + xg * pCov[3] + xb * pCov[4];
3596
float b = xr * pCov[2] + xg * pCov[4] + xb * pCov[5];
3597
3598
// Estimate the principle eigenvalue by computing the magnitude of the transformed vector.
3599
// The result is the variance along the principle axis.
3600
//float z1 = sqrtf(r * r + g * g + b * b); // this works with the principle axis
3601
//float z2 = r * xr + g * xg + b * xb; // compute length projected along xr,xg,xb
3602
3603
mode11_eigenvalue_est[part_index] = r * xr + g * xg + b * xb;
3604
}
3605
3606
{
3607
const float yrgb = grayscale_axis[0];
3608
3609
// Transform the grayscale axis by the covariance matrix, which will scale the vector by the eigenvalue (which is the variance of the dataset projected onto this vector).
3610
float r = yrgb * pCov[0] + yrgb * pCov[1] + yrgb * pCov[2];
3611
float g = yrgb * pCov[1] + yrgb * pCov[3] + yrgb * pCov[4];
3612
float b = yrgb * pCov[2] + yrgb * pCov[4] + yrgb * pCov[5];
3613
3614
mode7_eigenvalue_est[part_index] = r * yrgb + g * yrgb + b * yrgb;
3615
}
3616
3617
} // part_index
3618
3619
// Compute the total variance (squared error) of the other 2 axes by subtracting the total variance of all channels by the variance of the principle axis.
3620
// TODO: Could also compute the ratio of the principle axis's variance vs. the total variance.
3621
float mode11_total_sq_dist_to_line_alt = 0.0f;
3622
for (uint32_t part_index = 0; part_index < num_parts; part_index++)
3623
{
3624
float d = maximum(0.0f, total_variance[part_index] - mode11_eigenvalue_est[part_index]);
3625
mode11_total_sq_dist_to_line_alt += d;
3626
}
3627
3628
{
3629
#if 0
3630
// TODO: This total distance can be computed rapidly. First compute the total variance of each channel (sum the diag entries of the covar matrix),
3631
// then compute the principle eigenvalue, and subtract. The result is the variance of the projection distances.
3632
float total_sq_dist_to_line = 0.0f;
3633
for (uint32_t i = 0; i < BLOCK_T; i++)
3634
{
3635
const uint32_t part_index = (*pPat)[i];
3636
assert(part_index < num_parts);
3637
3638
total_sq_dist_to_line += dist_to_line_squared(pHalf_pixels_as_floats[i], part_means[part_index], part_axis[part_index]);
3639
}
3640
3641
mode11_candidates[examine_iter].m_total_sq_dist = total_sq_dist_to_line;
3642
#else
3643
mode11_candidates[examine_iter].m_total_sq_dist = mode11_total_sq_dist_to_line_alt;
3644
#endif
3645
mode11_candidates[examine_iter].m_index = unique_part_index;
3646
}
3647
3648
{
3649
float mode7_total_sq_dist_to_line_alt = 0.0f;
3650
for (uint32_t part_index = 0; part_index < num_parts; part_index++)
3651
{
3652
float d = maximum(0.0f, total_variance[part_index] - mode7_eigenvalue_est[part_index]);
3653
mode7_total_sq_dist_to_line_alt += d;
3654
}
3655
3656
mode7_candidates[examine_iter].m_total_sq_dist = mode7_total_sq_dist_to_line_alt;
3657
mode7_candidates[examine_iter].m_index = unique_part_index;
3658
}
3659
3660
} // examine_iter
3661
3662
std::sort(&mode11_candidates[0], &mode11_candidates[num_pats_to_examine]);
3663
std::sort(&mode7_candidates[0], &mode7_candidates[num_pats_to_examine]);
3664
3665
for (uint32_t i = 0; i < num_desired_pats; i++)
3666
pDesired_pat_indices_mode11[i] = mode11_candidates[i].m_index;
3667
3668
for (uint32_t i = 0; i < num_desired_pats; i++)
3669
pDesired_pat_indices_mode7[i] = mode7_candidates[i].m_index;
3670
}
3671
3672
static void estimate_partitions_mode7(
3673
uint32_t num_parts, // 2 or 3 partitions
3674
uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, // list of all unique, canonicalized patterns
3675
uint32_t num_pats_to_examine, const uint32_t* pUnique_pat_indices_to_examine, // indices of pats to examine
3676
const vec3F* pHalf_pixels_as_floats, // block's half pixel values casted to floats
3677
const astc_hdr_codec_base_options& coptions, // options
3678
uint32_t num_desired_pats, uint32_t* pDesired_pat_indices) // output indices
3679
{
3680
BASISU_NOTE_UNUSED(coptions);
3681
BASISU_NOTE_UNUSED(num_unique_pats);
3682
3683
const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = 6 * 6, MAX_PARTS = 3;
3684
assert(num_parts <= MAX_PARTS);
3685
3686
struct candidate_res
3687
{
3688
float m_total_sq_dist;
3689
uint32_t m_index;
3690
bool operator< (const candidate_res& rhs) const { return m_total_sq_dist < rhs.m_total_sq_dist; }
3691
};
3692
3693
const uint32_t MAX_CANDIDATES = 1024;
3694
assert(num_desired_pats && (num_desired_pats <= MAX_CANDIDATES));
3695
3696
candidate_res candidates[MAX_CANDIDATES];
3697
3698
for (uint32_t examine_iter = 0; examine_iter < num_pats_to_examine; examine_iter++)
3699
{
3700
const uint32_t unique_part_index = pUnique_pat_indices_to_examine[examine_iter];
3701
assert(unique_part_index < num_unique_pats);
3702
3703
const partition_pattern_vec* pPat = &pUnique_pats[unique_part_index];
3704
3705
vec3F part_means[MAX_PARTS];
3706
uint32_t part_total_texels[MAX_PARTS] = { 0 };
3707
3708
for (uint32_t i = 0; i < num_parts; i++)
3709
part_means[i].clear();
3710
3711
for (uint32_t y = 0; y < BLOCK_H; y++)
3712
{
3713
for (uint32_t x = 0; x < BLOCK_W; x++)
3714
{
3715
const uint32_t part_index = (*pPat)(x, y);
3716
assert(part_index < num_parts);
3717
3718
part_means[part_index] += pHalf_pixels_as_floats[x + y * BLOCK_W];
3719
part_total_texels[part_index]++;
3720
3721
} // x
3722
} // y
3723
3724
for (uint32_t i = 0; i < num_parts; i++)
3725
{
3726
assert(part_total_texels[i]);
3727
part_means[i] /= (float)part_total_texels[i];
3728
}
3729
3730
vec3F part_axis(0.5773502691f);
3731
3732
// TODO: This total distance can be computed rapidly. First compute the total variance of each channel (sum the diag entries of the covar matrix),
3733
// then compute the principle eigenvalue, and subtract. The result is the variance of the projection distances.
3734
float total_sq_dist_to_line = 0.0f;
3735
for (uint32_t i = 0; i < BLOCK_T; i++)
3736
{
3737
const uint32_t part_index = (*pPat)[i];
3738
assert(part_index < num_parts);
3739
3740
total_sq_dist_to_line += dist_to_line_squared(pHalf_pixels_as_floats[i], part_means[part_index], part_axis);
3741
}
3742
3743
candidates[examine_iter].m_total_sq_dist = total_sq_dist_to_line;
3744
3745
candidates[examine_iter].m_index = unique_part_index;
3746
3747
} // examine_iter
3748
3749
std::sort(&candidates[0], &candidates[num_pats_to_examine]);
3750
3751
for (uint32_t i = 0; i < num_desired_pats; i++)
3752
pDesired_pat_indices[i] = candidates[i].m_index;
3753
}
3754
3755
static float calc_deblocking_penalty_itp(
3756
uint32_t bx, uint32_t by, uint32_t width, uint32_t height,
3757
const imagef& pass_src_img_itp, const candidate_encoding& candidate)
3758
{
3759
float total_deblock_penalty = 0.0f;
3760
3761
float total_orig_mse = 0.0f, total_comp_mse = 0.0f;
3762
uint32_t total_c = 0;
3763
3764
for (uint32_t b = 0; b < 4; b++)
3765
{
3766
for (uint32_t i = 0; i < 6; i++)
3767
{
3768
int ox = 0, oy = 0, qx = 0, qy = 0;
3769
3770
switch (b)
3771
{
3772
case 0:
3773
ox = bx * 6 + i; oy = (by - 1) * 6 + 5;
3774
qx = bx * 6 + i; qy = by * 6;
3775
break;
3776
case 1:
3777
ox = bx * 6 + i; oy = (by + 1) * 6;
3778
qx = bx * 6 + i; qy = by * 6 + 5;
3779
break;
3780
case 2:
3781
ox = (bx - 1) * 6 + 5; oy = by * 6 + i;
3782
qx = bx * 6; qy = by * 6 + i;
3783
break;
3784
case 3:
3785
ox = (bx + 1) * 6; oy = by * 6 + i;
3786
qx = bx * 6 + 5; qy = by * 6 + i;
3787
break;
3788
}
3789
3790
if ((ox < 0) || (oy < 0) || (ox >= (int)width) || (oy >= (int)height))
3791
continue;
3792
3793
const vec3F& o_pixel_itp = pass_src_img_itp(ox, oy);
3794
const vec3F& q_pixel_itp = pass_src_img_itp(qx, qy);
3795
3796
const vec3F &d_pixel_itp = candidate.m_comp_pixels_itp[qy - by * 6][qx - bx * 6]; // compressed block
3797
3798
vec3F orig_delta_v(o_pixel_itp - q_pixel_itp);
3799
total_orig_mse += square(orig_delta_v[0]) + square(orig_delta_v[1]) + square(orig_delta_v[2]);
3800
3801
vec3F d_delta_v(o_pixel_itp - d_pixel_itp);
3802
total_comp_mse += square(d_delta_v[0]) + square(d_delta_v[1]) + square(d_delta_v[2]);
3803
3804
total_c++;
3805
}
3806
}
3807
3808
if (total_c)
3809
{
3810
total_orig_mse /= (float)total_c;
3811
total_comp_mse /= (float)total_c;
3812
3813
if (total_orig_mse)
3814
{
3815
total_deblock_penalty = fabsf((total_comp_mse - total_orig_mse) / total_orig_mse);
3816
}
3817
}
3818
3819
return total_deblock_penalty;
3820
}
3821
3822
static bool calc_strip_size(
3823
float lambda,
3824
uint32_t num_blocks_y, uint32_t total_threads, bool force_one_strip,
3825
uint32_t& res_total_strips, uint32_t& res_rows_per_strip, astc_hdr_6x6_global_config &global_cfg)
3826
{
3827
uint32_t total_strips = 1;
3828
3829
if (lambda == 0.0f)
3830
{
3831
if (!force_one_strip)
3832
{
3833
total_strips = total_threads;
3834
}
3835
}
3836
else
3837
{
3838
const uint32_t MIN_DESIRED_STRIPS = 8;
3839
const uint32_t MAX_TARGET_STRIPS = 32;
3840
const uint32_t TARGET_ASTC_6X6_ROWS_PER_STRIP = 12;
3841
3842
if (!force_one_strip)
3843
{
3844
total_strips = maximum<uint32_t>(1, num_blocks_y / TARGET_ASTC_6X6_ROWS_PER_STRIP);
3845
3846
if (num_blocks_y >= MIN_DESIRED_STRIPS * 2)
3847
total_strips = maximum(total_strips, MIN_DESIRED_STRIPS);
3848
}
3849
3850
total_strips = minimum(total_strips, MAX_TARGET_STRIPS);
3851
}
3852
3853
uint32_t rows_per_strip = 0;
3854
if (total_strips <= 1)
3855
{
3856
rows_per_strip = num_blocks_y;
3857
}
3858
else
3859
{
3860
rows_per_strip = (num_blocks_y / total_strips) & ~1;
3861
3862
if (rows_per_strip < 2)
3863
rows_per_strip = 2;// num_blocks_y;
3864
}
3865
3866
assert((rows_per_strip == num_blocks_y) || ((rows_per_strip & 1) == 0));
3867
3868
total_strips = (num_blocks_y + rows_per_strip - 1) / rows_per_strip;
3869
3870
if (global_cfg.m_debug_output)
3871
{
3872
fmt_printf("num_blocks_y: {}, total_threads : {}, Total strips : {}\n", num_blocks_y, total_threads, total_strips);
3873
fmt_printf("ASTC 6x6 block rows per strip: {}\n", rows_per_strip);
3874
fmt_printf("ASTC 6x6 block rows on final strip: {}\n", num_blocks_y - (total_strips - 1) * rows_per_strip);
3875
}
3876
3877
uint32_t total_rows = 0;
3878
for (uint32_t strip_index = 0; strip_index < total_strips; strip_index++)
3879
{
3880
uint32_t strip_first_by = strip_index * rows_per_strip;
3881
uint32_t strip_last_by = minimum<uint32_t>(strip_first_by + rows_per_strip - 1, num_blocks_y);
3882
3883
if (strip_index == (total_strips - 1))
3884
strip_last_by = num_blocks_y - 1;
3885
3886
uint32_t num_strip_block_rows = (strip_last_by - strip_first_by) + 1;
3887
total_rows += num_strip_block_rows;
3888
3889
if (global_cfg.m_debug_output)
3890
fmt_printf("Strip row: {}, total block rows: {}\n", strip_index, num_strip_block_rows);
3891
}
3892
3893
if (total_rows != num_blocks_y)
3894
{
3895
fmt_error_printf("Strip calc failed\n");
3896
return false;
3897
}
3898
3899
res_total_strips = total_strips;
3900
res_rows_per_strip = rows_per_strip;
3901
3902
return true;
3903
}
3904
3905
static void convet_rgb_image_to_itp(const imagef &src_img, imagef &dst_img, const astc_hdr_6x6_global_config& cfg)
3906
{
3907
const uint32_t width = src_img.get_width(), height = src_img.get_height();
3908
3909
dst_img.resize(width, height);
3910
3911
for (uint32_t y = 0; y < height; y++)
3912
{
3913
for (uint32_t x = 0; x < width; x++)
3914
{
3915
vec3F src_rgb(src_img(x, y));
3916
3917
vec3F src_itp;
3918
linear_rgb_to_itp(src_rgb, src_itp, cfg);
3919
3920
dst_img(x, y) = src_itp;
3921
}
3922
}
3923
}
3924
3925
// Block footprint in texels, and the resulting texel count per block.
const uint32_t BLOCK_W = 6;
const uint32_t BLOCK_H = 6;
const uint32_t NUM_BLOCK_PIXELS = BLOCK_W * BLOCK_H;

// Encoder tuning constants.
// NOTE(review): the consumers of these values are outside this part of the file; the groupings below
// follow the names only - confirm against the code that reads them.

// Penalties by encoding kind (solid / reuse / run).
const float SOLID_PENALTY = 4.0f;
const float REUSE_PENALTY = 1.0f;
const float RUN_PENALTY = 10.0f;

// Error metric weights.
const float MSE_WEIGHT = 300000.0f;
const float SSIM_WEIGHT = 200.0f;

const float TWO_LEVEL_PENALTY = 1.425f;

// Thresholds for switching to Gaussian-filtered input.
const float SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM = 0.04f;
const float SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM = 0.04f;

// MSE penalties by weight grid size for complex blocks.
const float COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY = 1.5f;
const float COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY = 1.25f;
const float COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY = 1.15f;
3940
3941
struct uastc_hdr_6x6_debug_state
3942
{
3943
uint32_t m_encoding_type_hist[(uint32_t)encoding_type::cTotal] = { 0 };
3944
uint32_t m_endpoint_mode_hist[(uint32_t)endpoint_mode::cTotal] = { 0 };
3945
uint32_t m_block_mode_hist[(uint32_t)block_mode::cBMTotalModes] = { 0 };
3946
uint64_t m_block_mode_total_bits[(uint32_t)block_mode::cBMTotalModes] = { 0 };
3947
3948
basisu::vector< basisu::stats<float> > m_block_mode_comp_stats[(uint32_t)block_mode::cBMTotalModes][3];
3949
basisu::vector< basisu::comparative_stats<float> > m_block_mode_comparative_stats[(uint32_t)block_mode::cBMTotalModes][3];
3950
3951
std::atomic<uint32_t> m_total_gaussian1_blocks;
3952
std::atomic<uint32_t> m_total_gaussian2_blocks;
3953
std::atomic<uint32_t> m_total_filter_horizontal;
3954
std::atomic<uint32_t> m_detail_stats[5];
3955
std::atomic<uint32_t> m_total_mode7_skips;
3956
3957
std::atomic<uint32_t> m_total_blocks_compressed;
3958
3959
std::atomic<uint32_t> m_total_candidates_considered;
3960
std::atomic<uint32_t> m_max_candidates_considered;
3961
3962
std::atomic<uint32_t> m_total_part2_stats[4];
3963
std::atomic<uint32_t> m_dp_stats[5];
3964
3965
std::atomic<uint32_t> m_reuse_num_parts[4];
3966
std::atomic<uint32_t> m_reuse_total_dp;
3967
3968
imagef m_stat_vis;
3969
std::mutex m_stat_vis_mutex;
3970
3971
image m_part_vis;
3972
image m_mode_vis;
3973
image m_mode_vis2;
3974
image m_grid_vis;
3975
image m_enc_vis;
3976
std::mutex m_vis_image_mutex;
3977
3978
std::atomic<uint32_t> m_comp_level_hist[ASTC_HDR_6X6_MAX_COMP_LEVEL + 1];
3979
3980
std::atomic<uint32_t> m_total_jnd_replacements;
3981
3982
std::mutex m_stats_mutex;
3983
3984
uastc_hdr_6x6_debug_state()
3985
{
3986
for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)
3987
{
3988
for (uint32_t j = 0; j < 3; j++)
3989
{
3990
m_block_mode_comp_stats[i][j].reserve(512);
3991
m_block_mode_comparative_stats[i][j].reserve(512);
3992
}
3993
}
3994
}
3995
3996
void init(uint32_t width, uint32_t height)
3997
{
3998
m_stat_vis.resize(width, height);
3999
m_part_vis.resize(width, height);
4000
m_mode_vis.resize(width, height);
4001
m_mode_vis2.resize(width, height);
4002
m_grid_vis.resize(width, height);
4003
m_enc_vis.resize(width, height);
4004
4005
basisu::clear_obj(m_encoding_type_hist);
4006
basisu::clear_obj(m_endpoint_mode_hist);
4007
basisu::clear_obj(m_block_mode_hist);
4008
basisu::clear_obj(m_block_mode_total_bits);
4009
4010
for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)
4011
{
4012
for (uint32_t j = 0; j < 3; j++)
4013
{
4014
m_block_mode_comp_stats[i][j].clear();
4015
m_block_mode_comparative_stats[i][j].clear();
4016
}
4017
}
4018
4019
m_total_gaussian1_blocks.store(0);
4020
m_total_gaussian2_blocks.store(0);
4021
m_total_filter_horizontal.store(0);
4022
for (uint32_t i = 0; i < std::size(m_detail_stats); i++)
4023
m_detail_stats[i].store(0);
4024
m_total_mode7_skips.store(0);
4025
4026
for (uint32_t i = 0; i < std::size(m_comp_level_hist); i++)
4027
m_comp_level_hist[i].store(0);
4028
4029
m_total_blocks_compressed.store(0);
4030
4031
m_total_candidates_considered.store(0);
4032
m_max_candidates_considered.store(0);
4033
4034
for (uint32_t i = 0; i < std::size(m_total_part2_stats); i++)
4035
m_total_part2_stats[i].store(0);
4036
4037
for (uint32_t i = 0; i < std::size(m_dp_stats); i++)
4038
m_dp_stats[i].store(0);
4039
4040
for (uint32_t i = 0; i < std::size(m_reuse_num_parts); i++)
4041
m_reuse_num_parts[i] .store(0);
4042
4043
m_reuse_total_dp.store(0);
4044
4045
m_total_jnd_replacements.store(0);
4046
}
4047
4048
void print(uint32_t total_blocks) const
4049
{
4050
fmt_printf("Total blocks: {}\n", total_blocks);
4051
fmt_printf("Total JND replacements: {} {3.2}%\n", m_total_jnd_replacements, (float)m_total_jnd_replacements * 100.0f / (float)total_blocks);
4052
fmt_printf("Comp level histogram: {} {} {} {} {}\n", m_comp_level_hist[0], m_comp_level_hist[1], m_comp_level_hist[2], m_comp_level_hist[3], m_comp_level_hist[4]);
4053
fmt_printf("Total gaussian 1 blocks: {} {3.2}%\n", m_total_gaussian1_blocks, (float)m_total_gaussian1_blocks * 100.0f / (float)total_blocks);
4054
fmt_printf("Total gaussian 2 blocks: {} {3.2}%\n", m_total_gaussian2_blocks, (float)m_total_gaussian2_blocks * 100.0f / (float)total_blocks);
4055
fmt_printf("Total filter horizontal: {} {3.2}%\n", m_total_filter_horizontal, (float)m_total_filter_horizontal * 100.0f / (float)total_blocks);
4056
fmt_printf("Detail stats: Detailed block low grid skip: {}, Blurry block skip: {}, Very blurry block skip: {}, NH:{} H:{}\n", m_detail_stats[0], m_detail_stats[1], m_detail_stats[2], m_detail_stats[3], m_detail_stats[4]);
4057
fmt_printf("Total mode7 skips: {}\n", m_total_mode7_skips);
4058
4059
fmt_printf("Total candidates: {}, {} avg per block\n", m_total_candidates_considered, (float)m_total_candidates_considered / (float)total_blocks);
4060
fmt_printf("Max ever candidates: {}\n", m_max_candidates_considered);
4061
4062
fmt_printf("Part2/3 stats: {} {} {} {}\n", m_total_part2_stats[0], m_total_part2_stats[1], m_total_part2_stats[2], m_total_part2_stats[3]);
4063
fmt_printf("Dual plane stats: {} {} {} {} {}\n", m_dp_stats[0], m_dp_stats[1], m_dp_stats[2], m_dp_stats[3], m_dp_stats[4]);
4064
fmt_printf("Reuse total dual plane: {}\n", m_reuse_total_dp);
4065
fmt_printf("Reuse part stats: {} {} {}\n", m_reuse_num_parts[1], m_reuse_num_parts[2], m_reuse_num_parts[3]);
4066
4067
fmt_printf("\nEncoding type histogram:\n");
4068
for (uint32_t i = 0; i < std::size(m_encoding_type_hist); i++)
4069
fmt_printf("{}: {}\n", i, m_encoding_type_hist[i]);
4070
4071
fmt_printf("\nEndpoint mode histogram:\n");
4072
for (uint32_t i = 0; i < std::size(m_endpoint_mode_hist); i++)
4073
fmt_printf("{}: {}\n", i, m_endpoint_mode_hist[i]);
4074
4075
fmt_printf("\nBlock mode histogram:\n");
4076
4077
uint32_t total_dp = 0, total_sp = 0;
4078
uint32_t total_mode11 = 0, total_mode7 = 0;
4079
uint32_t part_hist[3] = { 0 };
4080
uint32_t part2_mode7_total = 0, part2_mode11_total = 0;
4081
uint32_t total_used_modes = 0;
4082
for (uint32_t i = 0; i < std::size(m_block_mode_hist); i++)
4083
{
4084
const auto& bm_desc = g_block_mode_descs[i];
4085
4086
const uint32_t total_uses = m_block_mode_hist[i];
4087
4088
if (bm_desc.m_dp)
4089
total_dp += total_uses;
4090
else
4091
total_sp += total_uses;
4092
4093
if (bm_desc.m_cem == 7)
4094
total_mode7 += total_uses;
4095
else
4096
total_mode11 += total_uses;
4097
4098
part_hist[bm_desc.m_num_partitions - 1] += total_uses;
4099
4100
if (bm_desc.m_num_partitions == 2)
4101
{
4102
if (bm_desc.m_cem == 7)
4103
part2_mode7_total += total_uses;
4104
else
4105
{
4106
assert(bm_desc.m_cem == 11);
4107
part2_mode11_total += total_uses;
4108
}
4109
}
4110
4111
float avg_std_dev = 0.0f;
4112
float avg_cross_correlations[3] = { 0 };
4113
4114
if (m_block_mode_comp_stats[i][0].size())
4115
{
4116
const uint32_t num_uses = m_block_mode_comp_stats[i][0].size_u32();
4117
4118
for (uint32_t j = 0; j < num_uses; j++)
4119
avg_std_dev += (float)maximum(m_block_mode_comp_stats[i][0][j].m_std_dev, m_block_mode_comp_stats[i][1][j].m_std_dev, m_block_mode_comp_stats[i][2][j].m_std_dev);
4120
avg_std_dev /= (float)num_uses;
4121
4122
for (uint32_t j = 0; j < num_uses; j++)
4123
{
4124
avg_cross_correlations[0] += fabsf((float)m_block_mode_comparative_stats[i][0][j].m_pearson);
4125
avg_cross_correlations[1] += fabsf((float)m_block_mode_comparative_stats[i][1][j].m_pearson);
4126
avg_cross_correlations[2] += fabsf((float)m_block_mode_comparative_stats[i][2][j].m_pearson);
4127
}
4128
4129
avg_cross_correlations[0] /= (float)num_uses;
4130
avg_cross_correlations[1] /= (float)num_uses;
4131
avg_cross_correlations[2] /= (float)num_uses;
4132
}
4133
4134
fmt_printf("{ 2}: uses: { 6}, cem: {}, dp: {} chan: {}, parts: {}, grid: {}x{}, endpoint levels: {}, weight levels: {}, Avg bits: {}, Avg Max Std Dev: {}, RG: {} RB: {} GB: {}\n", i, total_uses,
4135
bm_desc.m_cem,
4136
bm_desc.m_dp, bm_desc.m_dp_channel,
4137
bm_desc.m_num_partitions,
4138
bm_desc.m_grid_x, bm_desc.m_grid_y,
4139
astc_helpers::get_ise_levels(bm_desc.m_endpoint_ise_range),
4140
astc_helpers::get_ise_levels(bm_desc.m_weight_ise_range),
4141
total_uses ? ((double)m_block_mode_total_bits[i] / total_uses) : 0.0f,
4142
avg_std_dev, avg_cross_correlations[0], avg_cross_correlations[1], avg_cross_correlations[2]);
4143
4144
if (total_uses)
4145
total_used_modes++;
4146
}
4147
4148
fmt_printf("Total used modes: {}\n", total_used_modes);
4149
4150
fmt_printf("Total single plane: {}, total dual plane: {}\n", total_sp, total_dp);
4151
fmt_printf("Total mode 11: {}, mode 7: {}\n", total_mode11, total_mode7);
4152
fmt_printf("Partition histogram: {} {} {}\n", part_hist[0], part_hist[1], part_hist[2]);
4153
fmt_printf("2 subset mode 7 uses: {}, mode 11 uses: {}\n", part2_mode7_total, part2_mode11_total);
4154
}
4155
};
4156
4157
struct uastc_hdr_6x6_encode_state
4158
{
4159
astc_hdr_codec_base_options master_coptions;
4160
4161
imagef src_img;
4162
4163
imagef src_img_filtered1;
4164
imagef src_img_filtered2;
4165
4166
imagef src_img_itp;
4167
imagef src_img_filtered1_itp;
4168
imagef src_img_filtered2_itp;
4169
4170
vector2D<float> smooth_block_mse_scales;
4171
4172
imagef packed_img;
4173
4174
basisu::vector<bitwise_coder> strip_bits;
4175
4176
basisu::vector2D<astc_helpers::astc_block> final_astc_blocks;
4177
4178
vector2D<candidate_encoding> coded_blocks;
4179
};
4180
4181
static bool compress_strip_task(
4182
uint32_t strip_index, uint32_t total_strips, uint32_t strip_first_by, uint32_t strip_last_by,
4183
uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t total_blocks, uint32_t width, uint32_t height,
4184
astc_hdr_6x6_global_config &global_cfg, uastc_hdr_6x6_debug_state &debug_state, uastc_hdr_6x6_encode_state &enc_state)
4185
{
4186
BASISU_NOTE_UNUSED(num_blocks_y);
4187
BASISU_NOTE_UNUSED(total_strips);
4188
4189
vec3F prev_comp_pixels[BLOCK_H][BLOCK_W]; // [y][x]
4190
basisu::clear_obj(prev_comp_pixels);
4191
4192
uint32_t prev_run_len = 0;
4193
4194
bitwise_coder prev_encoding;
4195
candidate_encoding prev_candidate_encoding; // the previous candidate written, which may have been a run extension
4196
candidate_encoding prev_non_run_candidate_encoding; // the previous *non-run* candidate written
4197
4198
bitwise_coder& strip_coded_bits = enc_state.strip_bits[strip_index];
4199
4200
const uint32_t CANDIDATES_TO_RESERVE = 1536;
4201
4202
basisu::vector<candidate_encoding> candidates;
4203
candidates.reserve(CANDIDATES_TO_RESERVE);
4204
4205
for (uint32_t by = strip_first_by; by <= strip_last_by; by++)
4206
{
4207
const bool has_upper_neighbor = by > strip_first_by;
4208
4209
for (uint32_t bx = 0; bx < num_blocks_x; bx++)
4210
{
4211
//if ((bx == 1) && (by == 2))
4212
// basisu::fmt_printf("!");
4213
4214
for (uint32_t outer_pass = 0; outer_pass < 3; outer_pass++)
4215
{
4216
const bool has_left_neighbor = bx > 0;
4217
//const bool has_prev = has_left_neighbor || has_upper_neighbor;
4218
4219
// Select either the original source image, or the Gaussian filtered version.
4220
// From here the encoder *must* use these 2 sources.
4221
const imagef& pass_src_img = (outer_pass == 2) ? enc_state.src_img_filtered2 :
4222
((outer_pass == 1) ? enc_state.src_img_filtered1 : enc_state.src_img);
4223
4224
const imagef& pass_src_img_itp = (outer_pass == 2) ? enc_state.src_img_filtered2_itp :
4225
((outer_pass == 1) ? enc_state.src_img_filtered1_itp : enc_state.src_img_itp);
4226
4227
// Extract source image block
4228
vec4F block_pixels[BLOCK_H][BLOCK_W]; // [y][x]
4229
pass_src_img.extract_block_clamped(&block_pixels[0][0], bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H);
4230
4231
vec4F block_pixels_itp[BLOCK_H][BLOCK_W]; // [y][x]
4232
pass_src_img_itp.extract_block_clamped(&block_pixels_itp[0][0], bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H);
4233
4234
half_vec3 half_pixels[BLOCK_H][BLOCK_W]; // [y][x] half-float values
4235
vec3F half_pixels_as_floats[BLOCK_H][BLOCK_W]; // [y][x] half float values, integer bits as floats
4236
vec4F block_pixels_q16[BLOCK_H][BLOCK_W]; // [y][x], q16 space for low-level ASTC encoding
4237
vec3F block_pixels_as_itp[BLOCK_H][BLOCK_W]; // [y][x] input converted to itp space, for faster error calculations
4238
4239
bool is_grayscale = true;
4240
4241
candidates.resize(0);
4242
4243
float block_ly = BIG_FLOAT_VAL, block_hy = 0.0f, block_avg_y = 0.0f;
4244
4245
for (uint32_t y = 0; y < BLOCK_H; y++)
4246
{
4247
for (uint32_t x = 0; x < BLOCK_W; x++)
4248
{
4249
vec3F rgb_input;
4250
4251
for (uint32_t c = 0; c < 3; c++)
4252
{
4253
float v = block_pixels[y][x][c];
4254
4255
rgb_input[c] = v;
4256
4257
const basist::half_float h = basisu::fast_float_to_half_no_clamp_neg_nan_or_inf(v);
4258
assert(h == basist::float_to_half(v));
4259
4260
half_pixels[y][x][c] = h;
4261
4262
block_pixels_q16[y][x][c] = (float)half_to_qlog16(h);
4263
4264
half_pixels_as_floats[y][x][c] = (float)h;
4265
4266
} // c
4267
4268
float py = rgb_input.dot(vec3F(REC_709_R, REC_709_G, REC_709_B));
4269
if (py < block_ly)
4270
block_ly = py;
4271
if (py > block_hy)
4272
block_hy = py;
4273
block_avg_y += py;
4274
4275
//linear_rgb_to_itp(rgb_input, block_pixels_as_itp[y][x]);
4276
4277
block_pixels_as_itp[y][x] = block_pixels_itp[y][x];
4278
4279
block_pixels_q16[y][x][3] = 0.0f;
4280
4281
if ((half_pixels[y][x][0] != half_pixels[y][x][1]) || (half_pixels[y][x][0] != half_pixels[y][x][2]))
4282
is_grayscale = false;
4283
4284
} // x
4285
} // y
4286
4287
block_avg_y *= (1.0f / (float)NUM_BLOCK_PIXELS);
4288
4289
encode_astc_block_stats enc_block_stats;
4290
enc_block_stats.init(NUM_BLOCK_PIXELS, &block_pixels_q16[0][0]);
4291
4292
vec4F x_filtered[6][6], y_filtered[6][6];
4293
4294
filter_block(3, 6, (vec4F*)block_pixels, (vec4F*)x_filtered); // filter rows (horizontal)
4295
filter_block(6, 3, (vec4F*)block_pixels, (vec4F*)y_filtered); // filter cols (vertically)
4296
4297
const float filtered_x_err = diff_blocks((vec4F*)block_pixels, (vec4F*)x_filtered);
4298
const float filtered_y_err = diff_blocks((vec4F*)block_pixels, (vec4F*)y_filtered);
4299
const bool filter_horizontally = filtered_x_err < filtered_y_err;
4300
4301
//const float block_mag_gradient_mag = block_max_gradient_mag(bx, by);
4302
4303
if (filter_horizontally)
4304
debug_state.m_total_filter_horizontal.fetch_add(1, std::memory_order_relaxed);
4305
4306
vec3F lowpass_filtered[6][6];
4307
filter_block(3, 3, &half_pixels_as_floats[0][0], &lowpass_filtered[0][0]);
4308
float lowpass_std_dev = sub_and_compute_std_dev(&lowpass_filtered[0][0], &half_pixels_as_floats[0][0]);
4309
4310
const bool very_detailed_block = lowpass_std_dev > 350.0f;
4311
const bool very_blurry_block = lowpass_std_dev < 30.0f;
4312
const bool super_blurry_block = lowpass_std_dev < 15.0f;
4313
4314
basisu::stats<float> half_comp_stats[3];
4315
for (uint32_t c = 0; c < 3; c++)
4316
half_comp_stats[c].calc(NUM_BLOCK_PIXELS, &half_pixels_as_floats[0][0][c], 3);
4317
4318
const float SINGLE_PART_HALF_THRESH = 256.0f;
4319
const float COMPLEX_HALF_THRESH = 1024.0f;
4320
// HACK HACK
4321
const float VERY_COMPLEX_HALF_THRESH = 1400.0f; // 1536.0f;
4322
4323
const float max_std_dev = (float)maximum(half_comp_stats[0].m_std_dev, half_comp_stats[1].m_std_dev, half_comp_stats[2].m_std_dev);
4324
4325
const bool very_simple_block = (max_std_dev < SINGLE_PART_HALF_THRESH);
4326
const bool complex_block = (max_std_dev > COMPLEX_HALF_THRESH);
4327
const bool very_complex_block = (max_std_dev > VERY_COMPLEX_HALF_THRESH);
4328
4329
// Dynamically choose a comp_level for this block.
4330
astc_hdr_codec_base_options coptions(enc_state.master_coptions);
4331
uint32_t comp_level = global_cfg.m_master_comp_level;
4332
4333
if (very_complex_block)
4334
comp_level = global_cfg.m_highest_comp_level;
4335
else if (complex_block)
4336
comp_level = (global_cfg.m_master_comp_level + global_cfg.m_highest_comp_level + 1) / 2;
4337
4338
debug_state.m_comp_level_hist[comp_level].fetch_add(1, std::memory_order_relaxed);
4339
4340
bool any_2subset_enabled = false, any_2subset_mode11_enabled = false, any_2subset_mode7_enabled = false, any_3subset_enabled = false;
4341
BASISU_NOTE_UNUSED(any_2subset_mode11_enabled);
4342
4343
for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)
4344
{
4345
if (comp_level == 0)
4346
{
4347
if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL0) == 0)
4348
continue;
4349
}
4350
else if (comp_level == 1)
4351
{
4352
if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL1) == 0)
4353
continue;
4354
}
4355
else if (comp_level == 2)
4356
{
4357
if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL2) == 0)
4358
continue;
4359
}
4360
4361
if (g_block_mode_descs[i].m_num_partitions == 2)
4362
{
4363
any_2subset_enabled = true;
4364
4365
if (g_block_mode_descs[i].m_cem == 7)
4366
{
4367
any_2subset_mode7_enabled = true;
4368
}
4369
else
4370
{
4371
assert(g_block_mode_descs[i].m_cem == 11);
4372
any_2subset_mode11_enabled = true;
4373
}
4374
}
4375
else if (g_block_mode_descs[i].m_num_partitions == 3)
4376
any_3subset_enabled = true;
4377
}
4378
4379
coptions.m_mode7_full_s_optimization = (comp_level >= 2);
4380
4381
const bool uber_mode_flag = (comp_level >= 3);
4382
coptions.m_allow_uber_mode = uber_mode_flag;
4383
4384
coptions.m_ultra_quant = (comp_level >= 4);
4385
4386
coptions.m_take_first_non_clamping_mode11_submode = (comp_level <= 2);
4387
coptions.m_take_first_non_clamping_mode7_submode = (comp_level <= 2);
4388
4389
coptions.m_disable_weight_plane_optimization = (comp_level >= 2);
4390
4391
// -------------------
4392
4393
uint32_t total_used_block_chans = 0;
4394
for (uint32_t i = 0; i < 3; i++)
4395
total_used_block_chans += (half_comp_stats[i].m_range > 0.0f);
4396
4397
const bool is_solid_block = (total_used_block_chans == 0);
4398
4399
basisu::comparative_stats<float> half_cross_chan_stats[3];
4400
4401
// R vs. G
4402
half_cross_chan_stats[0].calc_pearson(NUM_BLOCK_PIXELS,
4403
&half_pixels_as_floats[0][0][0], &half_pixels_as_floats[0][0][1],
4404
3, 3,
4405
&half_comp_stats[0], &half_comp_stats[1]);
4406
4407
// R vs. B
4408
half_cross_chan_stats[1].calc_pearson(NUM_BLOCK_PIXELS,
4409
&half_pixels_as_floats[0][0][0], &half_pixels_as_floats[0][0][2],
4410
3, 3,
4411
&half_comp_stats[0], &half_comp_stats[2]);
4412
4413
// G vs. B
4414
half_cross_chan_stats[2].calc_pearson(NUM_BLOCK_PIXELS,
4415
&half_pixels_as_floats[0][0][1], &half_pixels_as_floats[0][0][2],
4416
3, 3,
4417
&half_comp_stats[1], &half_comp_stats[2]);
4418
4419
const float rg_corr = fabsf((float)half_cross_chan_stats[0].m_pearson);
4420
const float rb_corr = fabsf((float)half_cross_chan_stats[1].m_pearson);
4421
const float gb_corr = fabsf((float)half_cross_chan_stats[2].m_pearson);
4422
4423
float min_corr = BIG_FLOAT_VAL, max_corr = -BIG_FLOAT_VAL;
4424
for (uint32_t i = 0; i < 3; i++)
4425
{
4426
#if 0
4427
// 9/5/2025, wrong metric, we're iterating channels pairs here, not individual channels.
4428
// On 3 active channel blocks this causes no difference.
4429
if (half_comp_stats[i].m_range > 0.0f)
4430
#else
4431
static const uint8_t s_chan_pairs[3][2] = { {0, 1}, {0, 2}, {1, 2} };
4432
4433
const uint32_t chanA = s_chan_pairs[i][0];
4434
const uint32_t chanB = s_chan_pairs[i][1];
4435
4436
if ((half_comp_stats[chanA].m_range > 0.0f) && (half_comp_stats[chanB].m_range > 0.0f))
4437
#endif
4438
{
4439
const float c = fabsf((float)half_cross_chan_stats[i].m_pearson);
4440
min_corr = minimum(min_corr, c);
4441
max_corr = maximum(max_corr, c);
4442
}
4443
}
4444
4445
bool use_single_subset_mode7 = true;
4446
if (comp_level <= 1)
4447
{
4448
// TODO: could also compute angle between principle axis and the grayscale axis.
4449
// TODO: Transform grayscale axis by covar matrix, compute variance vs. total variance
4450
const float MODE7_MIN_CHAN_CORR = .5f;
4451
const float MODE7_PCA_ANGLE_THRESH = .9f;
4452
use_single_subset_mode7 = is_grayscale || is_solid_block || ((total_used_block_chans == 1) || (min_corr >= MODE7_MIN_CHAN_CORR));
4453
4454
if (use_single_subset_mode7)
4455
{
4456
float cos_ang = fabsf(enc_block_stats.m_axis_q16.dot(vec3F(0.5773502691f)));
4457
if (cos_ang < MODE7_PCA_ANGLE_THRESH)
4458
use_single_subset_mode7 = false;
4459
}
4460
}
4461
4462
const float STRONG_CORR_THRESH = (comp_level <= 1) ? .5f : ((comp_level <= 3) ? .75f : .9f);
4463
4464
int desired_dp_chan = -1;
4465
if (total_used_block_chans <= 1)
4466
{
4467
// no need for dual plane (except possibly 2x2 weight grids for RDO)
4468
}
4469
else
4470
{
4471
if (min_corr >= STRONG_CORR_THRESH)
4472
{
4473
// all channel pairs strongly correlated, no need for dual plane
4474
debug_state.m_dp_stats[0].fetch_add(1, std::memory_order_relaxed);
4475
}
4476
else
4477
{
4478
if (total_used_block_chans == 2)
4479
{
4480
if (half_comp_stats[0].m_range == 0.0f)
4481
{
4482
// r unused, check for strong gb correlation
4483
if (gb_corr < STRONG_CORR_THRESH)
4484
desired_dp_chan = 1;
4485
}
4486
else if (half_comp_stats[1].m_range == 0.0f)
4487
{
4488
// g unused, check for strong rb correlation
4489
if (rb_corr < STRONG_CORR_THRESH)
4490
desired_dp_chan = 0;
4491
}
4492
else
4493
{
4494
// b unused, check for strong rg correlation
4495
if (rg_corr < STRONG_CORR_THRESH)
4496
desired_dp_chan = 0;
4497
}
4498
}
4499
else
4500
{
4501
assert(total_used_block_chans == 3);
4502
4503
// see if rg/rb is weakly correlated vs. gb
4504
if ((rg_corr < gb_corr) && (rb_corr < gb_corr))
4505
desired_dp_chan = 0;
4506
// see if gr/gb is weakly correlated vs. rb
4507
else if ((rg_corr < rb_corr) && (gb_corr < rb_corr))
4508
desired_dp_chan = 1;
4509
// assume b is weakest
4510
else
4511
desired_dp_chan = 2;
4512
}
4513
4514
if (desired_dp_chan == -1)
4515
debug_state.m_dp_stats[1].fetch_add(1, std::memory_order_relaxed);
4516
else
4517
debug_state.m_dp_stats[2 + desired_dp_chan].fetch_add(1, std::memory_order_relaxed);
4518
}
4519
}
4520
4521
// 2x2 is special for RDO at higher lambdas - always pick a preferred channel.
4522
int desired_dp_chan_2x2 = 0;
4523
if (total_used_block_chans == 2)
4524
{
4525
if (half_comp_stats[0].m_range == 0.0f)
4526
desired_dp_chan_2x2 = 1;
4527
}
4528
else if (total_used_block_chans == 3)
4529
{
4530
// see if rg/rb is weakly correlated vs. gb
4531
if ((rg_corr < gb_corr) && (rb_corr < gb_corr))
4532
desired_dp_chan_2x2 = 0;
4533
// see if gr/gb is weakly correlated vs. rb
4534
else if ((rg_corr < rb_corr) && (gb_corr < rb_corr))
4535
desired_dp_chan_2x2 = 1;
4536
// assume b is weakest
4537
else
4538
desired_dp_chan_2x2 = 2;
4539
}
4540
4541
// Gather all candidate encodings
4542
bool status = false;
4543
4544
// ---- Run candidate
4545
if ((global_cfg.m_use_runs) && (has_left_neighbor || has_upper_neighbor))
4546
{
4547
candidate_encoding candidate;
4548
candidate.m_coder.reserve(24);
4549
4550
candidate.m_encoding_type = encoding_type::cRun;
4551
4552
candidate.m_decomp_log_blk = prev_non_run_candidate_encoding.m_decomp_log_blk;
4553
candidate.m_coded_log_blk = prev_non_run_candidate_encoding.m_coded_log_blk;
4554
4555
memcpy(candidate.m_comp_pixels, prev_comp_pixels, sizeof(prev_comp_pixels));
4556
4557
if (!prev_run_len)
4558
{
4559
candidate.m_coder.put_bits(RUN_CODE, RUN_CODE_LEN);
4560
candidate.m_coder.put_vlc(0, 5);
4561
}
4562
else
4563
{
4564
// extend current run - compute the # of new bits needed for the extension.
4565
4566
uint32_t prev_run_bits = prev_encoding.get_total_bits_u32();
4567
assert(prev_run_bits > 0);
4568
4569
// We're not actually going to code this, because the previously emitted run code will be extended.
4570
bitwise_coder temp_coder;
4571
temp_coder.put_bits(RUN_CODE, RUN_CODE_LEN);
4572
temp_coder.put_vlc((prev_run_len + 1) - 1, 5);
4573
4574
uint32_t cur_run_bits = temp_coder.get_total_bits_u32();
4575
assert(cur_run_bits >= prev_run_bits);
4576
4577
uint32_t total_new_bits = cur_run_bits - prev_run_bits;
4578
if (total_new_bits > 0)
4579
candidate.m_coder.put_bits(0, total_new_bits); // dummy bits
4580
}
4581
4582
candidate.m_run_len = prev_run_len + 1;
4583
4584
candidates.emplace_back(std::move(candidate));
4585
}
4586
4587
// ---- Reuse candidate
4588
if ((!is_solid_block) && (global_cfg.m_lambda > 0.0f))
4589
{
4590
for (uint32_t reuse_delta_index = 0; reuse_delta_index < global_cfg.m_num_reuse_xy_deltas; reuse_delta_index++)
4591
{
4592
const int reuse_delta_x = g_reuse_xy_deltas[reuse_delta_index].m_x;
4593
const int reuse_delta_y = g_reuse_xy_deltas[reuse_delta_index].m_y;
4594
4595
const int reuse_bx = bx + reuse_delta_x, reuse_by = by + reuse_delta_y;
4596
if ((reuse_bx < 0) || (reuse_bx >= (int)num_blocks_x))
4597
continue;
4598
if (reuse_by < (int)strip_first_by)
4599
break;
4600
4601
const candidate_encoding& prev_candidate = enc_state.coded_blocks(reuse_bx, reuse_by);
4602
4603
// TODO - support this.
4604
if (prev_candidate.m_encoding_type == encoding_type::cSolid)
4605
continue;
4606
assert((prev_candidate.m_encoding_type == encoding_type::cBlock) || (prev_candidate.m_encoding_type == encoding_type::cReuse));
4607
4608
candidate_encoding candidate;
4609
candidate.m_coder.reserve(24);
4610
astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
4611
astc_helpers::log_astc_block& decomp_log_blk = candidate.m_decomp_log_blk;
4612
4613
const astc_helpers::log_astc_block& prev_coded_log_blk = prev_candidate.m_coded_log_blk;
4614
4615
const uint32_t grid_x = prev_coded_log_blk.m_grid_width, grid_y = prev_coded_log_blk.m_grid_height;
4616
const bool dual_plane = prev_candidate.m_coded_log_blk.m_dual_plane;
4617
const uint32_t num_grid_samples = grid_x * grid_y;
4618
const uint32_t num_endpoint_vals = get_num_endpoint_vals(prev_coded_log_blk.m_color_endpoint_modes[0]);
4619
4620
coded_log_blk = prev_candidate.m_coded_log_blk;
4621
decomp_log_blk = prev_candidate.m_decomp_log_blk;
4622
4623
if (prev_coded_log_blk.m_num_partitions == 1)
4624
{
4625
// Now encode the block using the transcoded endpoints
4626
basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];
4627
4628
if (prev_coded_log_blk.m_color_endpoint_modes[0] == 7)
4629
{
4630
status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints, &decoded_half[0][0], nullptr,
4631
astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
4632
}
4633
else
4634
{
4635
status = get_astc_hdr_mode_11_block_colors(coded_log_blk.m_endpoints, &decoded_half[0][0], nullptr,
4636
astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
4637
}
4638
assert(status);
4639
4640
uint8_t trial_weights0[BLOCK_W * BLOCK_H], trial_weights1[BLOCK_W * BLOCK_H];
4641
uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
4642
4643
if (dual_plane)
4644
{
4645
eval_selectors_dual_plane(prev_candidate.m_coded_log_blk.m_color_component_selector,
4646
BLOCK_W * BLOCK_H, trial_weights0, trial_weights1, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
4647
4648
downsample_ise_weights_dual_plane(
4649
coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,
4650
BLOCK_W, BLOCK_H,
4651
grid_x, grid_y,
4652
trial_weights0, trial_weights1, coded_log_blk.m_weights);
4653
4654
basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * 2, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);
4655
}
4656
else
4657
{
4658
eval_selectors(BLOCK_W * BLOCK_H, trial_weights0, coded_log_blk.m_weight_ise_range, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
4659
4660
downsample_ise_weights(
4661
coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,
4662
BLOCK_W, BLOCK_H,
4663
grid_x, grid_y,
4664
trial_weights0, coded_log_blk.m_weights);
4665
4666
basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);
4667
}
4668
4669
// Create the block the decoder would transcode into.
4670
copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk);
4671
}
4672
else if (prev_coded_log_blk.m_num_partitions == 2)
4673
{
4674
assert(!dual_plane);
4675
4676
const int unique_pat_index = g_part2_seed_to_unique_index[coded_log_blk.m_partition_id];
4677
assert((unique_pat_index >= 0) && (unique_pat_index < (int)NUM_UNIQUE_PARTITIONS2));
4678
4679
const partition_pattern_vec& pat_vec = g_partitions2[unique_pat_index];
4680
4681
vec4F part_pixels_q16[2][64];
4682
half_vec3 part_half_pixels[2][64];
4683
uint32_t part_total_pixels[2] = { 0 };
4684
4685
for (uint32_t y = 0; y < BLOCK_H; y++)
4686
{
4687
for (uint32_t x = 0; x < BLOCK_W; x++)
4688
{
4689
const uint32_t part_index = pat_vec[x + y * 6];
4690
4691
uint32_t l = part_total_pixels[part_index];
4692
4693
part_pixels_q16[part_index][l] = block_pixels_q16[y][x];
4694
part_half_pixels[part_index][l] = half_pixels[y][x];
4695
4696
part_total_pixels[part_index] = l + 1;
4697
} // x
4698
} // y
4699
4700
uint8_t blk_weights[2][BLOCK_W * BLOCK_H];
4701
4702
for (uint32_t part_index = 0; part_index < 2; part_index++)
4703
{
4704
basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];
4705
4706
if (prev_coded_log_blk.m_color_endpoint_modes[0] == 7)
4707
{
4708
status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr,
4709
astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
4710
}
4711
else
4712
{
4713
status = get_astc_hdr_mode_11_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr,
4714
astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
4715
}
4716
assert(status);
4717
4718
eval_selectors(part_total_pixels[part_index], blk_weights[part_index], coded_log_blk.m_weight_ise_range,
4719
(basist::half_float*)&part_half_pixels[part_index][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
4720
4721
} // part_index
4722
4723
uint8_t ise_weights[BLOCK_W * BLOCK_H];
4724
4725
uint32_t src_pixel_index[2] = { 0, 0 };
4726
for (uint32_t y = 0; y < BLOCK_H; y++)
4727
{
4728
for (uint32_t x = 0; x < BLOCK_W; x++)
4729
{
4730
const uint32_t part_index = pat_vec[x + y * 6];
4731
4732
ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];
4733
src_pixel_index[part_index]++;
4734
} // x
4735
} // y
4736
4737
downsample_ise_weights(
4738
coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,
4739
BLOCK_W, BLOCK_H,
4740
grid_x, grid_y,
4741
ise_weights, coded_log_blk.m_weights);
4742
4743
// Transcode these codable weights to ASTC weights.
4744
uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H];
4745
basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);
4746
4747
// Create the block the decoder would transcode into.
4748
copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk);
4749
}
4750
else if (prev_coded_log_blk.m_num_partitions == 3)
4751
{
4752
assert(!dual_plane);
4753
4754
const int unique_pat_index = g_part3_seed_to_unique_index[coded_log_blk.m_partition_id];
4755
assert((unique_pat_index >= 0) && (unique_pat_index < (int)NUM_UNIQUE_PARTITIONS3));
4756
4757
const partition_pattern_vec& pat = g_partitions3[unique_pat_index];
4758
4759
vec4F part_pixels_q16[3][64];
4760
half_vec3 part_half_pixels[3][64];
4761
uint32_t part_total_pixels[3] = { 0 };
4762
4763
for (uint32_t y = 0; y < BLOCK_H; y++)
4764
{
4765
for (uint32_t x = 0; x < BLOCK_W; x++)
4766
{
4767
const uint32_t part_index = pat.m_parts[x + y * BLOCK_W];
4768
4769
uint32_t l = part_total_pixels[part_index];
4770
4771
part_pixels_q16[part_index][l] = block_pixels_q16[y][x];
4772
part_half_pixels[part_index][l] = half_pixels[y][x];
4773
4774
part_total_pixels[part_index] = l + 1;
4775
} // x
4776
} // y
4777
4778
uint8_t blk_weights[3][BLOCK_W * BLOCK_H];
4779
4780
for (uint32_t part_index = 0; part_index < 3; part_index++)
4781
{
4782
basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];
4783
4784
status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr,
4785
astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
4786
assert(status);
4787
4788
eval_selectors(part_total_pixels[part_index], blk_weights[part_index], coded_log_blk.m_weight_ise_range,
4789
(basist::half_float*)&part_half_pixels[part_index][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
4790
4791
} // part_index
4792
4793
uint8_t ise_weights[BLOCK_W * BLOCK_H];
4794
4795
uint32_t src_pixel_index[3] = { 0 };
4796
for (uint32_t y = 0; y < BLOCK_H; y++)
4797
{
4798
for (uint32_t x = 0; x < BLOCK_W; x++)
4799
{
4800
const uint32_t part_index = pat.m_parts[x + y * BLOCK_W];
4801
4802
ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];
4803
src_pixel_index[part_index]++;
4804
} // x
4805
} // y
4806
4807
downsample_ise_weights(
4808
coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,
4809
BLOCK_W, BLOCK_H,
4810
grid_x, grid_y,
4811
ise_weights, coded_log_blk.m_weights);
4812
4813
// Transcode these codable weights to ASTC weights.
4814
uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H];
4815
basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);
4816
4817
// Create the block the decoder would transcode into.
4818
copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk);
4819
}
4820
4821
if (!validate_log_blk(decomp_log_blk))
4822
{
4823
fmt_error_printf("pack_astc_block() failed\n");
4824
return false;
4825
}
4826
4827
status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_log_blk, &candidate.m_comp_pixels[0][0]);
4828
if (!status)
4829
{
4830
fmt_error_printf("decode_astc_block() failed\n");
4831
return false;
4832
}
4833
4834
candidate.m_coder.put_bits(REUSE_CODE, REUSE_CODE_LEN);
4835
candidate.m_coder.put_bits(reuse_delta_index, REUSE_XY_DELTA_BITS);
4836
encode_values(candidate.m_coder, num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range);
4837
4838
candidate.m_encoding_type = encoding_type::cReuse;
4839
candidate.m_block_mode = prev_candidate.m_block_mode;
4840
candidate.m_endpoint_mode = prev_candidate.m_endpoint_mode;
4841
candidate.m_reuse_delta_index = reuse_delta_index;
4842
4843
candidates.emplace_back(std::move(candidate));
4844
4845
} // reuse_delta_index
4846
}
4847
4848
// ---- Solid candidate
4849
if (global_cfg.m_use_solid_blocks)
4850
{
4851
candidate_encoding candidate;
4852
candidate.m_coder.reserve(24);
4853
4854
// solid
4855
candidate.m_encoding_type = encoding_type::cSolid;
4856
4857
float r = 0.0f, g = 0.0f, b = 0.0f;
4858
const float LOG_BIAS = .125f;
4859
bool solid_block = true;
4860
for (uint32_t y = 0; y < BLOCK_H; y++)
4861
{
4862
for (uint32_t x = 0; x < BLOCK_W; x++)
4863
{
4864
if ((block_pixels[0][0][0] != block_pixels[y][x][0]) ||
4865
(block_pixels[0][0][1] != block_pixels[y][x][1]) ||
4866
(block_pixels[0][0][2] != block_pixels[y][x][2]))
4867
{
4868
solid_block = false;
4869
}
4870
4871
r += log2f(block_pixels[y][x][0] + LOG_BIAS);
4872
g += log2f(block_pixels[y][x][1] + LOG_BIAS);
4873
b += log2f(block_pixels[y][x][2] + LOG_BIAS);
4874
}
4875
}
4876
4877
if (solid_block)
4878
{
4879
r = block_pixels[0][0][0];
4880
g = block_pixels[0][0][1];
4881
b = block_pixels[0][0][2];
4882
}
4883
else
4884
{
4885
r = maximum<float>(0.0f, powf(2.0f, r * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS);
4886
g = maximum<float>(0.0f, powf(2.0f, g * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS);
4887
b = maximum<float>(0.0f, powf(2.0f, b * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS);
4888
4889
r = minimum<float>(r, basist::MAX_HALF_FLOAT);
4890
g = minimum<float>(g, basist::MAX_HALF_FLOAT);
4891
b = minimum<float>(b, basist::MAX_HALF_FLOAT);
4892
}
4893
4894
basist::half_float rh = float_to_half_non_neg_no_nan_inf(r), gh = float_to_half_non_neg_no_nan_inf(g), bh = float_to_half_non_neg_no_nan_inf(b);
4895
4896
candidate.m_solid_color[0] = rh;
4897
candidate.m_solid_color[1] = gh;
4898
candidate.m_solid_color[2] = bh;
4899
4900
candidate.m_coder.put_bits(SOLID_CODE, SOLID_CODE_LEN);
4901
4902
candidate.m_coder.put_bits(rh, 15);
4903
candidate.m_coder.put_bits(gh, 15);
4904
candidate.m_coder.put_bits(bh, 15);
4905
4906
vec3F cp(basist::half_to_float(rh), basist::half_to_float(gh), basist::half_to_float(bh));
4907
4908
for (uint32_t y = 0; y < BLOCK_H; y++)
4909
for (uint32_t x = 0; x < BLOCK_W; x++)
4910
candidate.m_comp_pixels[y][x] = cp;
4911
4912
astc_helpers::log_astc_block& log_blk = candidate.m_coded_log_blk;
4913
4914
log_blk.clear();
4915
log_blk.m_solid_color_flag_hdr = true;
4916
log_blk.m_solid_color[0] = rh;
4917
log_blk.m_solid_color[1] = gh;
4918
log_blk.m_solid_color[2] = bh;
4919
log_blk.m_solid_color[3] = basist::float_to_half(1.0f);
4920
4921
candidate.m_decomp_log_blk = log_blk;
4922
4923
candidates.emplace_back(std::move(candidate));
4924
}
4925
4926
if ((!is_solid_block) || (!global_cfg.m_use_solid_blocks))
4927
{
4928
static uint8_t s_parts2_normal[5] = { 0, 2, 4, 6, 8 };
4929
static uint8_t s_parts3_normal[5] = { 0, 0, 4, 6, 8 };
4930
4931
static uint8_t s_parts2_complex[5] = { 0, 4, 8, 10, 16 };
4932
static uint8_t s_parts3_complex[5] = { 0, 0, 8, 10, 16 };
4933
4934
static uint8_t s_parts2_very_complex[5] = { 0, 8, 12, 14, 20 };
4935
static uint8_t s_parts3_very_complex[5] = { 0, 0, 12, 14, 20 };
4936
4937
uint32_t total_parts2 = 0, total_parts3 = 0;
4938
4939
assert(comp_level < 5);
4940
if ((very_simple_block) && (comp_level <= 3))
4941
{
4942
// Block's std dev is so low that 2-3 subsets are unlikely to help much
4943
total_parts2 = 0;
4944
total_parts3 = 0;
4945
4946
debug_state.m_total_part2_stats[0].fetch_add(1, std::memory_order_relaxed);
4947
}
4948
else if (very_complex_block)
4949
{
4950
total_parts2 = s_parts2_very_complex[comp_level];
4951
total_parts3 = s_parts3_very_complex[comp_level];
4952
4953
if (global_cfg.m_extra_patterns_flag)
4954
{
4955
total_parts2 += (comp_level == 4) ? 30 : 20;
4956
total_parts3 += (comp_level == 4) ? 30 : 20;
4957
}
4958
4959
debug_state.m_total_part2_stats[2].fetch_add(1, std::memory_order_relaxed);
4960
}
4961
else if (complex_block)
4962
{
4963
total_parts2 = s_parts2_complex[comp_level];
4964
total_parts3 = s_parts3_complex[comp_level];
4965
4966
if (global_cfg.m_extra_patterns_flag)
4967
{
4968
total_parts2 += (comp_level == 4) ? 15 : 10;
4969
total_parts3 += (comp_level == 4) ? 15 : 10;
4970
}
4971
4972
debug_state.m_total_part2_stats[3].fetch_add(1, std::memory_order_relaxed);
4973
}
4974
else
4975
{
4976
// moderate complexity - use defaults
4977
total_parts2 = s_parts2_normal[comp_level];
4978
total_parts3 = s_parts3_normal[comp_level];
4979
4980
if (global_cfg.m_extra_patterns_flag)
4981
{
4982
total_parts2 += 5;
4983
total_parts3 += 5;
4984
}
4985
4986
debug_state.m_total_part2_stats[1].fetch_add(1, std::memory_order_relaxed);
4987
}
4988
4989
if (!any_2subset_enabled)
4990
total_parts2 = 0;
4991
4992
if (!any_3subset_enabled)
4993
total_parts3 = 0;
4994
4995
int best_parts2_mode11[NUM_UNIQUE_PARTITIONS2], best_parts2_mode7[NUM_UNIQUE_PARTITIONS2];
4996
bool has_estimated_parts2 = false;
4997
4998
if (total_parts2)
4999
{
5000
if (global_cfg.m_brute_force_partition_matching)
5001
{
5002
int candidate_pats2[NUM_UNIQUE_PARTITIONS2];
5003
for (uint32_t i = 0; i < NUM_UNIQUE_PARTITIONS2; i++)
5004
candidate_pats2[i] = i;
5005
5006
if (any_2subset_enabled)
5007
{
5008
estimate_partitions_mode7_and_11(
5009
2,
5010
NUM_UNIQUE_PARTITIONS2, g_partitions2,
5011
NUM_UNIQUE_PARTITIONS2, (uint32_t*)candidate_pats2,
5012
&half_pixels_as_floats[0][0],
5013
coptions,
5014
total_parts2, best_parts2_mode11, best_parts2_mode7);
5015
}
5016
5017
has_estimated_parts2 = true;
5018
}
5019
else
5020
{
5021
if (comp_level >= 1)
5022
{
5023
const uint32_t MAX_CANDIDATES2 = 48;
5024
int candidate_pats2[MAX_CANDIDATES2 * 2];
5025
5026
uint32_t num_candidate_pats2 = maximum((total_parts2 * 3) / 2, very_complex_block ? MAX_CANDIDATES2 : (MAX_CANDIDATES2 / 2));
5027
num_candidate_pats2 = minimum<uint32_t>(num_candidate_pats2, (uint32_t)std::size(candidate_pats2));
5028
5029
has_estimated_parts2 = estimate_partition2_6x6((basist::half_float(*)[3])half_pixels, candidate_pats2, num_candidate_pats2);
5030
5031
if (has_estimated_parts2)
5032
{
5033
estimate_partitions_mode7_and_11(
5034
2,
5035
NUM_UNIQUE_PARTITIONS2, g_partitions2,
5036
num_candidate_pats2, (uint32_t*)candidate_pats2,
5037
&half_pixels_as_floats[0][0],
5038
coptions,
5039
total_parts2, best_parts2_mode11, best_parts2_mode7);
5040
}
5041
}
5042
else
5043
{
5044
has_estimated_parts2 = estimate_partition2_6x6((basist::half_float(*)[3])half_pixels, best_parts2_mode11, total_parts2);
5045
5046
if ((has_estimated_parts2) && (any_2subset_mode7_enabled))
5047
memcpy(best_parts2_mode7, best_parts2_mode11, total_parts2 * sizeof(best_parts2_mode7[0]));
5048
}
5049
}
5050
}
5051
5052
int best_parts3[NUM_UNIQUE_PARTITIONS3];
5053
bool has_estimated_parts3 = false;
5054
5055
if (total_parts3)
5056
{
5057
#if 0
5058
has_estimated_parts3 = estimate_partition3_6x6((basist::half_float(*)[3])half_pixels, best_parts3, total_parts3);
5059
#elif 1
5060
if (global_cfg.m_brute_force_partition_matching)
5061
{
5062
int candidate_pats3[NUM_UNIQUE_PARTITIONS3];
5063
for (uint32_t i = 0; i < NUM_UNIQUE_PARTITIONS3; i++)
5064
candidate_pats3[i] = i;
5065
5066
estimate_partitions_mode7(
5067
3,
5068
NUM_UNIQUE_PARTITIONS3, g_partitions3,
5069
NUM_UNIQUE_PARTITIONS3, (uint32_t*)candidate_pats3,
5070
&half_pixels_as_floats[0][0],
5071
coptions,
5072
total_parts3, (uint32_t*)best_parts3);
5073
5074
has_estimated_parts3 = true;
5075
}
5076
else
5077
{
5078
const uint32_t MAX_CANDIDATES3 = 48;
5079
int candidate_pats3[MAX_CANDIDATES3 * 2];
5080
5081
uint32_t num_candidate_pats3 = maximum((total_parts3 * 3) / 2, very_complex_block ? MAX_CANDIDATES3 : (MAX_CANDIDATES3 / 2));
5082
num_candidate_pats3 = minimum<uint32_t>(num_candidate_pats3, (uint32_t)std::size(candidate_pats3));
5083
5084
has_estimated_parts3 = estimate_partition3_6x6((basist::half_float(*)[3])half_pixels, candidate_pats3, num_candidate_pats3);
5085
5086
if (has_estimated_parts3)
5087
{
5088
estimate_partitions_mode7(
5089
3,
5090
NUM_UNIQUE_PARTITIONS3, g_partitions3,
5091
num_candidate_pats3, (uint32_t*)candidate_pats3,
5092
&half_pixels_as_floats[0][0],
5093
coptions,
5094
total_parts3, (uint32_t*)best_parts3);
5095
}
5096
}
5097
#endif
5098
}
5099
5100
const opt_mode_t mode11_opt_mode = complex_block ? cWeightedLeastSquares : cOrdinaryLeastSquares;
5101
5102
// ---- Encoded block candidate
5103
for (uint32_t block_mode_iter = 0; block_mode_iter < (uint32_t)block_mode::cBMTotalModes; block_mode_iter++)
5104
{
5105
const block_mode bm = (block_mode)block_mode_iter;
5106
5107
if (comp_level == 0)
5108
{
5109
if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL0) == 0)
5110
continue;
5111
}
5112
else if (comp_level == 1)
5113
{
5114
if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL1) == 0)
5115
continue;
5116
}
5117
else if (comp_level == 2)
5118
{
5119
if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL2) == 0)
5120
continue;
5121
}
5122
5123
if (global_cfg.m_block_stat_optimizations_flag)
5124
{
5125
if ((comp_level <= 3) && (g_block_mode_descs[block_mode_iter].m_dp))
5126
{
5127
if ((global_cfg.m_lambda > 0.0f) && (!complex_block) && (g_block_mode_descs[block_mode_iter].m_grid_x == 2) && (g_block_mode_descs[block_mode_iter].m_grid_y == 2))
5128
{
5129
if (g_block_mode_descs[block_mode_iter].m_dp_channel != desired_dp_chan_2x2)
5130
continue;
5131
}
5132
else
5133
{
5134
if (g_block_mode_descs[block_mode_iter].m_dp_channel != desired_dp_chan)
5135
continue;
5136
}
5137
}
5138
5139
if (comp_level <= 3)
5140
{
5141
const uint32_t grid_x = g_block_mode_descs[block_mode_iter].m_grid_x;
5142
const uint32_t grid_y = g_block_mode_descs[block_mode_iter].m_grid_y;
5143
5144
if (!g_block_mode_descs[block_mode_iter].m_dp)
5145
{
5146
// Minor gain (.5-1% less candidates)
5147
if (very_detailed_block)
5148
{
5149
if (grid_x * grid_y <= 12)
5150
{
5151
debug_state.m_detail_stats[0].fetch_add(1, std::memory_order_relaxed);
5152
continue;
5153
}
5154
}
5155
5156
// Major gains (10-25% less candidates)
5157
if (very_blurry_block)
5158
{
5159
if ((grid_x > 4) || (grid_y > 4) || (g_block_mode_descs[block_mode_iter].m_num_partitions > 1))
5160
{
5161
debug_state.m_detail_stats[1].fetch_add(1, std::memory_order_relaxed);
5162
continue;
5163
}
5164
}
5165
if (super_blurry_block)
5166
{
5167
if ((grid_x > 3) || (grid_y > 3) || (g_block_mode_descs[block_mode_iter].m_num_partitions > 1))
5168
{
5169
debug_state.m_detail_stats[2].fetch_add(1, std::memory_order_relaxed);
5170
continue;
5171
}
5172
}
5173
}
5174
5175
if (grid_x != grid_y)
5176
{
5177
if (grid_x < grid_y)
5178
{
5179
if (!filter_horizontally)
5180
{
5181
debug_state.m_detail_stats[3].fetch_add(1, std::memory_order_relaxed);
5182
continue;
5183
}
5184
}
5185
else
5186
{
5187
if (filter_horizontally)
5188
{
5189
debug_state.m_detail_stats[4].fetch_add(1, std::memory_order_relaxed);
5190
continue;
5191
}
5192
}
5193
}
5194
}
5195
5196
if (global_cfg.m_lambda == 0.0f)
5197
{
5198
// Rarely useful if lambda=0
5199
if ((g_block_mode_descs[block_mode_iter].m_grid_x == 2) && (g_block_mode_descs[block_mode_iter].m_grid_y == 2))
5200
continue;
5201
}
5202
} // block_stat_optimizations_flag
5203
5204
if ((!use_single_subset_mode7) &&
5205
(g_block_mode_descs[block_mode_iter].m_cem == 7) &&
5206
(g_block_mode_descs[block_mode_iter].m_num_partitions == 1))
5207
{
5208
debug_state.m_total_mode7_skips.fetch_add(1, std::memory_order_relaxed);
5209
continue;
5210
}
5211
5212
for (uint32_t endpoint_mode_iter = 0; endpoint_mode_iter < (uint32_t)endpoint_mode::cTotal; endpoint_mode_iter++)
5213
{
5214
if (global_cfg.m_lambda == 0.0f)
5215
{
5216
// No use trying anything else
5217
if (endpoint_mode_iter != (uint32_t)endpoint_mode::cRaw)
5218
continue;
5219
}
5220
5221
if (global_cfg.m_disable_delta_endpoint_usage)
5222
{
5223
if ((endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpperDelta) || (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseLeftDelta))
5224
continue;
5225
}
5226
5227
if (!global_cfg.m_favor_higher_compression)
5228
{
5229
if (comp_level == 0)
5230
{
5231
if (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpperDelta)
5232
continue;
5233
}
5234
5235
if (comp_level <= 1)
5236
{
5237
if ((endpoint_mode_iter == (uint32_t)endpoint_mode::cUseLeft) || (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpper))
5238
continue;
5239
}
5240
}
5241
5242
const endpoint_mode em = (endpoint_mode)endpoint_mode_iter;
5243
5244
switch (em)
5245
{
5246
case endpoint_mode::cUseLeft:
5247
case endpoint_mode::cUseUpper:
5248
{
5249
const block_mode_desc& local_md = g_block_mode_descs[block_mode_iter];
5250
const uint32_t cem = local_md.m_cem;
5251
5252
if (local_md.m_num_partitions > 1)
5253
break;
5254
5255
if ((em == endpoint_mode::cUseLeft) && (!has_left_neighbor))
5256
break;
5257
else if ((em == endpoint_mode::cUseUpper) && (!has_upper_neighbor))
5258
break;
5259
5260
candidate_encoding candidate;
5261
candidate.m_coder.reserve(24);
5262
astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
5263
5264
int nx = bx, ny = by;
5265
if (em == endpoint_mode::cUseLeft)
5266
nx--;
5267
else
5268
ny--;
5269
5270
const candidate_encoding& neighbor_blk = enc_state.coded_blocks(nx, ny);
5271
if (neighbor_blk.m_encoding_type == encoding_type::cSolid)
5272
break;
5273
assert((neighbor_blk.m_encoding_type == encoding_type::cBlock) || (neighbor_blk.m_encoding_type == encoding_type::cReuse));
5274
5275
const block_mode_desc& neighbor_md = g_block_mode_descs[(uint32_t)neighbor_blk.m_block_mode];
5276
5277
if (neighbor_md.m_cem != cem)
5278
break;
5279
5280
assert(neighbor_blk.m_coded_log_blk.m_color_endpoint_modes[0] == cem);
5281
5282
const uint32_t grid_x = local_md.m_grid_x, grid_y = local_md.m_grid_y;
5283
const bool dual_plane = local_md.m_dp;
5284
const uint32_t num_grid_samples = grid_x * grid_y;
5285
const uint32_t num_endpoint_vals = get_num_endpoint_vals(local_md.m_cem);
5286
5287
coded_log_blk.m_grid_width = (uint8_t)grid_x;
5288
coded_log_blk.m_grid_height = (uint8_t)grid_y;
5289
coded_log_blk.m_dual_plane = (uint8_t)dual_plane;
5290
coded_log_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;
5291
coded_log_blk.m_num_partitions = 1;
5292
coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)neighbor_md.m_cem;
5293
coded_log_blk.m_weight_ise_range = (uint8_t)local_md.m_weight_ise_range;
5294
5295
// We're not explicitly writing any endpoints, just reusing existing ones. So copy the neighbor's endpoints unchanged (so no loss).
5296
coded_log_blk.m_endpoint_ise_range = neighbor_blk.m_coded_log_blk.m_endpoint_ise_range;
5297
memcpy(coded_log_blk.m_endpoints, neighbor_blk.m_coded_log_blk.m_endpoints, num_endpoint_vals);
5298
5299
uint8_t transcode_endpoints[basist::NUM_MODE11_ENDPOINTS];
5300
5301
// Requantize the neighbor's endpoints to whatever we'll have to transcode into to make a valid ASTC encoding.
5302
basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem,
5303
neighbor_blk.m_coded_log_blk.m_endpoint_ise_range, neighbor_blk.m_coded_log_blk.m_endpoints,
5304
local_md.m_transcode_endpoint_ise_range, transcode_endpoints);
5305
5306
// Now encode the block using the transcoded endpoints
5307
basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];
5308
5309
if (cem == 7)
5310
{
5311
status = get_astc_hdr_mode_7_block_colors(transcode_endpoints, &decoded_half[0][0], nullptr,
5312
astc_helpers::get_ise_levels(local_md.m_weight_ise_range), local_md.m_weight_ise_range, local_md.m_transcode_endpoint_ise_range);
5313
}
5314
else
5315
{
5316
status = get_astc_hdr_mode_11_block_colors(transcode_endpoints, &decoded_half[0][0], nullptr,
5317
astc_helpers::get_ise_levels(local_md.m_weight_ise_range), local_md.m_weight_ise_range, local_md.m_transcode_endpoint_ise_range);
5318
}
5319
if (!status)
5320
break;
5321
5322
uint8_t trial_weights0[BLOCK_W * BLOCK_H], trial_weights1[BLOCK_W * BLOCK_H];
5323
if (dual_plane)
5324
{
5325
eval_selectors_dual_plane(local_md.m_dp_channel, BLOCK_W * BLOCK_H, trial_weights0, trial_weights1, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(local_md.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
5326
5327
downsample_ise_weights_dual_plane(
5328
local_md.m_weight_ise_range, local_md.m_weight_ise_range,
5329
BLOCK_W, BLOCK_H,
5330
grid_x, grid_y,
5331
trial_weights0, trial_weights1, coded_log_blk.m_weights);
5332
}
5333
else
5334
{
5335
eval_selectors(BLOCK_W * BLOCK_H, trial_weights0, local_md.m_weight_ise_range, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(local_md.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
5336
5337
downsample_ise_weights(
5338
local_md.m_weight_ise_range, local_md.m_weight_ise_range,
5339
BLOCK_W, BLOCK_H,
5340
grid_x, grid_y,
5341
trial_weights0, coded_log_blk.m_weights);
5342
}
5343
5344
// Transcode these codable weights to ASTC weights.
5345
uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
5346
basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, local_md.m_weight_ise_range, transcode_weights, local_md.m_transcode_weight_ise_range);
5347
5348
// Create the block the decoder would transcode into.
5349
astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
5350
decomp_blk.clear();
5351
5352
decomp_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem;
5353
decomp_blk.m_dual_plane = local_md.m_dp;
5354
decomp_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;
5355
decomp_blk.m_num_partitions = 1;
5356
decomp_blk.m_endpoint_ise_range = (uint8_t)local_md.m_transcode_endpoint_ise_range;
5357
decomp_blk.m_weight_ise_range = (uint8_t)local_md.m_transcode_weight_ise_range;
5358
5359
memcpy(decomp_blk.m_endpoints, transcode_endpoints, num_endpoint_vals);
5360
5361
copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk);
5362
5363
if (!validate_log_blk(decomp_blk))
5364
{
5365
fmt_error_printf("pack_astc_block() failed\n");
5366
return false;
5367
}
5368
5369
status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
5370
if (!status)
5371
{
5372
fmt_error_printf("decode_astc_block() failed\n");
5373
return false;
5374
}
5375
5376
candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
5377
code_block(candidate.m_coder, candidate.m_coded_log_blk, (block_mode)block_mode_iter, em, nullptr);
5378
5379
candidate.m_encoding_type = encoding_type::cBlock;
5380
candidate.m_endpoint_mode = em;
5381
candidate.m_block_mode = bm;
5382
5383
candidates.emplace_back(std::move(candidate));
5384
5385
break;
5386
}
5387
case endpoint_mode::cUseLeftDelta:
5388
case endpoint_mode::cUseUpperDelta:
5389
{
5390
const block_mode_desc& local_md = g_block_mode_descs[block_mode_iter];
5391
const uint32_t cem = local_md.m_cem;
5392
5393
if (local_md.m_num_partitions > 1)
5394
break;
5395
5396
if ((em == endpoint_mode::cUseLeftDelta) && (!has_left_neighbor))
5397
break;
5398
else if ((em == endpoint_mode::cUseUpperDelta) && (!has_upper_neighbor))
5399
break;
5400
5401
candidate_encoding candidate;
5402
candidate.m_coder.reserve(24);
5403
astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
5404
5405
int nx = bx, ny = by;
5406
if (em == endpoint_mode::cUseLeftDelta)
5407
nx--;
5408
else
5409
ny--;
5410
5411
const candidate_encoding& neighbor_blk = enc_state.coded_blocks(nx, ny);
5412
if (neighbor_blk.m_encoding_type == encoding_type::cSolid)
5413
break;
5414
assert((neighbor_blk.m_encoding_type == encoding_type::cBlock) || (neighbor_blk.m_encoding_type == encoding_type::cReuse));
5415
5416
const block_mode_desc& neighbor_md = g_block_mode_descs[(uint32_t)neighbor_blk.m_block_mode];
5417
5418
if (neighbor_md.m_cem != cem)
5419
break;
5420
5421
assert(neighbor_md.m_cem == local_md.m_cem);
5422
5423
const uint32_t grid_x = local_md.m_grid_x, grid_y = local_md.m_grid_y;
5424
const bool dual_plane = local_md.m_dp;
5425
const uint32_t num_grid_samples = grid_x * grid_y;
5426
const uint32_t num_endpoint_vals = get_num_endpoint_vals(local_md.m_cem);
5427
5428
// Dequantize neighbor's endpoints to ISE 20
5429
uint8_t neighbor_endpoints_ise20[basist::NUM_MODE11_ENDPOINTS];
5430
basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem,
5431
neighbor_blk.m_coded_log_blk.m_endpoint_ise_range, neighbor_blk.m_coded_log_blk.m_endpoints,
5432
astc_helpers::BISE_256_LEVELS, neighbor_endpoints_ise20);
5433
5434
// Requantize neighbor's endpoints to our local desired coding ISE range
5435
uint8_t neighbor_endpoints_coding_ise_local[basist::NUM_MODE11_ENDPOINTS];
5436
basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem, astc_helpers::BISE_256_LEVELS, neighbor_endpoints_ise20, local_md.m_endpoint_ise_range, neighbor_endpoints_coding_ise_local);
5437
5438
uint8_t blk_endpoints[basist::NUM_MODE11_ENDPOINTS];
5439
uint8_t blk_weights0[NUM_BLOCK_PIXELS], blk_weights1[NUM_BLOCK_PIXELS];
5440
5441
// Now try to encode the current block using the neighbor's endpoints submode.
5442
double err = 0.0f;
5443
uint32_t best_submode = 0;
5444
5445
if (cem == 7)
5446
{
5447
int maj_index, submode_index;
5448
decode_cem_7_config(neighbor_endpoints_ise20, submode_index, maj_index);
5449
5450
int first_submode = submode_index, last_submode = submode_index;
5451
5452
err = encode_astc_hdr_block_mode_7(
5453
NUM_BLOCK_PIXELS,
5454
(basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16,
5455
local_md.m_weight_ise_range,
5456
best_submode,
5457
BIG_FLOAT_VAL,
5458
blk_endpoints, blk_weights0,
5459
coptions,
5460
local_md.m_endpoint_ise_range,
5461
first_submode, last_submode,
5462
&enc_block_stats);
5463
}
5464
else
5465
{
5466
int maj_index, submode_index;
5467
decode_cem_11_config(neighbor_endpoints_ise20, submode_index, maj_index);
5468
5469
int first_submode = -1, last_submode = -1;
5470
if (maj_index == 3)
5471
{
5472
// direct
5473
}
5474
else
5475
{
5476
first_submode = submode_index;
5477
last_submode = submode_index;
5478
}
5479
5480
if (dual_plane)
5481
{
5482
err = encode_astc_hdr_block_mode_11_dual_plane(
5483
NUM_BLOCK_PIXELS,
5484
(basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16,
5485
local_md.m_dp_channel,
5486
local_md.m_weight_ise_range,
5487
best_submode,
5488
BIG_FLOAT_VAL,
5489
blk_endpoints, blk_weights0, blk_weights1,
5490
coptions,
5491
false,
5492
local_md.m_endpoint_ise_range,
5493
false, //uber_mode_flag,
5494
false,
5495
first_submode, last_submode, true);
5496
}
5497
else
5498
{
5499
err = encode_astc_hdr_block_mode_11(
5500
NUM_BLOCK_PIXELS,
5501
(basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16,
5502
local_md.m_weight_ise_range,
5503
best_submode,
5504
BIG_FLOAT_VAL,
5505
blk_endpoints, blk_weights0,
5506
coptions,
5507
false,
5508
local_md.m_endpoint_ise_range,
5509
false, //uber_mode_flag,
5510
false,
5511
first_submode, last_submode, true,
5512
mode11_opt_mode,
5513
&enc_block_stats);
5514
}
5515
}
5516
5517
if (err == BIG_FLOAT_VAL)
5518
break;
5519
5520
uint8_t endpoint_deltas[basist::NUM_MODE11_ENDPOINTS];
5521
5522
// TODO: For now, just try 5 bits for each endpoint. Can tune later.
5523
// This isn't right, it's computing the deltas in ISE space.
5524
//const uint32_t NUM_ENDPOINT_DELTA_BITS = 5;
5525
const int total_endpoint_delta_vals = 1 << NUM_ENDPOINT_DELTA_BITS;
5526
const int low_delta_limit = -(total_endpoint_delta_vals / 2), high_delta_limit = (total_endpoint_delta_vals / 2) - 1;
5527
5528
const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(local_md.m_endpoint_ise_range).m_ISE_to_rank;
5529
5530
bool all_deltas_in_limits = true;
5531
for (uint32_t i = 0; i < num_endpoint_vals; i++)
5532
{
5533
int endpoint_delta = (int)ise_to_rank[blk_endpoints[i]] - (int)ise_to_rank[neighbor_endpoints_coding_ise_local[i]];
5534
5535
if ((endpoint_delta < low_delta_limit) || (endpoint_delta > high_delta_limit))
5536
all_deltas_in_limits = false;
5537
5538
endpoint_deltas[i] = (uint8_t)(endpoint_delta + -low_delta_limit);
5539
}
5540
5541
if (all_deltas_in_limits)
5542
{
5543
coded_log_blk.m_grid_width = (uint8_t)grid_x;
5544
coded_log_blk.m_grid_height = (uint8_t)grid_y;
5545
coded_log_blk.m_dual_plane = (uint8_t)dual_plane;
5546
coded_log_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;
5547
coded_log_blk.m_num_partitions = 1;
5548
coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem;
5549
coded_log_blk.m_weight_ise_range = (uint8_t)local_md.m_weight_ise_range;
5550
coded_log_blk.m_endpoint_ise_range = (uint8_t)local_md.m_endpoint_ise_range;
5551
5552
memcpy(coded_log_blk.m_endpoints, blk_endpoints, num_endpoint_vals);
5553
5554
uint8_t transcode_endpoints[basist::NUM_MODE11_ENDPOINTS];
5555
uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
5556
5557
basist::astc_6x6_hdr::requantize_ise_endpoints(local_md.m_cem, local_md.m_endpoint_ise_range, blk_endpoints, local_md.m_transcode_endpoint_ise_range, transcode_endpoints);
5558
5559
if (dual_plane)
5560
{
5561
downsample_ise_weights_dual_plane(
5562
local_md.m_weight_ise_range, local_md.m_weight_ise_range,
5563
BLOCK_W, BLOCK_H,
5564
grid_x, grid_y,
5565
blk_weights0, blk_weights1,
5566
coded_log_blk.m_weights);
5567
}
5568
else
5569
{
5570
downsample_ise_weights(
5571
local_md.m_weight_ise_range, local_md.m_weight_ise_range,
5572
BLOCK_W, BLOCK_H,
5573
grid_x, grid_y,
5574
blk_weights0, coded_log_blk.m_weights);
5575
}
5576
5577
basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, local_md.m_weight_ise_range, transcode_weights, local_md.m_transcode_weight_ise_range);
5578
5579
// Create the block the decoder would transcode into.
5580
5581
astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
5582
decomp_blk.clear();
5583
5584
decomp_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem;
5585
decomp_blk.m_dual_plane = local_md.m_dp;
5586
decomp_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;
5587
decomp_blk.m_num_partitions = 1;
5588
decomp_blk.m_endpoint_ise_range = (uint8_t)local_md.m_transcode_endpoint_ise_range;
5589
decomp_blk.m_weight_ise_range = (uint8_t)local_md.m_transcode_weight_ise_range;
5590
5591
memcpy(decomp_blk.m_endpoints, transcode_endpoints, num_endpoint_vals);
5592
5593
copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk);
5594
5595
if (!validate_log_blk(decomp_blk))
5596
{
5597
fmt_error_printf("pack_astc_block() failed\n");
5598
return false;
5599
}
5600
5601
status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
5602
if (!status)
5603
{
5604
fmt_error_printf("decode_astc_block() failed\n");
5605
return false;
5606
}
5607
5608
candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
5609
code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, endpoint_deltas);
5610
5611
candidate.m_encoding_type = encoding_type::cBlock;
5612
candidate.m_endpoint_mode = em;
5613
candidate.m_block_mode = bm;
5614
5615
candidates.emplace_back(std::move(candidate));
5616
}
5617
5618
break;
5619
}
5620
case endpoint_mode::cRaw:
5621
{
5622
//if (candidates.size() == 339)
5623
// fmt_printf("!");
5624
5625
const auto& mode_desc = g_block_mode_descs[(uint32_t)bm];
5626
const uint32_t cem = mode_desc.m_cem;
5627
//const uint32_t num_endpoint_vals = get_num_endpoint_vals(cem);
5628
const bool dual_plane = mode_desc.m_dp;
5629
5630
if ((global_cfg.m_disable_twothree_subsets) && (mode_desc.m_num_partitions >= 2))
5631
break;
5632
5633
if (mode_desc.m_num_partitions == 3)
5634
{
5635
assert(!dual_plane);
5636
5637
if (!has_estimated_parts3)
5638
break;
5639
5640
assert(mode_desc.m_weight_ise_range == mode_desc.m_transcode_weight_ise_range);
5641
assert(mode_desc.m_endpoint_ise_range == mode_desc.m_transcode_endpoint_ise_range);
5642
5643
trial_result res;
5644
5645
status = encode_block_3_subsets(
5646
res,
5647
cem,
5648
mode_desc.m_grid_x, mode_desc.m_grid_y,
5649
mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range,
5650
&half_pixels[0][0], (vec4F*)block_pixels_q16,
5651
coptions,
5652
uber_mode_flag,
5653
best_parts3, total_parts3, comp_level, mode11_opt_mode);
5654
5655
if (!status)
5656
break;
5657
5658
assert(res.m_valid);
5659
5660
candidate_encoding candidate;
5661
candidate.m_coder.reserve(24);
5662
astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
5663
5664
coded_log_blk = res.m_log_blk;
5665
5666
astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
5667
decomp_blk = res.m_log_blk;
5668
5669
if (!validate_log_blk(decomp_blk))
5670
{
5671
fmt_error_printf("pack_astc_block() failed\n");
5672
return false;
5673
}
5674
5675
status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
5676
if (!status)
5677
{
5678
fmt_error_printf("decode_astc_block() failed\n");
5679
return false;
5680
}
5681
5682
candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
5683
code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr);
5684
5685
candidate.m_encoding_type = encoding_type::cBlock;
5686
candidate.m_endpoint_mode = em;
5687
candidate.m_block_mode = bm;
5688
5689
candidates.emplace_back(std::move(candidate));
5690
}
5691
else if (mode_desc.m_num_partitions == 2)
5692
{
5693
assert(!dual_plane);
5694
5695
if (!has_estimated_parts2)
5696
break;
5697
5698
assert(mode_desc.m_weight_ise_range == mode_desc.m_transcode_weight_ise_range);
5699
assert(mode_desc.m_endpoint_ise_range == mode_desc.m_transcode_endpoint_ise_range);
5700
5701
for (uint32_t est_part_iter = 0; est_part_iter < total_parts2; est_part_iter++)
5702
{
5703
trial_result results[2];
5704
5705
assert(((cem == 11) && any_2subset_mode11_enabled) || ((cem == 7) && any_2subset_mode7_enabled));
5706
5707
status = encode_block_2_subsets(
5708
results,
5709
mode_desc.m_grid_x, mode_desc.m_grid_y,
5710
mode_desc.m_cem,
5711
mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range,
5712
&half_pixels[0][0], (vec4F*)block_pixels_q16,
5713
coptions,
5714
uber_mode_flag,
5715
(cem == 11) ? best_parts2_mode11[est_part_iter] : best_parts2_mode7[est_part_iter],
5716
comp_level,
5717
mode11_opt_mode,
5718
true);
5719
5720
if (!status)
5721
continue;
5722
5723
for (uint32_t r_iter = 0; r_iter < 2; r_iter++)
5724
{
5725
const trial_result& res = results[r_iter];
5726
5727
if (!res.m_valid)
5728
continue;
5729
5730
candidate_encoding candidate;
5731
candidate.m_coder.reserve(24);
5732
astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
5733
5734
coded_log_blk = res.m_log_blk;
5735
5736
astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
5737
decomp_blk = res.m_log_blk;
5738
5739
if (!validate_log_blk(decomp_blk))
5740
{
5741
fmt_error_printf("pack_astc_block() failed\n");
5742
return false;
5743
}
5744
5745
status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
5746
if (!status)
5747
{
5748
fmt_error_printf("decode_astc_block() failed\n");
5749
return false;
5750
}
5751
5752
candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
5753
code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr);
5754
5755
candidate.m_encoding_type = encoding_type::cBlock;
5756
candidate.m_endpoint_mode = em;
5757
candidate.m_block_mode = bm;
5758
5759
candidates.emplace_back(std::move(candidate));
5760
5761
} // r_iter
5762
}
5763
}
5764
else
5765
{
5766
// 1 subset
5767
uint8_t blk_weights0[BLOCK_W * BLOCK_H], blk_weights1[BLOCK_W * BLOCK_H];
5768
uint32_t best_submode = 0;
5769
5770
candidate_encoding candidate;
5771
candidate.m_coder.reserve(24);
5772
astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
5773
5774
const uint32_t grid_x = mode_desc.m_grid_x, grid_y = mode_desc.m_grid_y;
5775
const uint32_t num_grid_samples = grid_x * grid_y;
5776
5777
const half_vec3* pBlock_pixels_half = &half_pixels[0][0];
5778
const vec4F* pBlock_pixels_q16 = &block_pixels_q16[0][0];
5779
5780
const uint32_t num_grid_samples_dp = num_grid_samples * (dual_plane ? 2 : 1);
5781
5782
uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
5783
5784
coded_log_blk.m_grid_width = (uint8_t)grid_x;
5785
coded_log_blk.m_grid_height = (uint8_t)grid_y;
5786
coded_log_blk.m_dual_plane = (uint8_t)dual_plane;
5787
coded_log_blk.m_color_component_selector = (uint8_t)mode_desc.m_dp_channel;
5788
coded_log_blk.m_num_partitions = 1;
5789
coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)mode_desc.m_cem;
5790
coded_log_blk.m_weight_ise_range = (uint8_t)mode_desc.m_weight_ise_range;
5791
coded_log_blk.m_endpoint_ise_range = (uint8_t)mode_desc.m_endpoint_ise_range;
5792
5793
if ((cem == 11) && (!dual_plane) && ((grid_x < BLOCK_W) || (grid_y < BLOCK_H)))
5794
{
5795
double e = encode_astc_hdr_block_downsampled_mode_11(
5796
BLOCK_W, BLOCK_H, grid_x, grid_y,
5797
mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range,
5798
NUM_BLOCK_PIXELS, (basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,
5799
BIG_FLOAT_VAL,
5800
FIRST_MODE11_SUBMODE_INDEX, MAX_MODE11_SUBMODE_INDEX, false, mode11_opt_mode,
5801
coded_log_blk.m_endpoints, coded_log_blk.m_weights, best_submode,
5802
coptions,
5803
&enc_block_stats);
5804
5805
if (e == BIG_FLOAT_VAL)
5806
break;
5807
}
5808
else
5809
{
5810
if (cem == 7)
5811
{
5812
assert(!dual_plane);
5813
5814
double e = encode_astc_hdr_block_mode_7(
5815
NUM_BLOCK_PIXELS,
5816
(basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,
5817
mode_desc.m_weight_ise_range,
5818
best_submode,
5819
BIG_FLOAT_VAL,
5820
coded_log_blk.m_endpoints,
5821
blk_weights0,
5822
coptions,
5823
mode_desc.m_endpoint_ise_range,
5824
0, MAX_MODE7_SUBMODE_INDEX,
5825
&enc_block_stats);
5826
BASISU_NOTE_UNUSED(e);
5827
}
5828
else
5829
{
5830
double e;
5831
5832
if (dual_plane)
5833
{
5834
e = encode_astc_hdr_block_mode_11_dual_plane(
5835
NUM_BLOCK_PIXELS,
5836
(basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,
5837
mode_desc.m_dp_channel,
5838
mode_desc.m_weight_ise_range,
5839
best_submode,
5840
BIG_FLOAT_VAL,
5841
coded_log_blk.m_endpoints,
5842
blk_weights0, blk_weights1,
5843
coptions,
5844
false,
5845
mode_desc.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false);
5846
}
5847
else
5848
{
5849
e = encode_astc_hdr_block_mode_11(
5850
NUM_BLOCK_PIXELS,
5851
(basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,
5852
mode_desc.m_weight_ise_range,
5853
best_submode,
5854
BIG_FLOAT_VAL,
5855
coded_log_blk.m_endpoints,
5856
blk_weights0,
5857
coptions,
5858
false,
5859
mode_desc.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false,
5860
mode11_opt_mode,
5861
&enc_block_stats);
5862
}
5863
5864
if (e == BIG_FLOAT_VAL)
5865
break;
5866
}
5867
5868
if (dual_plane)
5869
{
5870
downsample_ise_weights_dual_plane(
5871
mode_desc.m_weight_ise_range, mode_desc.m_weight_ise_range,
5872
BLOCK_W, BLOCK_H,
5873
grid_x, grid_y,
5874
blk_weights0, blk_weights1,
5875
coded_log_blk.m_weights);
5876
}
5877
else
5878
{
5879
downsample_ise_weights(
5880
mode_desc.m_weight_ise_range, mode_desc.m_weight_ise_range,
5881
BLOCK_W, BLOCK_H,
5882
grid_x, grid_y,
5883
blk_weights0, coded_log_blk.m_weights);
5884
5885
if ((comp_level >= MIN_REFINE_LEVEL) && ((grid_x < BLOCK_W) || (grid_y < BLOCK_H)))
5886
{
5887
bool refine_status = refine_endpoints(cem,
5888
mode_desc.m_endpoint_ise_range, coded_log_blk.m_endpoints,
5889
6, 6, mode_desc.m_grid_x, mode_desc.m_grid_y,
5890
coded_log_blk.m_weights, mode_desc.m_weight_ise_range,
5891
BLOCK_W * BLOCK_H,
5892
(basist::half_float(*)[3])pBlock_pixels_half, (vec4F*)pBlock_pixels_q16,
5893
nullptr,
5894
coptions, mode11_opt_mode);
5895
BASISU_NOTE_UNUSED(refine_status);
5896
}
5897
}
5898
}
5899
5900
basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples_dp, coded_log_blk.m_weights, mode_desc.m_weight_ise_range, transcode_weights, mode_desc.m_transcode_weight_ise_range);
5901
5902
// Create the block the decoder would transcode into.
5903
astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
5904
decomp_blk.clear();
5905
5906
decomp_blk.m_color_endpoint_modes[0] = (uint8_t)mode_desc.m_cem;
5907
decomp_blk.m_dual_plane = mode_desc.m_dp;
5908
decomp_blk.m_color_component_selector = (uint8_t)mode_desc.m_dp_channel;
5909
decomp_blk.m_num_partitions = 1;
5910
decomp_blk.m_endpoint_ise_range = (uint8_t)mode_desc.m_transcode_endpoint_ise_range;
5911
decomp_blk.m_weight_ise_range = (uint8_t)mode_desc.m_transcode_weight_ise_range;
5912
5913
basist::astc_6x6_hdr::requantize_ise_endpoints(mode_desc.m_cem, mode_desc.m_endpoint_ise_range, coded_log_blk.m_endpoints, mode_desc.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints);
5914
5915
copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk);
5916
5917
if (!validate_log_blk(decomp_blk))
5918
{
5919
fmt_error_printf("pack_astc_block() failed\n");
5920
return false;
5921
}
5922
5923
status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
5924
if (!status)
5925
{
5926
fmt_error_printf("decode_astc_block() failed\n");
5927
return false;
5928
}
5929
5930
candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
5931
code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr);
5932
5933
candidate.m_encoding_type = encoding_type::cBlock;
5934
candidate.m_endpoint_mode = em;
5935
candidate.m_block_mode = bm;
5936
5937
candidates.emplace_back(std::move(candidate));
5938
}
5939
5940
break;
5941
}
5942
default:
5943
assert(0);
5944
fmt_debug_printf("Invalid endpoint mode\n");
5945
return false;
5946
5947
} // switch (em)
5948
5949
} // endpoint_mode_iter
5950
5951
} // block_mode_iter
5952
5953
} // is_solid_block
5954
5955
//------------------------------------------------
5956
5957
debug_state.m_total_candidates_considered.fetch_add(candidates.size_u32(), std::memory_order_relaxed);
5958
atomic_max(debug_state.m_max_candidates_considered, candidates.size_u32());
5959
5960
for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
5961
{
5962
auto& candidate = candidates[candidate_iter];
5963
5964
for (uint32_t y = 0; y < BLOCK_H; y++)
5965
for (uint32_t x = 0; x < BLOCK_W; x++)
5966
linear_rgb_to_itp(candidate.m_comp_pixels[y][x], candidate.m_comp_pixels_itp[y][x], global_cfg);
5967
}
5968
5969
// Find best overall candidate
5970
double best_t = BIG_FLOAT_VAL;
5971
int best_candidate_index = -1;
5972
5973
float best_d_ssim = BIG_FLOAT_VAL;
5974
5975
if (global_cfg.m_lambda == 0.0f)
5976
{
5977
for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
5978
{
5979
const auto& candidate = candidates[candidate_iter];
5980
5981
float candidate_d_ssim = 1.0f - compute_block_ssim_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0]);
5982
5983
if (candidate_d_ssim < best_d_ssim)
5984
best_d_ssim = candidate_d_ssim;
5985
5986
candidate_d_ssim *= SSIM_WEIGHT;
5987
5988
float candidate_mse = MSE_WEIGHT * compute_block_mse_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0], global_cfg.m_delta_itp_dark_adjustment);
5989
5990
candidate_mse += candidate_d_ssim;
5991
5992
float total_deblock_penalty = 0.0f;
5993
if (global_cfg.m_deblocking_flag)
5994
{
5995
total_deblock_penalty = calc_deblocking_penalty_itp(bx, by, width, height, pass_src_img_itp, candidate) * global_cfg.m_deblock_penalty_weight;
5996
}
5997
candidate_mse += total_deblock_penalty * SSIM_WEIGHT;
5998
5999
if ((candidate.m_encoding_type == encoding_type::cBlock) || (candidate.m_encoding_type == encoding_type::cReuse))
6000
{
6001
// Bias the encoder away from 2 level blocks on complex blocks
6002
// TODO: Perhaps only do this on large or non-interpolated grids
6003
if (complex_block)
6004
{
6005
if (candidate.m_coded_log_blk.m_weight_ise_range == astc_helpers::BISE_2_LEVELS)
6006
{
6007
candidate_mse *= TWO_LEVEL_PENALTY;
6008
}
6009
}
6010
6011
// Bias the encoder away from smaller weight grids if the block is very complex
6012
// TODO: Use the DCT to compute an approximation of the block energy/variance retained vs. lost by downsampling.
6013
if (complex_block)
6014
{
6015
if ((candidate.m_coded_log_blk.m_grid_width == 2) && (candidate.m_coded_log_blk.m_grid_height == 2))
6016
candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY;
6017
else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 3)
6018
candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY;
6019
else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 4)
6020
candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY;
6021
}
6022
}
6023
6024
float candidate_t = candidate_mse;
6025
6026
if (candidate_t < best_t)
6027
{
6028
best_t = candidate_t;
6029
best_candidate_index = candidate_iter;
6030
}
6031
6032
} // candidate_iter
6033
6034
if (global_cfg.m_gaussian1_fallback && (outer_pass == 0) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM))
6035
{
6036
debug_state.m_total_gaussian1_blocks.fetch_add(1, std::memory_order_relaxed);
6037
continue;
6038
}
6039
6040
const float block_y_contrast_ratio = block_hy / (block_ly + .00000125f);
6041
6042
if (global_cfg.m_gaussian2_fallback && (comp_level >= 1) && (outer_pass == 1) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM) &&
6043
(block_hy >= 18.0f) && (block_y_contrast_ratio > 150.0f) &&
6044
(block_avg_y >= 1.5f))
6045
{
6046
debug_state.m_total_gaussian2_blocks.fetch_add(1, std::memory_order_relaxed);
6047
continue;
6048
}
6049
}
6050
else
6051
{
6052
assert(enc_state.smooth_block_mse_scales.get_width() > 0);
6053
6054
// Compute block's perceptual weighting
6055
float perceptual_scale = 0.0f;
6056
for (uint32_t y = 0; y < BLOCK_H; y++)
6057
for (uint32_t x = 0; x < BLOCK_W; x++)
6058
perceptual_scale = basisu::maximumf(perceptual_scale, enc_state.smooth_block_mse_scales.at_clamped(bx * BLOCK_W + x, by * BLOCK_H + y));
6059
6060
// Very roughly normalize the computed distortion vs. bits.
6061
perceptual_scale *= 10.0f;
6062
6063
for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
6064
{
6065
auto& candidate = candidates[candidate_iter];
6066
6067
float d_ssim = 1.0f - compute_block_ssim_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0]);
6068
6069
if (d_ssim < best_d_ssim)
6070
best_d_ssim = (float)d_ssim;
6071
6072
d_ssim *= SSIM_WEIGHT;
6073
6074
float candidate_mse = MSE_WEIGHT * compute_block_mse_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0], global_cfg.m_delta_itp_dark_adjustment);
6075
6076
candidate_mse += d_ssim;
6077
6078
float total_deblock_penalty = 0.0f;
6079
if (global_cfg.m_deblocking_flag)
6080
{
6081
total_deblock_penalty = calc_deblocking_penalty_itp(bx, by, width, height, pass_src_img_itp, candidate) * global_cfg.m_deblock_penalty_weight;
6082
}
6083
candidate_mse += total_deblock_penalty * SSIM_WEIGHT;
6084
6085
if ((candidate.m_encoding_type == encoding_type::cBlock) || (candidate.m_encoding_type == encoding_type::cReuse))
6086
{
6087
// Bias the encoder away from 2 level blocks on complex blocks
6088
if (complex_block)
6089
{
6090
if (candidate.m_coded_log_blk.m_weight_ise_range == astc_helpers::BISE_2_LEVELS)
6091
{
6092
candidate_mse *= TWO_LEVEL_PENALTY;
6093
}
6094
}
6095
6096
// Bias the encoder away from smaller weight grids if the block is very complex
6097
if (complex_block)
6098
{
6099
if ((candidate.m_coded_log_blk.m_grid_width == 2) && (candidate.m_coded_log_blk.m_grid_height == 2))
6100
candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY;
6101
else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 3)
6102
candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY;
6103
else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 4)
6104
candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY;
6105
}
6106
}
6107
6108
float mode_penalty = 1.0f;
6109
if (candidate.m_encoding_type == encoding_type::cSolid)
6110
mode_penalty *= SOLID_PENALTY;
6111
else if (candidate.m_encoding_type == encoding_type::cReuse)
6112
mode_penalty *= REUSE_PENALTY;
6113
else if (candidate.m_encoding_type == encoding_type::cRun)
6114
mode_penalty *= (complex_block ? RUN_PENALTY * 2.0f : RUN_PENALTY);
6115
6116
float candidate_bits = (float)candidate.m_coder.get_total_bits();
6117
float candidate_d = candidate_mse * mode_penalty;
6118
6119
const float D_POWER = 2.0f;
6120
float candidate_t = perceptual_scale * powf(candidate_d, D_POWER) + candidate_bits * (global_cfg.m_lambda * 1000.0f);
6121
6122
candidate.m_t = candidate_t;
6123
candidate.m_d = candidate_d;
6124
candidate.m_bits = candidate_bits;
6125
6126
if (candidate_t < best_t)
6127
{
6128
best_t = candidate_t;
6129
best_candidate_index = candidate_iter;
6130
}
6131
6132
} // candidate_iter
6133
6134
if (global_cfg.m_gaussian1_fallback && (outer_pass == 0) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM))
6135
{
6136
debug_state.m_total_gaussian1_blocks.fetch_add(1, std::memory_order_relaxed);
6137
continue;
6138
}
6139
6140
const float block_y_contrast_ratio = block_hy / (block_ly + .00000125f);
6141
6142
if (global_cfg.m_gaussian2_fallback && (comp_level >= 1) && (outer_pass == 1) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM) &&
6143
(block_hy >= 18.0f) && (block_y_contrast_ratio > 150.0f) &&
6144
(block_avg_y >= 1.5f))
6145
{
6146
debug_state.m_total_gaussian2_blocks.fetch_add(1, std::memory_order_relaxed);
6147
continue;
6148
}
6149
6150
if (global_cfg.m_rdo_candidate_diversity_boost)
6151
{
6152
// candidate diversity boosting - consider candidates along/near the Pareto front
6153
const candidate_encoding& comp_candidate = candidates[best_candidate_index];
6154
6155
float best_d = BIG_FLOAT_VAL;
6156
6157
for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
6158
{
6159
const auto& candidate = candidates[candidate_iter];
6160
6161
if (candidate.m_bits <= comp_candidate.m_bits * global_cfg.m_rdo_candidate_diversity_boost_bit_window_weight)
6162
{
6163
if (candidate.m_d < best_d)
6164
{
6165
best_d = candidate.m_d;
6166
best_candidate_index = candidate_iter;
6167
}
6168
}
6169
}
6170
}
6171
6172
// candidate JND optimization - if there's a cheaper to code candidate that is nearly equivalent visually to the best candidate chose, choose that
6173
if (global_cfg.m_jnd_optimization)
6174
{
6175
const candidate_encoding& cur_comp_candidate = candidates[best_candidate_index];
6176
6177
float new_best_candidate_bits = BIG_FLOAT_VAL;
6178
int new_best_candidate_index = -1;
6179
6180
for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
6181
{
6182
if ((int)candidate_iter == best_candidate_index)
6183
continue;
6184
6185
const auto& candidate = candidates[candidate_iter];
6186
6187
if (candidate.m_bits >= cur_comp_candidate.m_bits)
6188
continue;
6189
6190
float max_delta_itp = 0.0f;
6191
for (uint32_t y = 0; y < BLOCK_H; y++)
6192
{
6193
for (uint32_t x = 0; x < BLOCK_W; x++)
6194
{
6195
float delta_itp = compute_pixel_delta_itp(cur_comp_candidate.m_comp_pixels_itp[y][x], candidate.m_comp_pixels_itp[y][x], block_pixels_as_itp[y][x], global_cfg.m_delta_itp_dark_adjustment);
6196
max_delta_itp = maximum(max_delta_itp, delta_itp);
6197
6198
if (max_delta_itp >= global_cfg.m_jnd_delta_itp_thresh)
6199
goto skip;
6200
}
6201
}
6202
6203
skip:
6204
if (max_delta_itp >= global_cfg.m_jnd_delta_itp_thresh)
6205
continue;
6206
6207
if (candidate.m_bits < new_best_candidate_bits)
6208
{
6209
new_best_candidate_bits = candidate.m_bits;
6210
new_best_candidate_index = candidate_iter;
6211
}
6212
}
6213
6214
if (new_best_candidate_index != -1)
6215
{
6216
best_candidate_index = new_best_candidate_index;
6217
debug_state.m_total_jnd_replacements.fetch_add(1, std::memory_order_relaxed);
6218
}
6219
}
6220
6221
} // if (lambda == 0.0f)
6222
6223
if (global_cfg.m_debug_images)
6224
{
6225
std::lock_guard<std::mutex> lck(debug_state.m_stat_vis_mutex);
6226
debug_state.m_stat_vis.fill_box(bx * 6, by * 6, 6, 6, vec4F(best_d_ssim, max_std_dev, lowpass_std_dev, 1.0f));
6227
}
6228
6229
if (best_candidate_index < 0)
6230
{
6231
assert(best_candidate_index >= 0);
6232
fmt_error_printf("No candidates!\n");
6233
return false;
6234
}
6235
6236
const auto& best_candidate = candidates[best_candidate_index];
6237
6238
assert(best_candidate.m_encoding_type != encoding_type::cInvalid);
6239
6240
if (best_candidate.m_encoding_type == encoding_type::cRun)
6241
{
6242
if (!prev_run_len)
6243
{
6244
if (prev_encoding.get_total_bits())
6245
{
6246
#if SYNC_MARKERS
6247
strip_coded_bits.put_bits(0xDEAD, 16);
6248
#endif
6249
6250
strip_coded_bits.append(prev_encoding);
6251
}
6252
6253
assert(best_candidate.m_coder.get_total_bits());
6254
6255
prev_encoding = best_candidate.m_coder;
6256
6257
prev_run_len = 1;
6258
}
6259
else
6260
{
6261
prev_run_len++;
6262
6263
const uint32_t prev_run_bits = prev_encoding.get_total_bits_u32();
6264
assert(prev_run_bits);
6265
BASISU_NOTE_UNUSED(prev_run_bits);
6266
6267
const uint32_t num_dummy_bits = best_candidate.m_coder.get_total_bits_u32();
6268
BASISU_NOTE_UNUSED(num_dummy_bits);
6269
6270
// Rewrite the previous encoding to extend the run length.
6271
prev_encoding.restart();
6272
prev_encoding.put_bits(RUN_CODE, RUN_CODE_LEN);
6273
prev_encoding.put_vlc(prev_run_len - 1, 5);
6274
6275
assert(prev_encoding.get_total_bits() == prev_run_bits + num_dummy_bits);
6276
}
6277
}
6278
else
6279
{
6280
if (prev_encoding.get_total_bits())
6281
{
6282
#if SYNC_MARKERS
6283
strip_coded_bits.put_bits(0xDEAD, 16);
6284
#endif
6285
6286
strip_coded_bits.append(prev_encoding);
6287
}
6288
6289
prev_encoding = best_candidate.m_coder;
6290
prev_run_len = 0;
6291
}
6292
6293
memcpy(prev_comp_pixels, best_candidate.m_comp_pixels, sizeof(vec3F) * BLOCK_W * BLOCK_H);
6294
6295
prev_candidate_encoding = best_candidate;
6296
6297
if (best_candidate.m_encoding_type != encoding_type::cRun)
6298
prev_non_run_candidate_encoding = best_candidate;
6299
6300
{
6301
std::lock_guard<std::mutex> lck(debug_state.m_stats_mutex);
6302
6303
debug_state.m_encoding_type_hist[(uint32_t)best_candidate.m_encoding_type]++;
6304
6305
if (best_candidate.m_encoding_type == encoding_type::cBlock)
6306
{
6307
debug_state.m_endpoint_mode_hist[(uint32_t)best_candidate.m_endpoint_mode]++;
6308
}
6309
6310
if ((best_candidate.m_encoding_type == encoding_type::cReuse) || (best_candidate.m_encoding_type == encoding_type::cBlock))
6311
{
6312
const uint32_t bm_index = (uint32_t)best_candidate.m_block_mode;
6313
assert(bm_index < (uint32_t)block_mode::cBMTotalModes);
6314
6315
debug_state.m_block_mode_hist[bm_index]++;
6316
debug_state.m_block_mode_total_bits[bm_index] += best_candidate.m_coder.get_total_bits();
6317
6318
for (uint32_t i = 0; i < 3; i++)
6319
{
6320
debug_state.m_block_mode_comp_stats[bm_index][i].push_back(half_comp_stats[i]);
6321
debug_state.m_block_mode_comparative_stats[bm_index][i].push_back(half_cross_chan_stats[i]);
6322
}
6323
}
6324
6325
if (best_candidate.m_encoding_type == encoding_type::cReuse)
6326
{
6327
debug_state.m_reuse_num_parts[best_candidate.m_coded_log_blk.m_num_partitions].fetch_add(1, std::memory_order_relaxed);
6328
6329
if (best_candidate.m_coded_log_blk.m_dual_plane)
6330
debug_state.m_reuse_total_dp.fetch_add(1, std::memory_order_relaxed);
6331
}
6332
}
6333
6334
enc_state.coded_blocks(bx, by) = prev_non_run_candidate_encoding;
6335
6336
// Update decoded image
6337
vec4F decoded_float_pixels[BLOCK_H][BLOCK_W];
6338
for (uint32_t y = 0; y < BLOCK_H; y++)
6339
for (uint32_t x = 0; x < BLOCK_W; x++)
6340
decoded_float_pixels[y][x] = best_candidate.m_comp_pixels[y][x];
6341
6342
enc_state.packed_img.set_block_clipped((vec4F*)decoded_float_pixels, bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H);
6343
6344
status = astc_helpers::pack_astc_block(enc_state.final_astc_blocks(bx, by), best_candidate.m_decomp_log_blk, nullptr, nullptr);
6345
if (!status)
6346
{
6347
fmt_error_printf("Failed packing block\n");
6348
return false;
6349
}
6350
6351
const uint32_t r = debug_state.m_total_blocks_compressed.fetch_add(1, std::memory_order_relaxed);
6352
if ((r & 2047) == 2047)
6353
{
6354
if (global_cfg.m_status_output)
6355
{
6356
basisu::fmt_printf("{} of {} total blocks compressed, {3.2}%\n", r, total_blocks, (r * 100.0f) / total_blocks);
6357
}
6358
}
6359
6360
if ((global_cfg.m_debug_images) &&
6361
((best_candidate.m_encoding_type != encoding_type::cRun) && (best_candidate.m_encoding_type != encoding_type::cSolid)))
6362
{
6363
std::lock_guard<std::mutex> lck(debug_state.m_vis_image_mutex);
6364
6365
if (best_candidate.m_decomp_log_blk.m_num_partitions == 2)
6366
{
6367
const int part2_unique_index = g_part2_seed_to_unique_index[best_candidate.m_decomp_log_blk.m_partition_id];
6368
assert((part2_unique_index >= 0) && (part2_unique_index < (int)NUM_UNIQUE_PARTITIONS2));
6369
6370
const partition_pattern_vec& pat = g_partitions2[part2_unique_index];
6371
6372
for (uint32_t y = 0; y < 6; y++)
6373
{
6374
for (uint32_t x = 0; x < 6; x++)
6375
{
6376
const uint32_t p = pat[x + y * 6];
6377
debug_state.m_part_vis.set_clipped(bx * 6 + x, by * 6 + y, color_rgba(p ? 100 : 0, 128, p ? 100 : 0, 255));
6378
} // x
6379
} // y
6380
}
6381
else if (best_candidate.m_decomp_log_blk.m_num_partitions == 3)
6382
{
6383
//part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(0, 0, 255, 255));
6384
6385
const int part3_unique_index = g_part3_seed_to_unique_index[best_candidate.m_decomp_log_blk.m_partition_id];
6386
assert((part3_unique_index >= 0) && (part3_unique_index < (int)NUM_UNIQUE_PARTITIONS3));
6387
6388
const partition_pattern_vec& pat = g_partitions3[part3_unique_index];
6389
6390
for (uint32_t y = 0; y < 6; y++)
6391
{
6392
for (uint32_t x = 0; x < 6; x++)
6393
{
6394
const uint32_t p = pat[x + y * 6];
6395
color_rgba c(0, 0, 150, 255);
6396
if (p == 1)
6397
c.set(100, 0, 150, 255);
6398
else if (p == 2)
6399
c.set(0, 100, 150, 255);
6400
debug_state.m_part_vis.set_clipped(bx * 6 + x, by * 6 + y, c);
6401
} // x
6402
} // y
6403
}
6404
else if (best_candidate.m_decomp_log_blk.m_dual_plane)
6405
{
6406
debug_state.m_part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(255, 0, 255, 255));
6407
}
6408
else
6409
{
6410
debug_state.m_part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(255, 0, 0, 255));
6411
}
6412
6413
color_rgba c;
6414
c.set((best_candidate.m_coded_log_blk.m_grid_width * best_candidate.m_coded_log_blk.m_grid_height * 255 + 18) / 36);
6415
debug_state.m_grid_vis.fill_box(bx * 6, by * 6, 6, 6, c);
6416
6417
c.set(0, 0, 0, 255);
6418
if (complex_block)
6419
c[0] = 255;
6420
6421
if (very_complex_block)
6422
c[1] = 255;
6423
6424
if (outer_pass == 2)
6425
c[2] = 255;
6426
else if (outer_pass == 1)
6427
c[2] = 128;
6428
6429
debug_state.m_mode_vis.fill_box(bx * 6, by * 6, 6, 6, c);
6430
6431
c.set(0, 255, 0, 255);
6432
if (best_candidate.m_coded_log_blk.m_color_endpoint_modes[0] == 7)
6433
c.set(255, 0, 0, 255);
6434
debug_state.m_mode_vis2.fill_box(bx * 6, by * 6, 6, 6, c);
6435
6436
switch (best_candidate.m_encoding_type)
6437
{
6438
case encoding_type::cRun:
6439
c.set(0, 0, 0, 255);
6440
break;
6441
case encoding_type::cSolid:
6442
c.set(128, 128, 128, 255); // dark grey
6443
break;
6444
case encoding_type::cReuse:
6445
c.set(255, 255, 0, 255); // yellow
6446
break;
6447
case encoding_type::cBlock:
6448
{
6449
switch (best_candidate.m_endpoint_mode)
6450
{
6451
case endpoint_mode::cRaw:
6452
c.set(255, 0, 0, 255); // red
6453
break;
6454
case endpoint_mode::cUseLeft:
6455
c.set(0, 0, 255, 255); // blue
6456
break;
6457
case endpoint_mode::cUseUpper:
6458
c.set(0, 0, 192, 255); // darker blue
6459
break;
6460
case endpoint_mode::cUseLeftDelta:
6461
c.set(0, 255, 0, 255); // green
6462
break;
6463
case endpoint_mode::cUseUpperDelta:
6464
c.set(0, 192, 0, 255); // darker green
6465
break;
6466
default:
6467
break;
6468
}
6469
6470
break;
6471
}
6472
default:
6473
break;
6474
}
6475
6476
if (filtered_x_err < filtered_y_err)
6477
c[3] = 0;
6478
else
6479
c[3] = 255;
6480
6481
debug_state.m_enc_vis.fill_box(bx * 6, by * 6, 6, 6, c);
6482
}
6483
6484
break;
6485
6486
} // outer_pass
6487
6488
} // bx
6489
6490
} // by
6491
6492
if (prev_encoding.get_total_bits())
6493
{
6494
#if SYNC_MARKERS
6495
strip_coded_bits.put_bits(0xDEAD, 16);
6496
#endif
6497
6498
strip_coded_bits.append(prev_encoding);
6499
}
6500
6501
return true;
6502
}
6503
6504
// Records whether global_init() has already run to completion. global_init() sets this
// to true after its table/partition setup, and compress_photo() refuses to run while it is false.
bool g_initialized = false;
6505
6506
void global_init()
6507
{
6508
if (g_initialized)
6509
return;
6510
6511
interval_timer tm;
6512
tm.start();
6513
6514
init_pq_tables();
6515
6516
init_partitions2_6x6();
6517
init_partitions3_6x6();
6518
6519
init_contrib_lists();
6520
6521
g_initialized = true;
6522
6523
//fmt_printf("astc_6x6_hdr::global_init() total time: {}\n", tm.get_elapsed_secs());
6524
}
6525
6526
bool compress_photo(const basisu::imagef &orig_src_img, const astc_hdr_6x6_global_config &orig_global_cfg, job_pool *pJob_pool,
6527
basisu::uint8_vec& intermediate_tex_data, basisu::uint8_vec& astc_tex_data, result_metrics& metrics)
6528
{
6529
assert(g_initialized);
6530
if (!g_initialized)
6531
return false;
6532
6533
assert(pJob_pool);
6534
6535
if (orig_global_cfg.m_debug_output)
6536
{
6537
fmt_debug_printf("------ astc_6x6_hdr::compress_photo:\n");
6538
fmt_debug_printf("Source image dimensions: {}x{}\n", orig_src_img.get_width(), orig_src_img.get_height());
6539
fmt_debug_printf("Job pool total threads: {}\n", (uint64_t)pJob_pool->get_total_threads());
6540
orig_global_cfg.print();
6541
}
6542
6543
if (!orig_src_img.get_width() || !orig_src_img.get_height())
6544
{
6545
assert(false);
6546
fmt_error_printf("compress_photo: Invalid source image\n");
6547
return false;
6548
}
6549
6550
astc_hdr_6x6_global_config global_cfg(orig_global_cfg);
6551
6552
uastc_hdr_6x6_encode_state enc_state;
6553
enc_state.master_coptions.m_q_log_bias = Q_LOG_BIAS_6x6;
6554
enc_state.src_img = orig_src_img;
6555
6556
//src_img.crop(256, 256);
6557
6558
const uint32_t width = enc_state.src_img.get_width();
6559
const uint32_t height = enc_state.src_img.get_height();
6560
const uint32_t num_blocks_x = enc_state.src_img.get_block_width(BLOCK_W);
6561
const uint32_t num_blocks_y = enc_state.src_img.get_block_height(BLOCK_H);
6562
const uint32_t total_blocks = num_blocks_x * num_blocks_y;
6563
6564
for (uint32_t y = 0; y < height; y++)
6565
{
6566
for (uint32_t x = 0; x < width; x++)
6567
{
6568
for (uint32_t c = 0; c < 3; c++)
6569
{
6570
float f = enc_state.src_img(x, y)[c];
6571
6572
if (std::isinf(f) || std::isnan(f) || (f < 0.0f))
6573
f = 0;
6574
else if (f > basist::ASTC_HDR_MAX_VAL)
6575
f = basist::ASTC_HDR_MAX_VAL;
6576
6577
enc_state.src_img(x, y)[c] = f;
6578
6579
} // c
6580
6581
} // x
6582
} // y
6583
6584
if (global_cfg.m_debug_images)
6585
{
6586
write_exr((global_cfg.m_debug_image_prefix + "orig.exr").c_str(), enc_state.src_img, 3, 0);
6587
}
6588
6589
image src_img_compressed;
6590
tonemap_image_compressive2(src_img_compressed, enc_state.src_img);
6591
6592
if (global_cfg.m_debug_images)
6593
{
6594
save_png(global_cfg.m_debug_image_prefix + "compressive_tone_map.png", src_img_compressed);
6595
}
6596
6597
smooth_map_params rp;
6598
rp.m_debug_images = global_cfg.m_debug_images;
6599
6600
if (global_cfg.m_lambda != 0.0f)
6601
{
6602
if (global_cfg.m_status_output)
6603
fmt_printf("Creating RDO perceptual weighting maps\n");
6604
6605
create_smooth_maps2(enc_state.smooth_block_mse_scales, src_img_compressed, rp);
6606
}
6607
6608
if (global_cfg.m_status_output)
6609
fmt_printf("Blurring image\n");
6610
6611
enc_state.src_img_filtered1.resize(width, height);
6612
image_resample(enc_state.src_img, enc_state.src_img_filtered1, "gaussian", global_cfg.m_gaussian1_strength); //1.45f);
6613
6614
enc_state.src_img_filtered2.resize(width, height);
6615
image_resample(enc_state.src_img, enc_state.src_img_filtered2, "gaussian", global_cfg.m_gaussian2_strength); //1.83f);
6616
6617
if (global_cfg.m_debug_images)
6618
{
6619
write_exr((global_cfg.m_debug_image_prefix + "blurred1.exr").c_str(), enc_state.src_img_filtered1, 3, 0);
6620
write_exr((global_cfg.m_debug_image_prefix + "blurred2.exr").c_str(), enc_state.src_img_filtered2, 3, 0);
6621
}
6622
6623
if (global_cfg.m_status_output)
6624
fmt_printf("Transforming to ITP\n");
6625
6626
enc_state.src_img_itp.resize(width, height);
6627
convet_rgb_image_to_itp(enc_state.src_img, enc_state.src_img_itp, global_cfg);
6628
6629
enc_state.src_img_filtered1_itp.resize(width, height);
6630
convet_rgb_image_to_itp(enc_state.src_img_filtered1, enc_state.src_img_filtered1_itp, global_cfg);
6631
6632
enc_state.src_img_filtered2_itp.resize(width, height);
6633
convet_rgb_image_to_itp(enc_state.src_img_filtered2, enc_state.src_img_filtered2_itp, global_cfg);
6634
6635
if (global_cfg.m_lambda == 0.0f)
6636
global_cfg.m_favor_higher_compression = false;
6637
6638
uint32_t total_strips = 0, rows_per_strip = 0;
6639
if (!calc_strip_size(global_cfg.m_lambda, num_blocks_y, (uint32_t)pJob_pool->get_total_threads(), global_cfg.m_force_one_strip, total_strips, rows_per_strip, global_cfg))
6640
{
6641
fmt_error_printf("compress_photo: Failed computing strip sizes\n");
6642
return false;
6643
}
6644
6645
if (global_cfg.m_debug_output)
6646
fmt_printf("lambda: {}, comp_level: {}, highest_comp_level: {}, extra patterns: {}\n", global_cfg.m_lambda, global_cfg.m_master_comp_level, global_cfg.m_highest_comp_level, global_cfg.m_extra_patterns_flag);
6647
6648
enc_state.coded_blocks.resize(num_blocks_x, num_blocks_y);
6649
6650
bitwise_coder coded_bits;
6651
6652
coded_bits.put_bits(0xABCD, 16);
6653
coded_bits.put_bits(width, 16);
6654
coded_bits.put_bits(height, 16);
6655
6656
enc_state.packed_img.resize(width, height);
6657
6658
enc_state.strip_bits.resize(total_strips);
6659
6660
enc_state.final_astc_blocks.resize(num_blocks_x, num_blocks_y);
6661
6662
uastc_hdr_6x6_debug_state debug_state;
6663
6664
if (global_cfg.m_debug_images)
6665
debug_state.init(width, height);
6666
else
6667
debug_state.init(0, 0);
6668
6669
interval_timer tm;
6670
tm.start();
6671
6672
std::atomic_bool any_failed_flag;
6673
any_failed_flag.store(false);
6674
6675
for (uint32_t strip_index = 0; strip_index < total_strips; strip_index++)
6676
{
6677
const uint32_t strip_first_by = strip_index * rows_per_strip;
6678
6679
uint32_t strip_last_by = minimum<uint32_t>(strip_first_by + rows_per_strip - 1, num_blocks_y);
6680
if (strip_index == (total_strips - 1))
6681
strip_last_by = num_blocks_y - 1;
6682
6683
pJob_pool->add_job([&any_failed_flag, &global_cfg, &debug_state, &enc_state,
6684
strip_index, total_strips, strip_first_by, strip_last_by,
6685
num_blocks_x, num_blocks_y, total_blocks, width, height]
6686
{
6687
if (!any_failed_flag)
6688
{
6689
bool status = compress_strip_task(
6690
strip_index, total_strips, strip_first_by, strip_last_by,
6691
num_blocks_x, num_blocks_y, total_blocks, width, height,
6692
global_cfg, debug_state, enc_state);
6693
6694
if (!status)
6695
{
6696
fmt_error_printf("compress_photo: compress_strip_task() failed\n");
6697
any_failed_flag.store(true, std::memory_order_relaxed);
6698
}
6699
}
6700
} );
6701
6702
if (any_failed_flag)
6703
break;
6704
6705
} // strip_index
6706
6707
pJob_pool->wait_for_all();
6708
6709
if (any_failed_flag)
6710
{
6711
fmt_error_printf("One or more strips failed during compression\n");
6712
return false;
6713
}
6714
6715
if (global_cfg.m_debug_output)
6716
fmt_printf("Encoding time: {} secs\n", tm.get_elapsed_secs());
6717
6718
if (global_cfg.m_debug_output)
6719
debug_state.print(total_blocks);
6720
6721
if (global_cfg.m_debug_images)
6722
{
6723
save_png(global_cfg.m_debug_image_prefix + "part_vis.png", debug_state.m_part_vis);
6724
save_png(global_cfg.m_debug_image_prefix + "grid_vis.png", debug_state.m_grid_vis);
6725
save_png(global_cfg.m_debug_image_prefix + "mode_vis.png", debug_state.m_mode_vis);
6726
save_png(global_cfg.m_debug_image_prefix + "mode_vis2.png", debug_state.m_mode_vis2);
6727
save_png(global_cfg.m_debug_image_prefix + "enc_vis.png", debug_state.m_enc_vis);
6728
write_exr((global_cfg.m_debug_image_prefix + "stat_vis.exr").c_str(), debug_state.m_stat_vis, 3, 0);
6729
}
6730
6731
for (uint32_t i = 0; i < total_strips; i++)
6732
coded_bits.append(enc_state.strip_bits[i]);
6733
6734
coded_bits.put_bits(0xA742, 16);
6735
6736
coded_bits.flush();
6737
6738
if (global_cfg.m_output_images)
6739
{
6740
write_exr((global_cfg.m_output_image_prefix + "comp.exr").c_str(), enc_state.packed_img, 3, 0);
6741
}
6742
6743
if (global_cfg.m_debug_output)
6744
fmt_printf("\nTotal intermediate output bits/pixel: {3.4}\n", (float)coded_bits.get_total_bits() / (float)(width * height));
6745
6746
vector2D<astc_helpers::astc_block> decoded_blocks1;
6747
vector2D<astc_helpers::astc_block> decoded_blocks2;
6748
6749
if (global_cfg.m_debug_output)
6750
fmt_printf("decode_file\n");
6751
6752
uint32_t unpacked_width = 0, unpacked_height = 0;
6753
bool status = decode_file(coded_bits.get_bytes(), decoded_blocks1, unpacked_width, unpacked_height);
6754
if (!status)
6755
{
6756
fmt_error_printf("decode_file() failed\n");
6757
return false;
6758
}
6759
6760
if (global_cfg.m_debug_output)
6761
fmt_printf("decode_6x6_hdr\n");
6762
6763
status = decode_6x6_hdr(coded_bits.get_bytes().get_ptr(), coded_bits.get_bytes().size_in_bytes_u32(), decoded_blocks2, unpacked_width, unpacked_height);
6764
if (!status)
6765
{
6766
fmt_error_printf("decode_6x6_hdr_file() failed\n");
6767
return false;
6768
}
6769
6770
if ((enc_state.final_astc_blocks.get_width() != decoded_blocks1.get_width()) ||
6771
(enc_state.final_astc_blocks.get_height() != decoded_blocks1.get_height()))
6772
{
6773
fmt_error_printf("Decode size mismatch with decode_file\n");
6774
return false;
6775
}
6776
6777
if ((enc_state.final_astc_blocks.get_width() != decoded_blocks2.get_width()) ||
6778
(enc_state.final_astc_blocks.get_height() != decoded_blocks2.get_height()))
6779
{
6780
fmt_error_printf("Decode size mismatch with decode_6x6_hdr_file\n");
6781
return false;
6782
}
6783
6784
if (memcmp(decoded_blocks1.get_ptr(), enc_state.final_astc_blocks.get_ptr(), decoded_blocks1.size_in_bytes()) != 0)
6785
{
6786
fmt_error_printf("Decoded ASTC blocks verification failed\n");
6787
return false;
6788
}
6789
6790
if (memcmp(decoded_blocks2.get_ptr(), enc_state.final_astc_blocks.get_ptr(), decoded_blocks2.size_in_bytes()) != 0)
6791
{
6792
fmt_error_printf("Decoded ASTC blocks verification failed\n");
6793
return false;
6794
}
6795
6796
if (global_cfg.m_debug_output)
6797
basisu::fmt_printf("Decoded ASTC verification checks succeeded\n");
6798
6799
if (global_cfg.m_output_images)
6800
{
6801
if (write_astc_file((global_cfg.m_output_image_prefix + "decoded.astc").c_str(), decoded_blocks1.get_ptr(), BLOCK_W, BLOCK_H, width, height))
6802
{
6803
basisu::platform_sleep(20);
6804
6805
uint8_vec astc_file_data;
6806
if (read_file_to_vec((global_cfg.m_output_image_prefix + "decoded.astc").c_str(), astc_file_data))
6807
{
6808
if (astc_file_data.size() > 16)
6809
{
6810
astc_file_data.erase(0, 16);
6811
6812
size_t comp_size = 0;
6813
void* pComp_data = tdefl_compress_mem_to_heap(&astc_file_data[0], astc_file_data.size(), &comp_size, TDEFL_MAX_PROBES_MASK);
6814
mz_free(pComp_data);
6815
6816
if (global_cfg.m_debug_output)
6817
{
6818
fmt_printf(".ASTC file size (less header): {}, bits/pixel: {}, Deflate bits/pixel: {}\n",
6819
(uint64_t)astc_file_data.size(),
6820
(float)astc_file_data.size() * 8.0f / (float)(width * height),
6821
(float)comp_size * 8.0f / (float)(width * height));
6822
}
6823
}
6824
}
6825
}
6826
}
6827
6828
// Must decode all the blocks (even padded rows/cols) to match what the transcoder does.
6829
imagef unpacked_astc_img(num_blocks_x * 6, num_blocks_y * 6);
6830
imagef unpacked_astc_google_img(num_blocks_x * 6, num_blocks_y * 6);
6831
6832
for (uint32_t y = 0; y < decoded_blocks1.get_height(); y++)
6833
{
6834
for (uint32_t x = 0; x < decoded_blocks1.get_width(); x++)
6835
{
6836
const auto& phys_blk = decoded_blocks1(x, y);
6837
6838
vec4F pixels[MAX_BLOCK_W * MAX_BLOCK_H];
6839
status = unpack_physical_astc_block(&phys_blk, BLOCK_W, BLOCK_H, pixels);
6840
if (!status)
6841
{
6842
fmt_error_printf("unpack_physical_astc_block() failed\n");
6843
return false;
6844
}
6845
6846
unpacked_astc_img.set_block_clipped(pixels, x * BLOCK_W, y * BLOCK_H, BLOCK_W, BLOCK_H);
6847
6848
vec4F pixels_google[MAX_BLOCK_W * MAX_BLOCK_H];
6849
status = unpack_physical_astc_block_google(&phys_blk, BLOCK_W, BLOCK_H, pixels_google);
6850
if (!status)
6851
{
6852
fmt_error_printf("unpack_physical_astc_block_google() failed\n");
6853
return false;
6854
}
6855
6856
unpacked_astc_google_img.set_block_clipped(pixels_google, x * BLOCK_W, y * BLOCK_H, BLOCK_W, BLOCK_H);
6857
6858
for (uint32_t i = 0; i < 36; i++)
6859
{
6860
if (pixels[i] != pixels_google[i])
6861
{
6862
fmt_error_printf("pixel unpack mismatch\n");
6863
return false;
6864
}
6865
}
6866
}
6867
}
6868
6869
if (global_cfg.m_debug_output)
6870
fmt_printf("\nUnpack succeeded\n");
6871
6872
imagef unpacked_bc6h_img;
6873
6874
{
6875
vector2D<basist::bc6h_block> bc6h_blocks;
6876
6877
fast_bc6h_params enc_params;
6878
6879
bool pack_status = pack_bc6h_image(unpacked_astc_img, bc6h_blocks, &unpacked_bc6h_img, enc_params);
6880
if (!pack_status)
6881
{
6882
fmt_error_printf("pack_bc6h_image() failed!");
6883
return false;
6884
}
6885
6886
unpacked_bc6h_img.crop(width, height);
6887
6888
if (global_cfg.m_output_images)
6889
{
6890
write_exr((global_cfg.m_output_image_prefix + "unpacked_bc6h.exr").c_str(), unpacked_bc6h_img, 3, 0);
6891
}
6892
}
6893
6894
unpacked_astc_img.crop(width, height);
6895
unpacked_astc_google_img.crop(width, height);
6896
6897
if (global_cfg.m_output_images)
6898
{
6899
write_exr((global_cfg.m_output_image_prefix + "unpacked_astc.exr").c_str(), unpacked_astc_img, 3, 0);
6900
write_exr((global_cfg.m_output_image_prefix + "unpacked_google_astc.exr").c_str(), unpacked_astc_google_img, 3, 0);
6901
}
6902
6903
// ASTC metrics
6904
if (global_cfg.m_image_stats)
6905
{
6906
image_metrics im;
6907
6908
if (global_cfg.m_debug_output)
6909
printf("\nASTC log2 float error metrics:\n");
6910
6911
for (uint32_t i = 0; i < 3; i++)
6912
{
6913
im.calc(enc_state.src_img, unpacked_astc_img, i, 1, true, true);
6914
6915
if (global_cfg.m_debug_output)
6916
{
6917
printf("%c: ", "RGBA"[i]);
6918
im.print_hp();
6919
}
6920
}
6921
6922
metrics.m_im_astc_log2.calc(enc_state.src_img, unpacked_astc_img, 0, 3, true, true);
6923
6924
if (global_cfg.m_debug_output)
6925
{
6926
printf("RGB: ");
6927
metrics.m_im_astc_log2.print_hp();
6928
6929
printf("\n");
6930
}
6931
}
6932
6933
if (global_cfg.m_image_stats)
6934
{
6935
image_metrics im;
6936
6937
if (global_cfg.m_debug_output)
6938
printf("ASTC half float space error metrics (a piecewise linear approximation of log2 error):\n");
6939
6940
for (uint32_t i = 0; i < 3; i++)
6941
{
6942
im.calc_half(enc_state.src_img, unpacked_astc_img, i, 1, true);
6943
6944
if (global_cfg.m_debug_output)
6945
{
6946
printf("%c: ", "RGBA"[i]);
6947
im.print_hp();
6948
}
6949
}
6950
6951
metrics.m_im_astc_half.calc_half(enc_state.src_img, unpacked_astc_img, 0, 3, true);
6952
6953
if (global_cfg.m_debug_output)
6954
{
6955
printf("RGB: ");
6956
metrics.m_im_astc_half.print_hp();
6957
}
6958
}
6959
6960
// BC6H metrics
6961
if (global_cfg.m_image_stats)
6962
{
6963
image_metrics im;
6964
6965
if (global_cfg.m_debug_output)
6966
printf("\nBC6H log2 float error metrics:\n");
6967
6968
for (uint32_t i = 0; i < 3; i++)
6969
{
6970
im.calc(enc_state.src_img, unpacked_bc6h_img, i, 1, true, true);
6971
6972
if (global_cfg.m_debug_output)
6973
{
6974
printf("%c: ", "RGBA"[i]);
6975
im.print_hp();
6976
}
6977
}
6978
6979
metrics.m_im_bc6h_log2.calc(enc_state.src_img, unpacked_bc6h_img, 0, 3, true, true);
6980
6981
if (global_cfg.m_debug_output)
6982
{
6983
printf("RGB: ");
6984
metrics.m_im_bc6h_log2.print_hp();
6985
6986
printf("\n");
6987
}
6988
}
6989
6990
if (global_cfg.m_image_stats)
6991
{
6992
image_metrics im;
6993
6994
if (global_cfg.m_debug_output)
6995
printf("BC6H half float space error metrics (a piecewise linear approximation of log2 error):\n");
6996
6997
for (uint32_t i = 0; i < 3; i++)
6998
{
6999
im.calc_half(enc_state.src_img, unpacked_bc6h_img, i, 1, true);
7000
7001
if (global_cfg.m_debug_output)
7002
{
7003
printf("%c: ", "RGBA"[i]);
7004
im.print_hp();
7005
}
7006
}
7007
7008
metrics.m_im_bc6h_half.calc_half(enc_state.src_img, unpacked_bc6h_img, 0, 3, true);
7009
7010
if (global_cfg.m_debug_output)
7011
{
7012
printf("RGB: ");
7013
metrics.m_im_bc6h_half.print_hp();
7014
7015
printf("\n");
7016
}
7017
}
7018
7019
intermediate_tex_data.swap(coded_bits.get_bytes());
7020
7021
astc_tex_data.resize(decoded_blocks1.size_in_bytes());
7022
memcpy(astc_tex_data.data(), decoded_blocks1.get_ptr(), decoded_blocks1.size_in_bytes());
7023
7024
return true;
7025
}
7026
7027
} // namespace astc_6x6_hdr
7028
7029