Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.cpp
9904 views
1
// File: basisu_astc_hdr_6x6_enc.cpp
2
#include "basisu_astc_hdr_6x6_enc.h"
3
#include "basisu_enc.h"
4
#include "basisu_astc_hdr_common.h"
5
#include "basisu_math.h"
6
#include "basisu_resampler.h"
7
#include "basisu_resampler_filters.h"
8
9
#define MINIZ_HEADER_FILE_ONLY
10
#define MINIZ_NO_ZLIB_COMPATIBLE_NAMES
11
#include "basisu_miniz.h"
12
13
#include "3rdparty/android_astc_decomp.h"
14
15
#include <array>
16
17
using namespace basisu;
18
using namespace buminiz;
19
using namespace basist::astc_6x6_hdr;
20
21
namespace astc_6x6_hdr
22
{
23
24
// Atomically raises atomic_var to new_value when new_value is larger; otherwise leaves it untouched.
// All operations use relaxed ordering, so this only guarantees the stored value itself
// (no ordering of surrounding memory accesses).
static void atomic_max(std::atomic<uint32_t>& atomic_var, uint32_t new_value)
{
	uint32_t current = atomic_var.load(std::memory_order_relaxed);

	// Only attempt the exchange while the stored value is actually smaller: this avoids issuing a
	// store when nothing would change. On failure compare_exchange_weak refreshes
	// 'current' with the latest stored value, so the comparison is re-evaluated each iteration.
	while ((current < new_value) &&
		!atomic_var.compare_exchange_weak(current, new_value, std::memory_order_relaxed, std::memory_order_relaxed))
	{
	}
}
34
35
void astc_hdr_6x6_global_config::set_user_level(int level)
36
{
37
level = basisu::clamp<int>(level, 0, ASTC_HDR_6X6_MAX_USER_COMP_LEVEL);
38
39
m_master_comp_level = 0;
40
m_highest_comp_level = 0;
41
m_num_reuse_xy_deltas = NUM_REUSE_XY_DELTAS;
42
m_extra_patterns_flag = false;
43
m_brute_force_partition_matching = false;
44
45
switch (level)
46
{
47
case 0:
48
{
49
// Both reduce compression a lot when lambda>0
50
m_favor_higher_compression = false;
51
m_num_reuse_xy_deltas = NUM_REUSE_XY_DELTAS / 2;
52
break;
53
}
54
case 1:
55
{
56
m_master_comp_level = 0;
57
m_highest_comp_level = 0;
58
break;
59
}
60
case 2:
61
{
62
m_master_comp_level = 0;
63
m_highest_comp_level = 1;
64
break;
65
}
66
case 3:
67
{
68
m_master_comp_level = 1;
69
m_highest_comp_level = 1;
70
break;
71
}
72
case 4:
73
{
74
m_master_comp_level = 1;
75
m_highest_comp_level = 2;
76
break;
77
}
78
case 5:
79
{
80
m_master_comp_level = 1;
81
m_highest_comp_level = 3;
82
break;
83
}
84
case 6:
85
{
86
m_master_comp_level = 1;
87
m_highest_comp_level = 4;
88
break;
89
}
90
case 7:
91
{
92
m_master_comp_level = 2;
93
m_highest_comp_level = 2;
94
break;
95
}
96
case 8:
97
{
98
m_master_comp_level = 2;
99
m_highest_comp_level = 3;
100
break;
101
}
102
case 9:
103
{
104
m_master_comp_level = 2;
105
m_highest_comp_level = 4;
106
break;
107
}
108
case 10:
109
{
110
m_master_comp_level = 3;
111
m_highest_comp_level = 3;
112
break;
113
}
114
case 11:
115
{
116
m_master_comp_level = 3;
117
m_highest_comp_level = 4;
118
break;
119
}
120
case 12:
121
default:
122
{
123
m_master_comp_level = 4;
124
m_highest_comp_level = 4;
125
m_extra_patterns_flag = true;
126
m_brute_force_partition_matching = true;
127
break;
128
}
129
}
130
}
131
132
// Constants of the PQ transfer function evaluated by forwardPQ() (and the disabled inversePQ()).
// The fractions in the trailing comments reproduce the literal values exactly.
constexpr float m1 = 0.1593017578125f; // 2610 / 2^14
constexpr float m2 = 78.84375f;        // 2523 / 32
constexpr float c1 = 0.8359375f;       // 3424 / 2^12
constexpr float c2 = 18.8515625f;      // 2413 / 128
constexpr float c3 = 18.6875f;         // 2392 / 128
137
138
static float forwardPQ(float Y)
139
{
140
// 10,000 here is an absolute scale - it's in nits (cd per square meter)
141
float L = Y * (1.0f / 10000.0f);
142
143
float num = powf(L, m1);
144
float N = powf((c1 + c2 * num) / (1 + c3 * num), m2);
145
146
return N;
147
}
148
149
#if 0
// Disabled: inverse of forwardPQ(), maps a PQ encoded value E back to absolute nits.
static float inversePQ(float E)
{
	const float root = powf(E, 1.0f / m2);

	// The numerator is clamped at 0 so the following powf() never sees a negative base from it.
	const float ratio = basisu::maximum<float>((root - c1), 0.0f) / (c2 - c3 * root);

	return powf(ratio, 1.0f / m1) * 10000.0f;
}
#endif
160
161
// PQ function approximation: convert input to bfloat16, look up in tables, bilinear interpolation between table entries.
162
// max_er: 0.000023007392883, max_rel_er: 0.000023472490284, avg_er: 0.000004330495689, 6-7x faster on x86
163
// Highest error is for values less than SMALLEST_PQ_VAL_IN.
164
//
165
// Approximation is round trip lossless for 10-12 bits at [0,10000] nits:
166
// for x [0,1024] (SCALE=1023) or for x [0,4095] (SCALE=4096):
167
// round(forwardPQTab(inversePQ(x / SCALE)) * SCALE) == x
168
//
169
// bfloat16 has enough precision to handle 8-bit sRGB to linear conversions:
170
// round(linear_to_srgb(bfloat16_to_float(float_to_bfloat16(srgb_to_linear(isRGB/255.0f))))*255.0) is lossless
171
172
// Range of bfloat16 exponents covered by the PQ lookup tables (inclusive on both ends).
constexpr int PQ_APPROX_MIN_EXP = -16;
constexpr int PQ_APPROX_MAX_EXP = 16;
constexpr int PQ_APPROX_EXP_RANGE = (PQ_APPROX_MAX_EXP - PQ_APPROX_MIN_EXP + 1);

// Inputs whose exponent is below PQ_APPROX_MIN_EXP are linearly interpolated between 0 and SMALLEST_PQ_VAL.
constexpr float SMALLEST_PQ_VAL_IN = 0.000015258829080f;
constexpr float SMALLEST_PQ_VAL = 0.000551903737f; // forwardPQ(SMALLEST_PQ_VAL_IN)

// Value returned for inputs whose exponent is above PQ_APPROX_MAX_EXP.
constexpr float LARGEST_PQ_VAL = 1.251312f;

// One row per exponent, one column per 7-bit mantissa value; filled in by init_pq_tables().
float g_pq_approx_tabs[PQ_APPROX_EXP_RANGE][128];
181
182
static void init_pq_tables()
183
{
184
for (int exp = PQ_APPROX_MIN_EXP; exp <= PQ_APPROX_MAX_EXP; exp++)
185
{
186
for (int mant = 0; mant < 128; mant++)
187
{
188
bfloat16 b = bfloat16_init(1, exp, mant);
189
float bf = bfloat16_to_float(b);
190
191
float pq = forwardPQ(bf);
192
193
g_pq_approx_tabs[exp - PQ_APPROX_MIN_EXP][mant] = pq;
194
}
195
}
196
197
//fmt_printf("{.15} {.15}\n", g_pq_approx_tabs[0][0], inversePQ(g_pq_approx_tabs[0][0]));
198
//fmt_printf("{.15}\n", forwardPQ(SMALLEST_PQ_VAL_IN));
199
}
200
201
static inline float forwardPQTab(float v)
202
{
203
assert(g_pq_approx_tabs[0][0]);
204
205
assert(v >= 0.0f);
206
if (v == 0.0f)
207
return 0.0f;
208
209
bfloat16 bf = float_to_bfloat16(v, false);
210
assert(v >= bfloat16_to_float(bf));
211
212
int exp = bfloat16_get_exp(bf);
213
214
if (exp < PQ_APPROX_MIN_EXP)
215
{
216
// not accurate but should be good enough for our uses
217
return lerp(0.0f, SMALLEST_PQ_VAL, minimum(1.0f, v / SMALLEST_PQ_VAL_IN));
218
}
219
else if (exp > PQ_APPROX_MAX_EXP)
220
return LARGEST_PQ_VAL;
221
222
int mant = bfloat16_get_mantissa(bf);
223
224
float a = g_pq_approx_tabs[exp - PQ_APPROX_MIN_EXP][mant];
225
float bf_f32 = bfloat16_to_float(bf);
226
227
int next_mant = mant + 1;
228
int next_exp = exp;
229
if (next_mant == 128)
230
{
231
next_mant = 0;
232
next_exp++;
233
if (next_exp > PQ_APPROX_MAX_EXP)
234
return a;
235
}
236
237
float b = g_pq_approx_tabs[next_exp - PQ_APPROX_MIN_EXP][next_mant];
238
239
bfloat16 next_bf = bfloat16_init(1, next_exp, next_mant);
240
float next_bf_f32 = bfloat16_to_float(next_bf);
241
assert(v <= next_bf_f32);
242
243
float lerp_factor = (v - bf_f32) / (next_bf_f32 - bf_f32);
244
assert((lerp_factor >= 0) && (lerp_factor <= 1.0f));
245
246
return lerp(a, b, lerp_factor);
247
}
248
249
// 100 nits = ~.5 i
250
// This converts absolute linear RGB light in either REC 709 or REC2020/BT2100 color gamut to ICtCp, a coding space where Ct is scaled by 2.
251
// To convert to perceptual ITP for error/distance calculations, multiply the result Ct by .5 (or set itp_flag to true).
252
// Assumes REC 709 input, or REC 2020/BT.2100 RGB input if rec2020_bt2100_color_gamut is true.
253
//
254
// ITP info:
255
// https://www.portrait.com/resource-center/ictcp-color-difference-metric/
256
// https://professional.dolby.com/siteassets/pdfs/measuringperceptualcolorvolume_v07.253.pdf (see scale to JND's)
257
// This also converts from an ICtCp coding space to threshold or perceptually uniform space ITP.
258
//
259
// Linear REC709 to REC2020/BT.2100 gamut conversion:
260
// rgb_2100[0] = rgb_in[0] * 0.6274f + rgb_in[1] * 0.3293f + rgb_in[2] * 0.0433f;
261
// rgb_2100[1] = rgb_in[0] * 0.0691f + rgb_in[1] * 0.9195f + rgb_in[2] * 0.0114f;
262
// rgb_2100[2] = rgb_in[0] * 0.0164f + rgb_in[1] * 0.0880f + rgb_in[2] * 0.8956f;
263
// const float S = 1.0f / 4096.0f;
264
// l = (1688.0f * S) * rgb_2100[0] + (2146.0f * S) * rgb_2100[1] + (262.0f * S) * rgb_2100[2];
265
// m = (683.0f * S) * rgb_2100[0] + (2951.0f * S) * rgb_2100[1] + (462.0f * S) * rgb_2100[2];
266
// s = (99.0f * S) * rgb_2100[0] + (309.0f * S) * rgb_2100[1] + (3688.0f * S) * rgb_2100[2];
267
static void linear_rgb_to_ictcp(const vec3F& rgb_in, vec3F& ictcp, bool itp_flag = false, bool rec2020_bt2100_color_gamut = false)
268
{
269
vec3F rgb_2100(rgb_in);
270
271
float l, m, s;
272
if (!rec2020_bt2100_color_gamut)
273
{
274
// Assume REC 709 input color gamut
275
// (REC2020_to_LMS * REC709_to_2020) * input_color
276
l = rgb_2100[0] * 0.2958097f + rgb_2100[1] * 0.6230863f + rgb_2100[2] * 0.0811040f;
277
m = rgb_2100[0] * 0.1562512f + rgb_2100[1] * 0.7272980f + rgb_2100[2] * 0.1164508f;
278
s = rgb_2100[0] * 0.0351435f + rgb_2100[1] * 0.1565601f + rgb_2100[2] * 0.8082964f;
279
}
280
else
281
{
282
// Assumes REC2020/BT.2100 input color gamut (this is from the spec)
283
l = 0.412109375f * rgb_2100[0] + 0.52392578125f * rgb_2100[1] + 0.06396484375f * rgb_2100[2];
284
m = 0.166748046875f * rgb_2100[0] + 0.720458984375f * rgb_2100[1] + 0.11279296875f * rgb_2100[2];
285
s = 0.024169921875f * rgb_2100[0] + 0.075439453125f * rgb_2100[1] + 0.900390625f * rgb_2100[2];
286
}
287
288
float ld = forwardPQTab(l);
289
float md = forwardPQTab(m);
290
float sd = forwardPQTab(s);
291
292
ictcp[0] = .5f * ld + .5f * md;
293
294
// if ITP scale Ct by .5 (the ICtCp spec scaled Ct to better exploit the full scaled output, which is not perceptually linear)
295
if (itp_flag)
296
ictcp[1] = 0.806884765625f * ld + -1.6617431640625f * md + 0.8548583984375f * sd;
297
else
298
ictcp[1] = 1.61376953125f * ld + -3.323486328125f * md + 1.709716796875f * sd;
299
300
ictcp[2] = 4.378173828125f * ld + -4.24560546875f * md + -0.132568359375f * sd;
301
}
302
303
static inline void linear_rgb_to_itp(const vec3F& rgb_in, vec3F& itp, const astc_hdr_6x6_global_config &cfg)
304
{
305
linear_rgb_to_ictcp(rgb_in, itp, true, cfg.m_rec2020_bt2100_color_gamut);
306
}
307
308
#if 0
// Disabled: converts ICtCp (or ITP when itp_flag is true) back to linear RGB.
// Outputs rec2020/bt2100 color gamut (i.e. this doesn't convert back to REC709 gamut).
static void ictcp_to_linear_rgb(const vec3F& ictcp, vec3F& rgb, bool itp_flag = false)
{
	const float i = ictcp[0];
	const float cp = ictcp[2];

	// ITP stores Ct at half scale, so undo that first.
	float ct = ictcp[1];
	if (itp_flag)
		ct *= 2.0f;

	const float ld = i + ct * 0.008609037037932726f + cp * 0.11102962500302596f;
	const float md = i + ct * -0.008609037037932726f + cp * -0.11102962500302596f;
	const float sd = i + ct * 0.5600313357106792f + cp * -0.32062717498731885f;

	const float l = inversePQ(ld);
	const float m = inversePQ(md);
	const float s = inversePQ(sd);

	rgb[0] = l * 3.436606694333079f + m * -2.5064521186562705f + s * 0.06984542432319149f;
	rgb[1] = l * -0.7913295555989289f + m * 1.983600451792291f + s * -0.192270896193362f;
	rgb[2] = l * -0.025949899690592672f + m * -0.09891371471172646f + s * 1.1248636144023192f;
}
#endif
330
331
struct half_vec3
332
{
333
basist::half_float m_vals[3];
334
335
inline half_vec3() { }
336
337
inline half_vec3(basist::half_float x, basist::half_float y, basist::half_float z)
338
{
339
m_vals[0] = x;
340
m_vals[1] = y;
341
m_vals[2] = z;
342
}
343
344
inline half_vec3(const half_vec3& other)
345
{
346
*this = other;
347
}
348
349
inline half_vec3& operator= (const half_vec3& rhs)
350
{
351
m_vals[0] = rhs.m_vals[0];
352
m_vals[1] = rhs.m_vals[1];
353
m_vals[2] = rhs.m_vals[2];
354
return *this;
355
}
356
357
inline void clear()
358
{
359
clear_obj(m_vals);
360
}
361
362
inline half_vec3 &set(basist::half_float x, basist::half_float y, basist::half_float z)
363
{
364
m_vals[0] = x;
365
m_vals[1] = y;
366
m_vals[2] = z;
367
return *this;
368
}
369
370
inline half_vec3& set(float x, float y, float z)
371
{
372
m_vals[0] = basist::float_to_half(x);
373
m_vals[1] = basist::float_to_half(y);
374
m_vals[2] = basist::float_to_half(z);
375
return *this;
376
}
377
378
template<typename T>
379
inline half_vec3& set_vec(const T& vec)
380
{
381
m_vals[0] = basist::float_to_half(vec[0]);
382
m_vals[1] = basist::float_to_half(vec[1]);
383
m_vals[2] = basist::float_to_half(vec[2]);
384
return *this;
385
}
386
387
template<typename T>
388
inline T get_vec() const
389
{
390
return T(basist::half_to_float(m_vals[0]), basist::half_to_float(m_vals[1]), basist::half_to_float(m_vals[2]));
391
}
392
393
inline basist::half_float operator[] (uint32_t c) const { assert(c < 3); return m_vals[c]; }
394
inline basist::half_float& operator[] (uint32_t c) { assert(c < 3); return m_vals[c]; }
395
396
float get_float_comp(uint32_t c) const
397
{
398
assert(c < 3);
399
return basist::half_to_float(m_vals[c]);
400
}
401
402
half_vec3& set_float_comp(uint32_t c, float v)
403
{
404
assert(c < 3);
405
m_vals[c] = basist::float_to_half(v);
406
return *this;
407
}
408
};
409
410
struct half_vec4
411
{
412
basist::half_float m_vals[4];
413
414
inline half_vec4() { }
415
416
inline half_vec4(basist::half_float x, basist::half_float y, basist::half_float z, basist::half_float w)
417
{
418
m_vals[0] = x;
419
m_vals[1] = y;
420
m_vals[2] = z;
421
m_vals[3] = w;
422
}
423
424
inline half_vec4(const half_vec4& other)
425
{
426
*this = other;
427
}
428
429
inline half_vec4& operator= (const half_vec4& rhs)
430
{
431
m_vals[0] = rhs.m_vals[0];
432
m_vals[1] = rhs.m_vals[1];
433
m_vals[2] = rhs.m_vals[2];
434
m_vals[3] = rhs.m_vals[3];
435
return *this;
436
}
437
438
inline void clear()
439
{
440
clear_obj(m_vals);
441
}
442
443
inline half_vec4& set(basist::half_float x, basist::half_float y, basist::half_float z, basist::half_float w)
444
{
445
m_vals[0] = x;
446
m_vals[1] = y;
447
m_vals[2] = z;
448
m_vals[3] = w;
449
return *this;
450
}
451
452
inline half_vec4& set(float x, float y, float z, float w)
453
{
454
m_vals[0] = basist::float_to_half(x);
455
m_vals[1] = basist::float_to_half(y);
456
m_vals[2] = basist::float_to_half(z);
457
m_vals[3] = basist::float_to_half(w);
458
return *this;
459
}
460
461
template<typename T>
462
inline half_vec4& set_vec(const T& vec)
463
{
464
m_vals[0] = basist::float_to_half(vec[0]);
465
m_vals[1] = basist::float_to_half(vec[1]);
466
m_vals[2] = basist::float_to_half(vec[2]);
467
m_vals[3] = basist::float_to_half(vec[3]);
468
return *this;
469
}
470
471
template<typename T>
472
inline T get_vec() const
473
{
474
return T(basist::half_to_float(m_vals[0]), basist::half_to_float(m_vals[1]), basist::half_to_float(m_vals[2]), basist::half_to_float(m_vals[3]));
475
}
476
477
inline basist::half_float operator[] (uint32_t c) const { assert(c < 4); return m_vals[c]; }
478
inline basist::half_float &operator[] (uint32_t c) { assert(c < 4); return m_vals[c]; }
479
480
float get_float_comp(uint32_t c) const
481
{
482
assert(c < 4);
483
return basist::half_to_float(m_vals[c]);
484
}
485
486
half_vec4& set_float_comp(uint32_t c, float v)
487
{
488
assert(c < 4);
489
m_vals[c] = basist::float_to_half(v);
490
return *this;
491
}
492
};
493
494
// Largest block dimensions handled by this encoder, in texels.
constexpr uint32_t MAX_BLOCK_W = 6;
constexpr uint32_t MAX_BLOCK_H = 6;
495
496
struct trial_result
497
{
498
astc_helpers::log_astc_block m_log_blk;
499
double m_err;
500
bool m_valid;
501
};
502
503
//----------------------------------------------------------
504
505
// All 3! = 6 relabelings of the subset indices { 0, 1, 2 }.
// g_part3_mapping[k][p] is the new label of subset p under permutation k.
constexpr uint32_t NUM_PART3_MAPPINGS = 6;

static uint8_t g_part3_mapping[NUM_PART3_MAPPINGS][3] =
{
	{ 0, 1, 2 }, { 1, 2, 0 }, { 2, 0, 1 },
	{ 0, 2, 1 }, { 1, 0, 2 }, { 2, 1, 0 }
};
515
516
struct partition_pattern_vec
517
{
518
uint8_t m_parts[6 * 6];
519
520
partition_pattern_vec()
521
{
522
clear();
523
}
524
525
partition_pattern_vec(const partition_pattern_vec& other)
526
{
527
*this = other;
528
}
529
530
void clear()
531
{
532
memset(m_parts, 0, sizeof(m_parts));
533
}
534
535
partition_pattern_vec& operator= (const partition_pattern_vec& rhs)
536
{
537
if (this == &rhs)
538
return *this;
539
memcpy(m_parts, rhs.m_parts, 36);
540
return *this;
541
}
542
543
uint8_t operator[] (uint32_t i) const { assert(i < 36); return m_parts[i]; }
544
uint8_t& operator[] (uint32_t i) { assert(i < 36); return m_parts[i]; }
545
546
uint8_t operator() (uint32_t x, uint32_t y) const { assert((x < 6) && (y < 6)); return m_parts[x + y * 6]; }
547
uint8_t& operator() (uint32_t x, uint32_t y) { assert((x < 6) && (y < 6)); return m_parts[x + y * 6]; }
548
549
int get_squared_distance(const partition_pattern_vec& other) const
550
{
551
int total_dist = 0;
552
for (uint32_t i = 0; i < 36; i++)
553
total_dist += iabs((int)m_parts[i] - (int)other.m_parts[i]);
554
return total_dist;
555
}
556
557
float get_distance(const partition_pattern_vec& other) const
558
{
559
return sqrtf((float)get_squared_distance(other));
560
}
561
562
partition_pattern_vec get_permuted2(uint32_t permute_index) const
563
{
564
assert(permute_index <= 1);
565
566
partition_pattern_vec res;
567
for (uint32_t i = 0; i < 36; i++)
568
{
569
assert(m_parts[i] <= 1);
570
res.m_parts[i] = (uint8_t)(m_parts[i] ^ permute_index);
571
}
572
573
return res;
574
}
575
576
partition_pattern_vec get_permuted3(uint32_t permute_index) const
577
{
578
assert(permute_index <= 5);
579
580
partition_pattern_vec res;
581
for (uint32_t i = 0; i < 36; i++)
582
{
583
assert(m_parts[i] <= 2);
584
res.m_parts[i] = g_part3_mapping[permute_index][m_parts[i]];
585
}
586
587
return res;
588
}
589
590
partition_pattern_vec get_canonicalized() const
591
{
592
partition_pattern_vec res;
593
594
int new_labels[3] = { -1, -1, -1 };
595
uint32_t next_index = 0;
596
for (uint32_t i = 0; i < 36; i++)
597
{
598
uint32_t p = m_parts[i];
599
if (new_labels[p] == -1)
600
new_labels[p] = next_index++;
601
602
res.m_parts[i] = (uint8_t)new_labels[p];
603
}
604
605
return res;
606
}
607
608
bool operator== (const partition_pattern_vec& rhs) const
609
{
610
return memcmp(m_parts, rhs.m_parts, sizeof(m_parts)) == 0;
611
}
612
613
operator size_t() const
614
{
615
return basisu::hash_hsieh(m_parts, sizeof(m_parts));
616
}
617
};
618
619
struct vp_tree_node
620
{
621
partition_pattern_vec m_vantage_point;
622
uint32_t m_point_index;
623
float m_dist;
624
625
int m_inner_node, m_outer_node;
626
};
627
628
#define BRUTE_FORCE_PART_SEARCH (0)
629
630
class vp_tree
631
{
632
public:
633
vp_tree()
634
{
635
}
636
637
void clear()
638
{
639
m_nodes.clear();
640
}
641
642
// This requires no redundant patterns, i.e. all must be unique.
643
bool init(uint32_t n, const partition_pattern_vec* pUnique_pats)
644
{
645
clear();
646
647
uint_vec pat_indices(n);
648
for (uint32_t i = 0; i < n; i++)
649
pat_indices[i] = i;
650
651
std::pair<int, float> root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices);
652
653
if (root_idx.first == -1)
654
return false;
655
656
m_nodes.resize(1);
657
m_nodes[0].m_vantage_point = pUnique_pats[root_idx.first];
658
m_nodes[0].m_point_index = root_idx.first;
659
m_nodes[0].m_dist = root_idx.second;
660
m_nodes[0].m_inner_node = -1;
661
m_nodes[0].m_outer_node = -1;
662
663
uint_vec inner_list, outer_list;
664
665
inner_list.reserve(n / 2);
666
outer_list.reserve(n / 2);
667
668
for (uint32_t pat_index = 0; pat_index < n; pat_index++)
669
{
670
if ((int)pat_index == root_idx.first)
671
continue;
672
673
const float dist = m_nodes[0].m_vantage_point.get_distance(pUnique_pats[pat_index]);
674
675
if (dist <= root_idx.second)
676
inner_list.push_back(pat_index);
677
else
678
outer_list.push_back(pat_index);
679
}
680
681
if (inner_list.size())
682
{
683
m_nodes[0].m_inner_node = create_node(n, pUnique_pats, inner_list);
684
if (m_nodes[0].m_inner_node < 0)
685
return false;
686
}
687
688
if (outer_list.size())
689
{
690
m_nodes[0].m_outer_node = create_node(n, pUnique_pats, outer_list);
691
if (m_nodes[0].m_outer_node < 0)
692
return false;
693
}
694
695
return true;
696
}
697
698
struct result
699
{
700
uint32_t m_pat_index;
701
uint32_t m_mapping_index;
702
float m_dist;
703
704
bool operator< (const result& rhs) const { return m_dist < rhs.m_dist; }
705
bool operator> (const result& rhs) const { return m_dist > rhs.m_dist; }
706
};
707
708
class result_queue
709
{
710
enum { MaxSupportedSize = 256 + 1 };
711
712
public:
713
result_queue() :
714
m_cur_size(0)
715
{
716
}
717
718
size_t get_size() const
719
{
720
return m_cur_size;
721
}
722
723
bool empty() const
724
{
725
return !m_cur_size;
726
}
727
728
typedef std::array<result, MaxSupportedSize + 1> result_array_type;
729
730
const result_array_type& get_elements() const { return m_elements; }
731
result_array_type& get_elements() { return m_elements; }
732
733
void clear()
734
{
735
m_cur_size = 0;
736
}
737
738
void reserve(uint32_t n)
739
{
740
BASISU_NOTE_UNUSED(n);
741
}
742
743
const result& top() const
744
{
745
assert(m_cur_size);
746
return m_elements[1];
747
}
748
749
bool insert(const result& val, uint32_t max_size)
750
{
751
assert(max_size < MaxSupportedSize);
752
753
if (m_cur_size >= MaxSupportedSize)
754
return false;
755
756
m_elements[++m_cur_size] = val;
757
up_heap(m_cur_size);
758
759
if (m_cur_size > max_size)
760
pop();
761
762
return true;
763
}
764
765
bool pop()
766
{
767
if (m_cur_size == 0)
768
return false;
769
770
m_elements[1] = m_elements[m_cur_size--];
771
down_heap(1);
772
return true;
773
}
774
775
float get_highest_dist() const
776
{
777
if (!m_cur_size)
778
return 0.0f;
779
780
return top().m_dist;
781
}
782
783
private:
784
result_array_type m_elements;
785
size_t m_cur_size;
786
787
void up_heap(size_t index)
788
{
789
while ((index > 1) && (m_elements[index] > m_elements[index >> 1]))
790
{
791
std::swap(m_elements[index], m_elements[index >> 1]);
792
index >>= 1;
793
}
794
}
795
796
void down_heap(size_t index)
797
{
798
for ( ; ; )
799
{
800
size_t largest = index, left_child = 2 * index, right_child = 2 * index + 1;
801
802
if ((left_child <= m_cur_size) && (m_elements[left_child] > m_elements[largest]))
803
largest = left_child;
804
805
if ((right_child <= m_cur_size) && (m_elements[right_child] > m_elements[largest]))
806
largest = right_child;
807
808
if (largest == index)
809
break;
810
811
std::swap(m_elements[index], m_elements[largest]);
812
index = largest;
813
}
814
}
815
};
816
817
void find_nearest(uint32_t num_subsets, const partition_pattern_vec& desired_pat, result_queue& results, uint32_t max_results)
818
{
819
assert((num_subsets >= 2) && (num_subsets <= 3));
820
821
results.clear();
822
823
if (!m_nodes.size())
824
return;
825
826
uint32_t num_desired_pats;
827
partition_pattern_vec desired_pats[NUM_PART3_MAPPINGS];
828
829
if (num_subsets == 2)
830
{
831
num_desired_pats = 2;
832
for (uint32_t i = 0; i < 2; i++)
833
desired_pats[i] = desired_pat.get_permuted2(i);
834
}
835
else
836
{
837
num_desired_pats = NUM_PART3_MAPPINGS;
838
for (uint32_t i = 0; i < NUM_PART3_MAPPINGS; i++)
839
desired_pats[i] = desired_pat.get_permuted3(i);
840
}
841
842
#if 0
843
find_nearest_at_node(0, num_desired_pats, desired_pats, results, max_results);
844
#else
845
find_nearest_at_node_non_recursive(0, num_desired_pats, desired_pats, results, max_results);
846
#endif
847
}
848
849
private:
850
basisu::vector<vp_tree_node> m_nodes;
851
852
void find_nearest_at_node(int node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results)
853
{
854
float best_dist_to_vantage = BIG_FLOAT_VAL;
855
uint32_t best_mapping = 0;
856
for (uint32_t i = 0; i < num_desired_pats; i++)
857
{
858
float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point);
859
if (dist < best_dist_to_vantage)
860
{
861
best_dist_to_vantage = dist;
862
best_mapping = i;
863
}
864
}
865
866
result r;
867
r.m_dist = best_dist_to_vantage;
868
r.m_mapping_index = best_mapping;
869
r.m_pat_index = m_nodes[node_index].m_point_index;
870
871
results.insert(r, max_results);
872
873
if (best_dist_to_vantage <= m_nodes[node_index].m_dist)
874
{
875
// inner first
876
if (m_nodes[node_index].m_inner_node >= 0)
877
find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results);
878
879
if (m_nodes[node_index].m_outer_node >= 0)
880
{
881
if ( (results.get_size() < max_results) ||
882
((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist())
883
)
884
{
885
find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results);
886
}
887
}
888
}
889
else
890
{
891
// outer first
892
if (m_nodes[node_index].m_outer_node >= 0)
893
find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results);
894
895
if (m_nodes[node_index].m_inner_node >= 0)
896
{
897
if ( (results.get_size() < max_results) ||
898
((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist())
899
)
900
{
901
find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results);
902
}
903
}
904
}
905
}
906
907
void find_nearest_at_node_non_recursive(int init_node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results)
908
{
909
uint_vec node_stack;
910
node_stack.reserve(16);
911
node_stack.push_back(init_node_index);
912
913
do
914
{
915
const uint32_t node_index = node_stack.back();
916
node_stack.pop_back();
917
918
float best_dist_to_vantage = BIG_FLOAT_VAL;
919
uint32_t best_mapping = 0;
920
for (uint32_t i = 0; i < num_desired_pats; i++)
921
{
922
float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point);
923
if (dist < best_dist_to_vantage)
924
{
925
best_dist_to_vantage = dist;
926
best_mapping = i;
927
}
928
}
929
930
result r;
931
r.m_dist = best_dist_to_vantage;
932
r.m_mapping_index = best_mapping;
933
r.m_pat_index = m_nodes[node_index].m_point_index;
934
935
results.insert(r, max_results);
936
937
if (best_dist_to_vantage <= m_nodes[node_index].m_dist)
938
{
939
if (m_nodes[node_index].m_outer_node >= 0)
940
{
941
if ((results.get_size() < max_results) ||
942
((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist())
943
)
944
{
945
node_stack.push_back(m_nodes[node_index].m_outer_node);
946
}
947
}
948
949
// inner first
950
if (m_nodes[node_index].m_inner_node >= 0)
951
{
952
node_stack.push_back(m_nodes[node_index].m_inner_node);
953
}
954
}
955
else
956
{
957
if (m_nodes[node_index].m_inner_node >= 0)
958
{
959
if ((results.get_size() < max_results) ||
960
((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist())
961
)
962
{
963
node_stack.push_back(m_nodes[node_index].m_inner_node);
964
}
965
}
966
967
// outer first
968
if (m_nodes[node_index].m_outer_node >= 0)
969
{
970
node_stack.push_back(m_nodes[node_index].m_outer_node);
971
}
972
}
973
974
} while (!node_stack.empty());
975
}
976
977
// returns the index of the new node, or -1 on error
978
int create_node(uint32_t n, const partition_pattern_vec* pUnique_pats, const uint_vec& pat_indices)
979
{
980
std::pair<int, float> root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices);
981
982
if (root_idx.first < 0)
983
return -1;
984
985
m_nodes.resize(m_nodes.size() + 1);
986
const uint32_t new_node_index = m_nodes.size_u32() - 1;
987
988
m_nodes[new_node_index].m_vantage_point = pUnique_pats[root_idx.first];
989
m_nodes[new_node_index].m_point_index = root_idx.first;
990
m_nodes[new_node_index].m_dist = root_idx.second;
991
m_nodes[new_node_index].m_inner_node = -1;
992
m_nodes[new_node_index].m_outer_node = -1;
993
994
uint_vec inner_list, outer_list;
995
996
inner_list.reserve(pat_indices.size_u32() / 2);
997
outer_list.reserve(pat_indices.size_u32() / 2);
998
999
for (uint32_t pat_indices_iter = 0; pat_indices_iter < pat_indices.size(); pat_indices_iter++)
1000
{
1001
const uint32_t pat_index = pat_indices[pat_indices_iter];
1002
1003
if ((int)pat_index == root_idx.first)
1004
continue;
1005
1006
const float dist = m_nodes[new_node_index].m_vantage_point.get_distance(pUnique_pats[pat_index]);
1007
1008
if (dist <= root_idx.second)
1009
inner_list.push_back(pat_index);
1010
else
1011
outer_list.push_back(pat_index);
1012
}
1013
1014
if (inner_list.size())
1015
m_nodes[new_node_index].m_inner_node = create_node(n, pUnique_pats, inner_list);
1016
1017
if (outer_list.size())
1018
m_nodes[new_node_index].m_outer_node = create_node(n, pUnique_pats, outer_list);
1019
1020
return new_node_index;
1021
}
1022
1023
// returns the pattern index of the vantage point (-1 on error), and the optimal split distance
1024
std::pair<int, float> find_best_vantage_point(uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, const uint_vec &pat_indices)
1025
{
1026
BASISU_NOTE_UNUSED(num_unique_pats);
1027
1028
const uint32_t n = pat_indices.size_u32();
1029
1030
assert(n);
1031
if (n == 1)
1032
return std::pair(pat_indices[0], 0.0f);
1033
1034
float best_split_metric = -1.0f;
1035
int best_split_pat = -1;
1036
float best_split_dist = 0.0f;
1037
float best_split_var = 0.0f;
1038
1039
basisu::vector< std::pair<float, uint32_t> > dists;
1040
dists.reserve(n);
1041
1042
float_vec float_dists;
1043
float_dists.reserve(n);
1044
1045
for (uint32_t pat_indices_iter = 0; pat_indices_iter < n; pat_indices_iter++)
1046
{
1047
const uint32_t split_pat_index = pat_indices[pat_indices_iter];
1048
assert(split_pat_index < num_unique_pats);
1049
1050
const partition_pattern_vec& trial_vantage = pUnique_pats[split_pat_index];
1051
1052
dists.resize(0);
1053
float_dists.resize(0);
1054
1055
for (uint32_t j = 0; j < n; j++)
1056
{
1057
const uint32_t pat_index = pat_indices[j];
1058
assert(pat_index < num_unique_pats);
1059
1060
if (pat_index == split_pat_index)
1061
continue;
1062
1063
float dist = trial_vantage.get_distance(pUnique_pats[pat_index]);
1064
dists.emplace_back(std::pair(dist, pat_index));
1065
1066
float_dists.push_back(dist);
1067
}
1068
1069
stats<double> s;
1070
s.calc(float_dists.size_u32(), float_dists.data());
1071
1072
std::sort(dists.begin(), dists.end(), [](const auto &a, const auto &b) {
1073
return a.first < b.first;
1074
});
1075
1076
const uint32_t num_dists = dists.size_u32();
1077
float split_dist = dists[num_dists / 2].first;
1078
if ((num_dists & 1) == 0)
1079
split_dist = (split_dist + dists[(num_dists / 2) - 1].first) * .5f;
1080
1081
uint32_t total_inner = 0, total_outer = 0;
1082
1083
for (uint32_t j = 0; j < n; j++)
1084
{
1085
const uint32_t pat_index = pat_indices[j];
1086
if (pat_index == split_pat_index)
1087
continue;
1088
1089
float dist = trial_vantage.get_distance(pUnique_pats[pat_index]);
1090
1091
if (dist <= split_dist)
1092
total_inner++;
1093
else
1094
total_outer++;
1095
}
1096
1097
float split_metric = (float)minimum(total_inner, total_outer) / (float)maximum(total_inner, total_outer);
1098
1099
if ( (split_metric > best_split_metric) ||
1100
((split_metric == best_split_metric) && (s.m_var > best_split_var)) )
1101
{
1102
best_split_metric = split_metric;
1103
best_split_dist = split_dist;
1104
best_split_pat = split_pat_index;
1105
best_split_var = (float)s.m_var;
1106
}
1107
}
1108
1109
return std::pair(best_split_pat, best_split_dist);
1110
}
1111
};
1112
1113
struct partition
1114
{
1115
uint64_t m_p;
1116
1117
inline partition() :
1118
m_p(0)
1119
{
1120
}
1121
1122
inline partition(uint64_t p) :
1123
m_p(p)
1124
{
1125
assert(p < (1ULL << 36));
1126
}
1127
1128
inline partition& operator=(uint64_t p)
1129
{
1130
assert(p < (1ULL << 36));
1131
m_p = p;
1132
return *this;
1133
}
1134
1135
inline bool operator< (const partition& p) const
1136
{
1137
return m_p < p.m_p;
1138
}
1139
1140
inline bool operator== (const partition& p) const
1141
{
1142
return m_p == p.m_p;
1143
}
1144
1145
inline operator size_t() const
1146
{
1147
return hash_hsieh((const uint8_t *)&m_p, sizeof(m_p));
1148
}
1149
};
1150
1151
partition_pattern_vec g_partitions2[NUM_UNIQUE_PARTITIONS2];
1152
int g_part2_seed_to_unique_index[1024];
1153
vp_tree g_part2_vp_tree;
1154
1155
// Scales axis by bu_math::inv_sqrt(axis.norm()). If axis.norm() is too small (or the comparison
// fails, e.g. for NaN) the fallback vec3F(0.577350269f) is returned instead.
// NOTE(review): this presumably relies on norm() returning the squared length - confirm.
static inline vec3F vec3F_norm_approx(vec3F axis)
{
	const float n = axis.norm();

	if (fabs(n) >= SMALL_FLOAT_VAL)
		return axis * bu_math::inv_sqrt(n);

	return vec3F(0.577350269f);
}
1161
1162
static void init_partitions2_6x6()
1163
{
1164
#if 0
1165
// makes pattern bits to the 10-bit ASTC seed index
1166
typedef basisu::hash_map<uint64_t, uint32_t> partition2_hash_map;
1167
partition2_hash_map phash;
1168
phash.reserve(1024);
1169
1170
for (uint32_t i = 0; i < 1024; i++)
1171
{
1172
uint64_t p_bits = 0;
1173
uint64_t p_bits_inv = 0;
1174
1175
for (uint32_t y = 0; y < 6; y++)
1176
{
1177
for (uint32_t x = 0; x < 6; x++)
1178
{
1179
uint64_t p = astc_helpers::compute_texel_partition(i, x, y, 0, 2, false);
1180
assert(p < 2);
1181
1182
p_bits |= (p << (x + y * 6));
1183
p_bits_inv |= ((1 - p) << (x + y * 6));
1184
}
1185
}
1186
1187
if (!p_bits)
1188
continue;
1189
if (p_bits == ((1ULL << 36) - 1))
1190
continue;
1191
1192
assert(p_bits < (1ULL << 36));
1193
assert(p_bits_inv < (1ULL << 36));
1194
1195
if (phash.contains(p_bits))
1196
{
1197
}
1198
else if (phash.contains(p_bits_inv))
1199
{
1200
}
1201
else
1202
{
1203
auto res = phash.insert(p_bits, i);
1204
assert(res.second);
1205
BASISU_NOTE_UNUSED(res);
1206
}
1207
}
1208
1209
uint32_t num_unique_partitions2 = 0;
1210
1211
for (const auto& r : phash)
1212
{
1213
assert(r.second < 1024);
1214
1215
const uint32_t unique_index = num_unique_partitions2;
1216
assert(unique_index < NUM_UNIQUE_PARTITIONS2);
1217
1218
partition_pattern_vec pat_vec;
1219
for (uint32_t i = 0; i < 36; i++)
1220
pat_vec[i] = (uint8_t)((r.first >> i) & 1);
1221
1222
g_partitions2[unique_index] = pat_vec;
1223
1224
assert(g_part2_unique_index_to_seed[unique_index] == r.second);
1225
g_part2_seed_to_unique_index[r.second] = unique_index;
1226
1227
num_unique_partitions2++;
1228
}
1229
assert(num_unique_partitions2 == NUM_UNIQUE_PARTITIONS2);
1230
#else
1231
for (uint32_t unique_index = 0; unique_index < NUM_UNIQUE_PARTITIONS2; unique_index++)
1232
{
1233
const uint32_t seed_index = g_part2_unique_index_to_seed[unique_index];
1234
assert(seed_index < 1024);
1235
1236
assert(g_part2_seed_to_unique_index[seed_index] == 0);
1237
g_part2_seed_to_unique_index[seed_index] = unique_index;
1238
1239
partition_pattern_vec& pat_vec = g_partitions2[unique_index];
1240
1241
for (uint32_t y = 0; y < 6; y++)
1242
{
1243
for (uint32_t x = 0; x < 6; x++)
1244
{
1245
uint8_t p = (uint8_t)astc_helpers::compute_texel_partition(seed_index, x, y, 0, 2, false);
1246
assert(p < 2);
1247
1248
pat_vec[x + y * 6] = p;
1249
}
1250
}
1251
}
1252
#endif
1253
1254
g_part2_vp_tree.init(NUM_UNIQUE_PARTITIONS2, g_partitions2);
1255
}
1256
1257
static bool estimate_partition2_6x6(
1258
const basist::half_float pBlock_pixels_half[][3],
1259
int* pBest_parts, uint32_t num_best_parts)
1260
{
1261
const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = BLOCK_W * BLOCK_H;
1262
1263
vec3F training_vecs[BLOCK_T], mean(0.0f);
1264
1265
for (uint32_t i = 0; i < BLOCK_T; i++)
1266
{
1267
vec3F& v = training_vecs[i];
1268
1269
v[0] = (float)pBlock_pixels_half[i][0];
1270
v[1] = (float)pBlock_pixels_half[i][1];
1271
v[2] = (float)pBlock_pixels_half[i][2];
1272
1273
mean += v;
1274
}
1275
mean *= (1.0f / (float)BLOCK_T);
1276
1277
vec3F max_vals(-BIG_FLOAT_VAL);
1278
1279
for (uint32_t i = 0; i < BLOCK_T; i++)
1280
{
1281
vec3F& v = training_vecs[i];
1282
max_vals = vec3F::component_max(max_vals, v);
1283
}
1284
1285
// Initialize principle axis approximation
1286
vec3F axis(max_vals - mean);
1287
1288
// Incremental approx. PCA - only viable if we have a reasonably fast approximation for 1.0/sqrt(x).
1289
for (uint32_t i = 0; i < BLOCK_T; i++)
1290
{
1291
axis = vec3F_norm_approx(axis);
1292
1293
vec3F color(training_vecs[i] - mean);
1294
1295
float d = color.dot(axis);
1296
1297
axis += color * d;
1298
}
1299
1300
if (axis.norm() < SMALL_FLOAT_VAL)
1301
axis.set(0.57735027f);
1302
else
1303
axis.normalize_in_place();
1304
1305
#if BRUTE_FORCE_PART_SEARCH
1306
int desired_parts[BLOCK_H][BLOCK_W]; // [y][x]
1307
for (uint32_t i = 0; i < BLOCK_T; i++)
1308
{
1309
float proj = (training_vecs[i] - mean).dot(axis);
1310
1311
desired_parts[i / BLOCK_W][i % BLOCK_W] = proj < 0.0f;
1312
}
1313
#else
1314
partition_pattern_vec desired_part;
1315
1316
for (uint32_t i = 0; i < BLOCK_T; i++)
1317
{
1318
float proj = (training_vecs[i] - mean).dot(axis);
1319
1320
desired_part.m_parts[i] = proj < 0.0f;
1321
}
1322
#endif
1323
1324
//interval_timer tm;
1325
//tm.start();
1326
1327
#if BRUTE_FORCE_PART_SEARCH
1328
uint32_t part_similarity[NUM_UNIQUE_PARTITIONS2];
1329
1330
for (uint32_t part_index = 0; part_index < NUM_UNIQUE_PARTITIONS2; part_index++)
1331
{
1332
const partition_pattern_vec &pat_vec = g_partitions2[part_index];
1333
1334
int total_sim_non_inv = 0;
1335
int total_sim_inv = 0;
1336
1337
for (uint32_t y = 0; y < BLOCK_H; y++)
1338
{
1339
for (uint32_t x = 0; x < BLOCK_W; x++)
1340
{
1341
int part = pat_vec[x + y * 6];
1342
1343
if (part == desired_parts[y][x])
1344
total_sim_non_inv++;
1345
1346
if ((part ^ 1) == desired_parts[y][x])
1347
total_sim_inv++;
1348
}
1349
}
1350
1351
int total_sim = maximum(total_sim_non_inv, total_sim_inv);
1352
1353
part_similarity[part_index] = (total_sim << 16) | part_index;
1354
1355
} // part_index;
1356
1357
std::sort(part_similarity, part_similarity + NUM_UNIQUE_PARTITIONS2);
1358
1359
for (uint32_t i = 0; i < num_best_parts; i++)
1360
pBest_parts[i] = part_similarity[(NUM_UNIQUE_PARTITIONS2 - 1) - i] & 0xFFFF;
1361
#else
1362
vp_tree::result_queue results;
1363
results.reserve(num_best_parts);
1364
g_part2_vp_tree.find_nearest(2, desired_part, results, num_best_parts);
1365
1366
assert(results.get_size() == num_best_parts);
1367
1368
const auto& elements = results.get_elements();
1369
1370
for (uint32_t i = 0; i < results.get_size(); i++)
1371
pBest_parts[i] = elements[1 + i].m_pat_index;
1372
#endif
1373
1374
//fmt_printf("{} ", tm.get_elapsed_ms());
1375
1376
return true;
1377
}
1378
1379
// Minimum comp_level at which encode_block_2_subsets()/encode_block_3_subsets() run
// refine_endpoints() after downsampling the weight grid.
const uint32_t MIN_REFINE_LEVEL = 0;
1380
1381
static bool encode_block_2_subsets(
1382
trial_result res[2],
1383
uint32_t grid_w, uint32_t grid_h,
1384
uint32_t cem,
1385
uint32_t weights_ise_range, uint32_t endpoints_ise_range,
1386
const half_vec3* pBlock_pixels_half, const vec4F* pBlock_pixels_q16,
1387
astc_hdr_codec_base_options& coptions,
1388
bool uber_mode_flag,
1389
int unique_pat_index,
1390
uint32_t comp_level,
1391
opt_mode_t mode11_opt_mode,
1392
bool refine_endpoints_flag)
1393
{
1394
const uint32_t num_endpoint_vals = (cem == 11) ? basist::NUM_MODE11_ENDPOINTS : basist::NUM_MODE7_ENDPOINTS;
1395
1396
res[0].m_valid = false;
1397
res[1].m_valid = false;
1398
1399
const uint32_t BLOCK_W = 6, BLOCK_H = 6;
1400
1401
astc_helpers::log_astc_block best_log_blk;
1402
clear_obj(best_log_blk);
1403
1404
best_log_blk.m_num_partitions = 2;
1405
best_log_blk.m_color_endpoint_modes[0] = (uint8_t)cem;
1406
best_log_blk.m_color_endpoint_modes[1] = (uint8_t)cem;
1407
best_log_blk.m_grid_width = (uint8_t)grid_w;
1408
best_log_blk.m_grid_height = (uint8_t)grid_h;
1409
1410
best_log_blk.m_weight_ise_range = (uint8_t)weights_ise_range;
1411
best_log_blk.m_endpoint_ise_range = (uint8_t)endpoints_ise_range;
1412
1413
partition_pattern_vec* pPat = &g_partitions2[unique_pat_index];
1414
const uint32_t p_seed = g_part2_unique_index_to_seed[unique_pat_index];
1415
1416
vec4F part_pixels_q16[2][64];
1417
half_vec3 part_half_pixels[2][64];
1418
uint8_t part_pixel_index[2][64];
1419
uint32_t part_total_pixels[2] = { 0 };
1420
1421
for (uint32_t y = 0; y < BLOCK_H; y++)
1422
{
1423
for (uint32_t x = 0; x < BLOCK_W; x++)
1424
{
1425
uint32_t part_index = (*pPat)[x + y * BLOCK_W];
1426
1427
uint32_t l = part_total_pixels[part_index];
1428
1429
part_pixels_q16[part_index][l] = pBlock_pixels_q16[x + y * BLOCK_W];
1430
part_half_pixels[part_index][l] = pBlock_pixels_half[x + y * BLOCK_W];
1431
part_pixel_index[part_index][l] = (uint8_t)(x + y * BLOCK_W);
1432
1433
part_total_pixels[part_index] = l + 1;
1434
} // x
1435
} // y
1436
1437
uint8_t blk_endpoints[2][basist::NUM_MODE11_ENDPOINTS];
1438
uint8_t blk_weights[2][BLOCK_W * BLOCK_H];
1439
uint32_t best_submode[2];
1440
1441
for (uint32_t part_iter = 0; part_iter < 2; part_iter++)
1442
{
1443
assert(part_total_pixels[part_iter]);
1444
1445
double e;
1446
if (cem == 7)
1447
{
1448
e = encode_astc_hdr_block_mode_7(
1449
part_total_pixels[part_iter],
1450
(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
1451
best_log_blk.m_weight_ise_range,
1452
best_submode[part_iter],
1453
BIG_FLOAT_VAL,
1454
blk_endpoints[part_iter],
1455
blk_weights[part_iter],
1456
coptions,
1457
best_log_blk.m_endpoint_ise_range);
1458
}
1459
else
1460
{
1461
assert(cem == 11);
1462
1463
e = encode_astc_hdr_block_mode_11(
1464
part_total_pixels[part_iter],
1465
(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
1466
best_log_blk.m_weight_ise_range,
1467
best_submode[part_iter],
1468
BIG_FLOAT_VAL,
1469
blk_endpoints[part_iter],
1470
blk_weights[part_iter],
1471
coptions,
1472
false,
1473
best_log_blk.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false,
1474
mode11_opt_mode);
1475
}
1476
1477
if (e == BIG_FLOAT_VAL)
1478
return false;
1479
1480
} // part_iter
1481
1482
uint8_t ise_weights[BLOCK_W * BLOCK_H];
1483
1484
uint32_t src_pixel_index[2] = { 0, 0 };
1485
for (uint32_t y = 0; y < BLOCK_H; y++)
1486
{
1487
for (uint32_t x = 0; x < BLOCK_W; x++)
1488
{
1489
uint32_t part_index = (*pPat)[x + y * BLOCK_W];
1490
ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];
1491
src_pixel_index[part_index]++;
1492
} // x
1493
} // y
1494
1495
if ((grid_w == BLOCK_W) && (grid_h == BLOCK_H))
1496
{
1497
best_log_blk.m_partition_id = (uint16_t)p_seed;
1498
1499
memcpy(best_log_blk.m_endpoints, blk_endpoints[0], num_endpoint_vals);
1500
memcpy(best_log_blk.m_endpoints + num_endpoint_vals, blk_endpoints[1], num_endpoint_vals);
1501
memcpy(best_log_blk.m_weights, ise_weights, BLOCK_W * BLOCK_H);
1502
1503
res[0].m_valid = true;
1504
res[0].m_log_blk = best_log_blk;
1505
}
1506
else
1507
{
1508
uint8_t desired_weights[BLOCK_H * BLOCK_W];
1509
1510
const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_ISE_to_val;
1511
1512
for (uint32_t by = 0; by < BLOCK_H; by++)
1513
for (uint32_t bx = 0; bx < BLOCK_W; bx++)
1514
desired_weights[bx + by * BLOCK_W] = dequant_tab[ise_weights[bx + by * BLOCK_W]];
1515
1516
uint8_t downsampled_weights[BLOCK_H * BLOCK_W];
1517
1518
const float* pDownsample_matrix = get_6x6_downsample_matrix(grid_w, grid_h);
1519
if (!pDownsample_matrix)
1520
{
1521
assert(0);
1522
return false;
1523
}
1524
1525
downsample_weight_grid(
1526
pDownsample_matrix,
1527
BLOCK_W, BLOCK_H, // source/from dimension (block size)
1528
grid_w, grid_h, // dest/to dimension (grid size)
1529
desired_weights, // these are dequantized weights, NOT ISE symbols, [by][bx]
1530
downsampled_weights); // [wy][wx]
1531
1532
best_log_blk.m_partition_id = (uint16_t)p_seed;
1533
memcpy(best_log_blk.m_endpoints, blk_endpoints[0], num_endpoint_vals);
1534
memcpy(best_log_blk.m_endpoints + num_endpoint_vals, blk_endpoints[1], num_endpoint_vals);
1535
1536
const auto& weight_to_ise = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_val_to_ise;
1537
1538
for (uint32_t gy = 0; gy < grid_h; gy++)
1539
for (uint32_t gx = 0; gx < grid_w; gx++)
1540
best_log_blk.m_weights[gx + gy * grid_w] = weight_to_ise[downsampled_weights[gx + gy * grid_w]];
1541
1542
res[0].m_valid = true;
1543
res[0].m_log_blk = best_log_blk;
1544
1545
if ((refine_endpoints_flag) && (comp_level >= MIN_REFINE_LEVEL) && ((grid_w < 6) || (grid_h < 6)))
1546
{
1547
bool any_refined = false;
1548
1549
for (uint32_t part_iter = 0; part_iter < 2; part_iter++)
1550
{
1551
bool refine_status = refine_endpoints(
1552
cem,
1553
endpoints_ise_range,
1554
best_log_blk.m_endpoints + part_iter * num_endpoint_vals, // the endpoints to optimize
1555
BLOCK_W, BLOCK_H, // block dimensions
1556
grid_w, grid_h, best_log_blk.m_weights, weights_ise_range, // weight grid
1557
part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
1558
&part_pixel_index[part_iter][0], // maps this subset's pixels to block offsets
1559
coptions, mode11_opt_mode);
1560
1561
if (refine_status)
1562
any_refined = true;
1563
}
1564
1565
if (any_refined)
1566
{
1567
res[1].m_valid = true;
1568
res[1].m_log_blk = best_log_blk;
1569
}
1570
}
1571
}
1572
1573
return true;
1574
}
1575
1576
typedef basisu::hash_map<partition_pattern_vec, std::pair<uint32_t, uint32_t > > partition3_hash_map;
1577
1578
partition_pattern_vec g_partitions3[NUM_UNIQUE_PARTITIONS3];
1579
int g_part3_seed_to_unique_index[1024];
1580
vp_tree g_part3_vp_tree;
1581
1582
static void init_partitions3_6x6()
1583
{
1584
uint32_t t = 0;
1585
1586
for (uint32_t i = 0; i < 1024; i++)
1587
g_part3_seed_to_unique_index[i] = -1;
1588
1589
partition3_hash_map part3_hash;
1590
part3_hash.reserve(512);
1591
1592
for (uint32_t seed_index = 0; seed_index < 1024; seed_index++)
1593
{
1594
partition_pattern_vec p3;
1595
uint32_t part_hist[3] = { 0 };
1596
1597
for (uint32_t y = 0; y < 6; y++)
1598
{
1599
for (uint32_t x = 0; x < 6; x++)
1600
{
1601
uint64_t p = astc_helpers::compute_texel_partition(seed_index, x, y, 0, 3, false);
1602
assert(p < 3);
1603
1604
p3.m_parts[x + y * 6] = (uint8_t)p;
1605
part_hist[p]++;
1606
}
1607
}
1608
1609
if (!part_hist[0] || !part_hist[1] || !part_hist[2])
1610
continue;
1611
1612
uint32_t j;
1613
for (j = 0; j < NUM_PART3_MAPPINGS; j++)
1614
{
1615
partition_pattern_vec temp_part3(p3.get_permuted3(j));
1616
1617
if (part3_hash.contains(temp_part3))
1618
break;
1619
}
1620
if (j < NUM_PART3_MAPPINGS)
1621
continue;
1622
1623
part3_hash.insert(p3, std::make_pair(seed_index, t) );
1624
1625
assert(g_part3_unique_index_to_seed[t] == seed_index);
1626
g_part3_seed_to_unique_index[seed_index] = t;
1627
g_partitions3[t] = p3;
1628
1629
t++;
1630
}
1631
1632
g_part3_vp_tree.init(NUM_UNIQUE_PARTITIONS3, g_partitions3);
1633
}
1634
1635
static bool estimate_partition3_6x6(
1636
const basist::half_float pBlock_pixels_half[][3],
1637
int* pBest_parts, uint32_t num_best_parts)
1638
{
1639
const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = BLOCK_W * BLOCK_H, NUM_SUBSETS = 3;
1640
1641
assert(num_best_parts && (num_best_parts <= NUM_UNIQUE_PARTITIONS3));
1642
1643
vec3F training_vecs[BLOCK_T], mean(0.0f);
1644
1645
float brightest_inten = 0.0f, darkest_inten = BIG_FLOAT_VAL;
1646
vec3F cluster_centroids[NUM_SUBSETS];
1647
1648
for (uint32_t i = 0; i < BLOCK_T; i++)
1649
{
1650
vec3F& v = training_vecs[i];
1651
1652
v.set((float)pBlock_pixels_half[i][0], (float)pBlock_pixels_half[i][1], (float)pBlock_pixels_half[i][2]);
1653
1654
float inten = v.dot(vec3F(1.0f));
1655
if (inten < darkest_inten)
1656
{
1657
darkest_inten = inten;
1658
cluster_centroids[0] = v;
1659
}
1660
1661
if (inten > brightest_inten)
1662
{
1663
brightest_inten = inten;
1664
cluster_centroids[1] = v;
1665
}
1666
}
1667
1668
if (cluster_centroids[0] == cluster_centroids[1])
1669
return false;
1670
1671
float furthest_dist2 = 0.0f;
1672
for (uint32_t i = 0; i < BLOCK_T; i++)
1673
{
1674
vec3F& v = training_vecs[i];
1675
1676
float dist_a = v.squared_distance(cluster_centroids[0]);
1677
if (dist_a == 0.0f)
1678
continue;
1679
1680
float dist_b = v.squared_distance(cluster_centroids[1]);
1681
if (dist_b == 0.0f)
1682
continue;
1683
1684
float dist2 = dist_a + dist_b;
1685
if (dist2 > furthest_dist2)
1686
{
1687
furthest_dist2 = dist2;
1688
cluster_centroids[2] = v;
1689
}
1690
}
1691
1692
if ((cluster_centroids[0] == cluster_centroids[2]) || (cluster_centroids[1] == cluster_centroids[2]))
1693
return false;
1694
1695
uint32_t cluster_pixels[NUM_SUBSETS][BLOCK_T];
1696
uint32_t num_cluster_pixels[NUM_SUBSETS];
1697
vec3F new_cluster_means[NUM_SUBSETS];
1698
1699
const uint32_t NUM_ITERS = 4;
1700
1701
for (uint32_t s = 0; s < NUM_ITERS; s++)
1702
{
1703
memset(num_cluster_pixels, 0, sizeof(num_cluster_pixels));
1704
memset(new_cluster_means, 0, sizeof(new_cluster_means));
1705
1706
for (uint32_t i = 0; i < BLOCK_T; i++)
1707
{
1708
float d[NUM_SUBSETS] = {
1709
training_vecs[i].squared_distance(cluster_centroids[0]),
1710
training_vecs[i].squared_distance(cluster_centroids[1]),
1711
training_vecs[i].squared_distance(cluster_centroids[2]) };
1712
1713
float min_d = d[0];
1714
uint32_t min_idx = 0;
1715
for (uint32_t j = 1; j < NUM_SUBSETS; j++)
1716
{
1717
if (d[j] < min_d)
1718
{
1719
min_d = d[j];
1720
min_idx = j;
1721
}
1722
}
1723
1724
cluster_pixels[min_idx][num_cluster_pixels[min_idx]] = i;
1725
new_cluster_means[min_idx] += training_vecs[i];
1726
num_cluster_pixels[min_idx]++;
1727
} // i
1728
1729
for (uint32_t j = 0; j < NUM_SUBSETS; j++)
1730
{
1731
if (!num_cluster_pixels[j])
1732
return false;
1733
1734
cluster_centroids[j] = new_cluster_means[j] / (float)num_cluster_pixels[j];
1735
}
1736
} // s
1737
1738
partition_pattern_vec desired_part;
1739
for (uint32_t p = 0; p < NUM_SUBSETS; p++)
1740
{
1741
for (uint32_t i = 0; i < num_cluster_pixels[p]; i++)
1742
{
1743
const uint32_t pix_index = cluster_pixels[p][i];
1744
desired_part[pix_index] = (uint8_t)p;
1745
}
1746
}
1747
1748
#if BRUTE_FORCE_PART_SEARCH
1749
partition_pattern_vec desired_parts[NUM_PART3_MAPPINGS];
1750
for (uint32_t j = 0; j < NUM_PART3_MAPPINGS; j++)
1751
desired_parts[j] = desired_part.get_permuted3(j);
1752
1753
uint32_t part_similarity[NUM_UNIQUE_PARTITIONS3];
1754
1755
for (uint32_t part_index = 0; part_index < NUM_UNIQUE_PARTITIONS3; part_index++)
1756
{
1757
const partition_pattern_vec& pat = g_partitions3[part_index];
1758
1759
uint32_t lowest_pat_dist = UINT32_MAX;
1760
for (uint32_t p = 0; p < NUM_PART3_MAPPINGS; p++)
1761
{
1762
uint32_t dist = pat.get_squared_distance(desired_parts[p]);
1763
if (dist < lowest_pat_dist)
1764
lowest_pat_dist = dist;
1765
}
1766
1767
part_similarity[part_index] = (lowest_pat_dist << 16) | part_index;
1768
1769
} // part_index;
1770
1771
std::sort(part_similarity, part_similarity + NUM_UNIQUE_PARTITIONS3);
1772
1773
for (uint32_t i = 0; i < num_best_parts; i++)
1774
pBest_parts[i] = part_similarity[i] & 0xFFFF;
1775
#else
1776
vp_tree::result_queue results;
1777
results.reserve(num_best_parts);
1778
g_part3_vp_tree.find_nearest(3, desired_part, results, num_best_parts);
1779
1780
assert(results.get_size() == num_best_parts);
1781
1782
const auto& elements = results.get_elements();
1783
1784
for (uint32_t i = 0; i < results.get_size(); i++)
1785
pBest_parts[i] = elements[1 + i].m_pat_index;
1786
#endif
1787
1788
return true;
1789
}
1790
1791
static bool encode_block_3_subsets(
1792
trial_result& res,
1793
uint32_t cem,
1794
uint32_t grid_w, uint32_t grid_h,
1795
uint32_t weights_ise_range, uint32_t endpoints_ise_range,
1796
const half_vec3* pBlock_pixels_half, const vec4F* pBlock_pixels_q16,
1797
astc_hdr_codec_base_options& coptions,
1798
bool uber_mode_flag,
1799
const int* pEst_patterns, int num_est_patterns,
1800
uint32_t comp_level,
1801
opt_mode_t mode11_opt_mode)
1802
{
1803
BASISU_NOTE_UNUSED(uber_mode_flag);
1804
const uint32_t BLOCK_W = 6, BLOCK_H = 6, NUM_SUBSETS = 3;
1805
const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem);
1806
1807
res.m_valid = false;
1808
1809
double best_e = BIG_FLOAT_VAL;
1810
1811
astc_helpers::log_astc_block best_log_blk;
1812
clear_obj(best_log_blk);
1813
1814
best_log_blk.m_num_partitions = NUM_SUBSETS;
1815
best_log_blk.m_color_endpoint_modes[0] = (uint8_t)cem;
1816
best_log_blk.m_color_endpoint_modes[1] = (uint8_t)cem;
1817
best_log_blk.m_color_endpoint_modes[2] = (uint8_t)cem;
1818
best_log_blk.m_grid_width = (uint8_t)grid_w;
1819
best_log_blk.m_grid_height = (uint8_t)grid_h;
1820
1821
best_log_blk.m_weight_ise_range = (uint8_t)weights_ise_range;
1822
best_log_blk.m_endpoint_ise_range = (uint8_t)endpoints_ise_range;
1823
1824
const uint32_t n = num_est_patterns ? num_est_patterns : NUM_UNIQUE_PARTITIONS3;
1825
1826
for (uint32_t unique_p_iter = 0; unique_p_iter < n; unique_p_iter++)
1827
{
1828
const uint32_t unique_part_index = num_est_patterns ? pEst_patterns[unique_p_iter] : unique_p_iter;
1829
assert(unique_part_index < NUM_UNIQUE_PARTITIONS3);
1830
const partition_pattern_vec*pPart = &g_partitions3[unique_part_index];
1831
1832
vec4F part_pixels_q16[NUM_SUBSETS][64];
1833
half_vec3 part_half_pixels[NUM_SUBSETS][64];
1834
uint8_t part_pixel_index[NUM_SUBSETS][64];
1835
uint32_t part_total_pixels[NUM_SUBSETS] = { 0 };
1836
1837
for (uint32_t y = 0; y < BLOCK_H; y++)
1838
{
1839
for (uint32_t x = 0; x < BLOCK_W; x++)
1840
{
1841
const uint32_t part_index = pPart->m_parts[x + y * BLOCK_W];
1842
1843
uint32_t l = part_total_pixels[part_index];
1844
1845
part_pixels_q16[part_index][l] = pBlock_pixels_q16[x + y * BLOCK_W];
1846
part_half_pixels[part_index][l] = pBlock_pixels_half[x + y * BLOCK_W];
1847
part_pixel_index[part_index][l] = (uint8_t)(x + y * BLOCK_W);
1848
1849
part_total_pixels[part_index] = l + 1;
1850
} // x
1851
} // y
1852
1853
uint8_t blk_endpoints[NUM_SUBSETS][basist::NUM_MODE11_ENDPOINTS];
1854
uint8_t blk_weights[NUM_SUBSETS][BLOCK_W * BLOCK_H];
1855
uint32_t best_submode[NUM_SUBSETS];
1856
1857
double e = 0.0f;
1858
for (uint32_t part_iter = 0; part_iter < NUM_SUBSETS; part_iter++)
1859
{
1860
assert(part_total_pixels[part_iter]);
1861
1862
if (cem == 7)
1863
{
1864
e += encode_astc_hdr_block_mode_7(
1865
part_total_pixels[part_iter],
1866
(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
1867
best_log_blk.m_weight_ise_range,
1868
best_submode[part_iter],
1869
BIG_FLOAT_VAL,
1870
blk_endpoints[part_iter],
1871
blk_weights[part_iter],
1872
coptions,
1873
best_log_blk.m_endpoint_ise_range);
1874
}
1875
else
1876
{
1877
assert(cem == 11);
1878
1879
e += encode_astc_hdr_block_mode_11(
1880
part_total_pixels[part_iter],
1881
(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
1882
best_log_blk.m_weight_ise_range,
1883
best_submode[part_iter],
1884
BIG_FLOAT_VAL,
1885
blk_endpoints[part_iter],
1886
blk_weights[part_iter],
1887
coptions,
1888
false, best_log_blk.m_endpoint_ise_range, uber_mode_flag, false,
1889
FIRST_MODE11_SUBMODE_INDEX, MAX_MODE11_SUBMODE_INDEX, false, mode11_opt_mode);
1890
}
1891
1892
} // part_iter
1893
1894
uint8_t ise_weights[BLOCK_W * BLOCK_H];
1895
1896
uint32_t src_pixel_index[NUM_SUBSETS] = { 0 };
1897
for (uint32_t y = 0; y < BLOCK_H; y++)
1898
{
1899
for (uint32_t x = 0; x < BLOCK_W; x++)
1900
{
1901
const uint32_t part_index = pPart->m_parts[x + y * BLOCK_W];
1902
1903
ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];
1904
src_pixel_index[part_index]++;
1905
} // x
1906
} // y
1907
1908
if ((grid_w == BLOCK_W) && (grid_h == BLOCK_H))
1909
{
1910
if (e < best_e)
1911
{
1912
best_e = e;
1913
best_log_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_part_index];
1914
1915
for (uint32_t p = 0; p < NUM_SUBSETS; p++)
1916
memcpy(best_log_blk.m_endpoints + num_endpoint_vals * p, blk_endpoints[p], num_endpoint_vals);
1917
1918
memcpy(best_log_blk.m_weights, ise_weights, BLOCK_W * BLOCK_H);
1919
}
1920
}
1921
else
1922
{
1923
uint8_t desired_weights[BLOCK_H * BLOCK_W];
1924
1925
const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_ISE_to_val;
1926
1927
for (uint32_t by = 0; by < BLOCK_H; by++)
1928
for (uint32_t bx = 0; bx < BLOCK_W; bx++)
1929
desired_weights[bx + by * BLOCK_W] = dequant_tab[ise_weights[bx + by * BLOCK_W]];
1930
1931
uint8_t downsampled_weights[BLOCK_H * BLOCK_W];
1932
1933
const float* pDownsample_matrix = get_6x6_downsample_matrix(grid_w, grid_h);
1934
if (!pDownsample_matrix)
1935
{
1936
assert(0);
1937
return false;
1938
}
1939
1940
downsample_weight_grid(
1941
pDownsample_matrix,
1942
BLOCK_W, BLOCK_H, // source/from dimension (block size)
1943
grid_w, grid_h, // dest/to dimension (grid size)
1944
desired_weights, // these are dequantized weights, NOT ISE symbols, [by][bx]
1945
downsampled_weights); // [wy][wx]
1946
1947
astc_helpers::log_astc_block trial_blk(best_log_blk);
1948
1949
trial_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_part_index];
1950
1951
for (uint32_t p = 0; p < NUM_SUBSETS; p++)
1952
memcpy(trial_blk.m_endpoints + num_endpoint_vals * p, blk_endpoints[p], num_endpoint_vals);
1953
1954
const auto& weight_to_ise = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_val_to_ise;
1955
1956
for (uint32_t gy = 0; gy < grid_h; gy++)
1957
for (uint32_t gx = 0; gx < grid_w; gx++)
1958
trial_blk.m_weights[gx + gy * grid_w] = weight_to_ise[downsampled_weights[gx + gy * grid_w]];
1959
1960
if ((comp_level >= MIN_REFINE_LEVEL) && ((grid_w < 6) || (grid_h < 6)))
1961
{
1962
for (uint32_t part_iter = 0; part_iter < NUM_SUBSETS; part_iter++)
1963
{
1964
bool refine_status = refine_endpoints(
1965
cem,
1966
endpoints_ise_range,
1967
trial_blk.m_endpoints + part_iter * num_endpoint_vals, // the endpoints to optimize
1968
BLOCK_W, BLOCK_H, // block dimensions
1969
grid_w, grid_h, trial_blk.m_weights, weights_ise_range, // weight grid
1970
part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
1971
&part_pixel_index[part_iter][0], // maps this subset's pixels to block offsets
1972
coptions, mode11_opt_mode);
1973
1974
BASISU_NOTE_UNUSED(refine_status);
1975
}
1976
}
1977
1978
half_vec4 decoded_pixels_half4[BLOCK_H][BLOCK_W]; // [y][x]
1979
bool status = astc_helpers::decode_block(trial_blk, decoded_pixels_half4, BLOCK_W, BLOCK_H, astc_helpers::cDecodeModeHDR16);
1980
assert(status);
1981
if (!status)
1982
return false;
1983
1984
half_vec3 decoded_pixels_half3[BLOCK_H][BLOCK_W];
1985
for (uint32_t y = 0; y < BLOCK_H; y++)
1986
for (uint32_t x = 0; x < BLOCK_W; x++)
1987
decoded_pixels_half3[y][x].set(decoded_pixels_half4[y][x][0], decoded_pixels_half4[y][x][1], decoded_pixels_half4[y][x][2]);
1988
1989
double trial_err = compute_block_error(BLOCK_W * BLOCK_H, (const basist::half_float*)pBlock_pixels_half, (const basist::half_float*)decoded_pixels_half3, coptions);
1990
if (trial_err < best_e)
1991
{
1992
best_e = trial_err;
1993
best_log_blk = trial_blk;
1994
}
1995
}
1996
1997
} // unique_p_iter
1998
1999
if (best_e < BIG_FLOAT_VAL)
2000
{
2001
res.m_log_blk = best_log_blk;
2002
res.m_valid = true;
2003
res.m_err = best_e;
2004
}
2005
else
2006
{
2007
res.m_valid = false;
2008
}
2009
2010
return res.m_valid;
2011
}
2012
2013
// Writes total_values (1..64) values from pVals to coder and returns the number of bits written.
//
// g_ise_range_table[endpoint_range] supplies the per-value bit count and the trit/quint flags.
// Each value is split into its low ep_bits bits and a remaining trit/quint digit. Output order:
//   1. complete digit groups (5 trits in 8 bits, or 3 quints in 7 bits),
//   2. one shorter code for a trailing partial group, if any,
//   3. the low bits of every value, ep_bits each.
static uint32_t encode_values(bitwise_coder &coder, uint32_t total_values, const uint8_t *pVals, uint32_t endpoint_range)
{
	const uint32_t MAX_VALS = 64;
	assert((total_values) && (total_values <= MAX_VALS));

	const uint32_t ep_bits = astc_helpers::g_ise_range_table[endpoint_range][0];
	const uint32_t ep_trits = astc_helpers::g_ise_range_table[endpoint_range][1];
	const uint32_t ep_quints = astc_helpers::g_ise_range_table[endpoint_range][2];

	// Digit base and the multiplier value reached once a group is complete (3^5 or 5^3).
	// radix stays 0 when the range uses neither trits nor quints.
	const uint32_t radix = ep_trits ? 3 : (ep_quints ? 5 : 0);
	const uint32_t full_group_mul = ep_trits ? 243 : 125;
	const uint32_t low_bits_mask = (1 << ep_bits) - 1;

	uint32_t bit_values[MAX_VALS], tq_values[(MAX_VALS + 2) / 3];
	uint32_t total_tq_values = 0, tq_accum = 0, tq_mul = 1;

	for (uint32_t i = 0; i < total_values; i++)
	{
		const uint32_t val = pVals[i];
		const uint32_t tq = val >> ep_bits;

		bit_values[i] = val & low_bits_mask;

		if (!radix)
			continue;

		// Accumulate the digit into the current group, least significant digit first.
		assert(tq < radix);
		tq_accum += tq * tq_mul;
		tq_mul *= radix;

		if (tq_mul == full_group_mul)
		{
			assert(total_tq_values < BASISU_ARRAY_SIZE(tq_values));
			tq_values[total_tq_values++] = tq_accum;
			tq_accum = 0;
			tq_mul = 1;
		}
	}

	uint32_t total_bits_output = 0;

	// Complete groups.
	const uint32_t full_group_bits = ep_trits ? 8 : 7;
	for (uint32_t i = 0; i < total_tq_values; i++)
	{
		coder.put_bits(tq_values[i], full_group_bits);
		total_bits_output += full_group_bits;
	}

	// Trailing partial group: the code length depends on how many digits it holds.
	if (tq_mul > 1)
	{
		uint32_t num_bits;
		if (ep_trits)
		{
			switch (tq_mul)
			{
			case 3:  num_bits = 2; break;
			case 9:  num_bits = 4; break;
			case 27: num_bits = 5; break;
			default: num_bits = 7; break; // tq_mul == 81
			}
		}
		else
		{
			num_bits = (tq_mul == 5) ? 3 : 5; // otherwise tq_mul == 25
		}

		coder.put_bits(tq_accum, num_bits);
		total_bits_output += num_bits;
	}

	// Low bits of every value.
	for (uint32_t i = 0; i < total_values; i++)
	{
		coder.put_bits(bit_values[i], ep_bits);
		total_bits_output += ep_bits;
	}

	return total_bits_output;
}
2104
2105
static inline uint32_t get_num_endpoint_vals(uint32_t cem)
2106
{
2107
assert((cem == 7) || (cem == 11));
2108
return (cem == 11) ? basist::NUM_MODE11_ENDPOINTS : basist::NUM_MODE7_ENDPOINTS;
2109
}
2110
2111
static void code_block(bitwise_coder& coder,
2112
const astc_helpers::log_astc_block& log_blk,
2113
block_mode block_mode_index,
2114
endpoint_mode em, const uint8_t *pEP_deltas)
2115
{
2116
coder.put_truncated_binary((uint32_t)block_mode_index, (uint32_t)block_mode::cBMTotalModes);
2117
coder.put_truncated_binary((uint32_t)em, (uint32_t)endpoint_mode::cTotal);
2118
2119
const uint32_t num_endpoint_vals = get_num_endpoint_vals(log_blk.m_color_endpoint_modes[0]);
2120
2121
if ((em == endpoint_mode::cUseLeftDelta) || (em == endpoint_mode::cUseUpperDelta))
2122
{
2123
assert(log_blk.m_num_partitions == 1);
2124
2125
for (uint32_t i = 0; i < num_endpoint_vals; i++)
2126
coder.put_bits(pEP_deltas[i], NUM_ENDPOINT_DELTA_BITS);
2127
}
2128
else if (em == endpoint_mode::cRaw)
2129
{
2130
if (log_blk.m_num_partitions == 2)
2131
{
2132
const int unique_partition_index = g_part2_seed_to_unique_index[log_blk.m_partition_id];
2133
assert(unique_partition_index != -1);
2134
2135
coder.put_truncated_binary(unique_partition_index, NUM_UNIQUE_PARTITIONS2);
2136
}
2137
else if (log_blk.m_num_partitions == 3)
2138
{
2139
const int unique_partition_index = g_part3_seed_to_unique_index[log_blk.m_partition_id];
2140
assert(unique_partition_index != -1);
2141
2142
coder.put_truncated_binary(unique_partition_index, NUM_UNIQUE_PARTITIONS3);
2143
}
2144
2145
encode_values(coder, num_endpoint_vals * log_blk.m_num_partitions, log_blk.m_endpoints, log_blk.m_endpoint_ise_range);
2146
}
2147
2148
encode_values(coder, log_blk.m_grid_width * log_blk.m_grid_height * (log_blk.m_dual_plane ? 2 : 1), log_blk.m_weights, log_blk.m_weight_ise_range);
2149
}
2150
2151
// Tunable parameters for the smooth-region map: a std-dev limit and a maximum MSE scale
// for each of three region sizes, plus two flags. Construction applies the defaults in clear().
struct smooth_map_params
{
	bool m_no_mse_scaling;

	// 3x3 region
	float m_max_smooth_std_dev;
	float m_smooth_max_mse_scale;

	// 7x7 region
	float m_max_med_smooth_std_dev;
	float m_med_smooth_max_mse_scale;

	// 11x11 region
	float m_max_ultra_smooth_std_dev;
	float m_ultra_smooth_max_mse_scale;

	bool m_debug_images;

	smooth_map_params() { clear(); }

	// Restores every field to its default value.
	void clear()
	{
		m_no_mse_scaling = false;
		m_debug_images = true;

		// 3x3 region
		m_max_smooth_std_dev = 100.0f;
		m_smooth_max_mse_scale = 13000.0f;

		// 7x7 region
		m_max_med_smooth_std_dev = 9.0f;
		m_med_smooth_max_mse_scale = 15000.0f;

		// 11x11 region
		// (Scales previously tried here: 4500, 10000, 50000, 100000, 400000, 800000.)
		m_max_ultra_smooth_std_dev = 4.0f;
		m_ultra_smooth_max_mse_scale = 2000000.0f;
	}
};
2196
2197
// Resampler contributor lists indexed by destination width; init_contrib_lists() fills
// entries 1 through 6 and does not write entry 0.
Resampler::Contrib_List* g_contrib_lists[7]; // 1-6
2198
2199
static void init_contrib_lists()
2200
{
2201
for (uint32_t dst_width = 1; dst_width <= 6; dst_width++)
2202
//g_contrib_lists[dst_width] = Resampler::make_clist(6, 6, basisu::Resampler::BOUNDARY_CLAMP, gaussian_filter, BASISU_GAUSSIAN_FILTER_SUPPORT, 6.0f / (float)dst_width, 0.0f);
2203
g_contrib_lists[dst_width] = Resampler::make_clist(6, 6, basisu::Resampler::BOUNDARY_CLAMP, gaussian_filter, BASISU_BELL_FILTER_SUPPORT, 6.0f / (float)dst_width, 0.0f);
2204
}
2205
2206
#if 0
// Disabled (compiled out) variant of filter_block(): filters a 6x6 block of vec3F pixels with the
// contribution lists for grid_x (rows) and grid_y (columns), then writes each result both as half floats
// (pDst_block_half3) and as qlog16 values stored in floats (pDst_block_q16, 4th component set to 0).
static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec3F* pSrc_block, half_vec3 *pDst_block_half3, vec4F *pDst_block_q16)
{
	vec3F row_filtered[6][6]; // [y][x]

	// Horizontal pass into row_filtered (a dimension of 6 is copied through unfiltered).
	if (grid_x != 6)
	{
		const Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];

		for (uint32_t y = 0; y < 6; y++)
		{
			const vec3F* pSrc_row = &pSrc_block[y * 6];

			for (uint32_t x = 0; x < 6; x++)
			{
				const Resampler::Contrib_List& taps = pRow_lists[x];

				vec3F accum(0.0f);
				for (uint32_t i = 0; i < taps.n; i++)
					accum += pSrc_row[taps.p[i].pixel] * taps.p[i].weight;

				accum.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);

				row_filtered[y][x] = accum;
			} // x
		} // y
	}
	else
	{
		memcpy(row_filtered, pSrc_block, sizeof(vec3F) * 6 * 6);
	}

	// Converts one filtered pixel to half float + qlog16 and stores both outputs.
	const auto store_pixel = [&](uint32_t dst_index, const vec3F& v)
	{
		for (uint32_t c = 0; c < 3; c++)
		{
			const basist::half_float h = basist::float_to_half(v[c]);

			pDst_block_half3[dst_index][c] = h;
			pDst_block_q16[dst_index][c] = (float)half_to_qlog16(h);
		}

		pDst_block_q16[dst_index][3] = 0.0f;
	};

	// Vertical pass
	if (grid_y != 6)
	{
		const Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y];

		for (uint32_t x = 0; x < 6; x++)
		{
			for (uint32_t y = 0; y < 6; y++)
			{
				const Resampler::Contrib_List& taps = pCol_lists[y];

				vec3F accum(0.0f);
				for (uint32_t i = 0; i < taps.n; i++)
					accum += row_filtered[taps.p[i].pixel][x] * taps.p[i].weight;

				accum.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);

				store_pixel(x + y * 6, accum);
			} // y
		} // x
	}
	else
	{
		for (uint32_t y = 0; y < 6; y++)
			for (uint32_t x = 0; x < 6; x++)
				store_pixel(x + y * 6, row_filtered[y][x]);
	}
}
#endif
2285
2286
static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec4F* pSrc_block, vec4F* pDst_block)
2287
{
2288
vec4F temp_block[6][6]; // [y][x]
2289
2290
// first filter rows to temp_block
2291
if (grid_x == 6)
2292
{
2293
memcpy(temp_block, pSrc_block, sizeof(vec4F) * 6 * 6);
2294
}
2295
else
2296
{
2297
Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];
2298
2299
for (uint32_t y = 0; y < 6; y++)
2300
{
2301
for (uint32_t x = 0; x < 6; x++)
2302
{
2303
vec3F p(0.0f);
2304
2305
for (uint32_t i = 0; i < pRow_lists[x].n; i++)
2306
p += vec3F(pSrc_block[y * 6 + pRow_lists[x].p[i].pixel]) * pRow_lists[x].p[i].weight;
2307
2308
p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);
2309
2310
temp_block[y][x] = p;
2311
} // x
2312
} // y
2313
}
2314
2315
// filter columns
2316
if (grid_y == 6)
2317
{
2318
for (uint32_t y = 0; y < 6; y++)
2319
{
2320
for (uint32_t x = 0; x < 6; x++)
2321
{
2322
for (uint32_t c = 0; c < 3; c++)
2323
pDst_block[x + y * 6][c] = temp_block[y][x][c];
2324
} // x
2325
} // y
2326
}
2327
else
2328
{
2329
Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y];
2330
2331
for (uint32_t x = 0; x < 6; x++)
2332
{
2333
for (uint32_t y = 0; y < 6; y++)
2334
{
2335
vec3F p(0.0f);
2336
2337
for (uint32_t i = 0; i < pCol_lists[y].n; i++)
2338
p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight;
2339
2340
p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);
2341
2342
pDst_block[x + y * 6] = p;
2343
2344
} // x
2345
} // y
2346
}
2347
}
2348
2349
static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec3F* pSrc_block, vec3F* pDst_block)
2350
{
2351
vec3F temp_block[6][6]; // [y][x]
2352
2353
// first filter rows to temp_block
2354
if (grid_x == 6)
2355
{
2356
memcpy(temp_block, pSrc_block, sizeof(vec3F) * 6 * 6);
2357
}
2358
else
2359
{
2360
Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];
2361
2362
for (uint32_t y = 0; y < 6; y++)
2363
{
2364
for (uint32_t x = 0; x < 6; x++)
2365
{
2366
vec3F p(0.0f);
2367
2368
for (uint32_t i = 0; i < pRow_lists[x].n; i++)
2369
p += vec3F(pSrc_block[y * 6 + pRow_lists[x].p[i].pixel]) * pRow_lists[x].p[i].weight;
2370
2371
temp_block[y][x] = p;
2372
} // x
2373
} // y
2374
}
2375
2376
// filter columns
2377
if (grid_y == 6)
2378
{
2379
memcpy(pDst_block, temp_block, sizeof(vec3F) * 6 * 6);
2380
}
2381
else
2382
{
2383
Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y];
2384
2385
for (uint32_t x = 0; x < 6; x++)
2386
{
2387
for (uint32_t y = 0; y < 6; y++)
2388
{
2389
vec3F& p = pDst_block[x + y * 6];
2390
p.set(0.0f);
2391
2392
for (uint32_t i = 0; i < pCol_lists[y].n; i++)
2393
p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight;
2394
} // x
2395
} // y
2396
}
2397
}
2398
2399
static float diff_blocks(const vec4F* pA, const vec4F* pB)
2400
{
2401
const uint32_t BLOCK_T = 36;
2402
2403
float diff = 0.0f;
2404
for (uint32_t i = 0; i < BLOCK_T; i++)
2405
diff += square(pA[i][0] - pB[i][0]) + square(pA[i][1] - pB[i][1]) + square(pA[i][2] - pB[i][2]);
2406
2407
return diff * (1.0f / (float)BLOCK_T);
2408
}
2409
2410
static float sub_and_compute_std_dev(const vec3F* pA, const vec3F* pB)
2411
{
2412
const uint32_t BLOCK_T = 36;
2413
2414
vec3F mean(0.0f);
2415
2416
for (uint32_t i = 0; i < BLOCK_T; i++)
2417
{
2418
vec3F diff(pA[i] - pB[i]);
2419
mean += diff;
2420
}
2421
2422
mean *= (1.0f / (float)BLOCK_T);
2423
2424
vec3F diff_sum(0.0f);
2425
for (uint32_t i = 0; i < BLOCK_T; i++)
2426
{
2427
vec3F diff(pA[i] - pB[i]);
2428
diff -= mean;
2429
diff_sum += vec3F::component_mul(diff, diff);
2430
}
2431
2432
vec3F var(diff_sum * (1.0f / (float)BLOCK_T));
2433
2434
vec3F std_dev(sqrtf(var[0]), sqrtf(var[1]), sqrtf(var[2]));
2435
2436
return maximum(std_dev[0], std_dev[1], std_dev[2]);
2437
}
2438
2439
static void create_smooth_maps2(
2440
vector2D<float>& smooth_block_mse_scales,
2441
const image& orig_img,
2442
smooth_map_params& params, image* pUltra_smooth_img = nullptr)
2443
{
2444
const uint32_t width = orig_img.get_width();
2445
const uint32_t height = orig_img.get_height();
2446
//const uint32_t total_pixels = orig_img.get_total_pixels();
2447
const uint32_t num_comps = 3;
2448
2449
if (params.m_no_mse_scaling)
2450
{
2451
smooth_block_mse_scales.set_all(1.0f);
2452
return;
2453
}
2454
2455
smooth_block_mse_scales.resize(width, height);
2456
2457
image smooth_vis, med_smooth_vis, ultra_smooth_vis;
2458
2459
if (params.m_debug_images)
2460
{
2461
smooth_vis.resize(width, height);
2462
med_smooth_vis.resize(width, height);
2463
ultra_smooth_vis.resize(width, height);
2464
}
2465
2466
for (uint32_t y = 0; y < height; y++)
2467
{
2468
for (uint32_t x = 0; x < width; x++)
2469
{
2470
{
2471
tracked_stat_dbl comp_stats[4];
2472
for (int yd = -1; yd <= 1; yd++)
2473
{
2474
for (int xd = -1; xd <= 1; xd++)
2475
{
2476
const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);
2477
2478
comp_stats[0].update((float)p[0]);
2479
comp_stats[1].update((float)p[1]);
2480
comp_stats[2].update((float)p[2]);
2481
}
2482
}
2483
2484
float max_std_dev = 0.0f;
2485
for (uint32_t i = 0; i < num_comps; i++)
2486
max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev());
2487
2488
float yl = clampf(max_std_dev / params.m_max_smooth_std_dev, 0.0f, 1.0f);
2489
//yl = powf(yl, 2.0f);
2490
yl = powf(yl, 1.0f / 2.0f); // substantially less bits
2491
2492
smooth_block_mse_scales(x, y) = lerp(params.m_smooth_max_mse_scale, 1.0f, yl);
2493
2494
if (params.m_debug_images)
2495
{
2496
//smooth_vis(x, y).set(clamp((int)((smooth_block_mse_scales(x, y) - 1.0f) / (params.m_smooth_max_mse_scale - 1.0f) * 255.0f + .5f), 0, 255));
2497
// white=high local activity (edges/detail)
2498
// black=low local activity (smooth - error is amplified)
2499
smooth_vis(x, y).set(clamp((int)((yl * 255.0f) + .5f), 0, 255));
2500
}
2501
}
2502
2503
{
2504
tracked_stat_dbl comp_stats[4];
2505
2506
const int S = 3;
2507
for (int yd = -S; yd < S; yd++)
2508
{
2509
for (int xd = -S; xd < S; xd++)
2510
{
2511
const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);
2512
2513
comp_stats[0].update((float)p[0]);
2514
comp_stats[1].update((float)p[1]);
2515
comp_stats[2].update((float)p[2]);
2516
}
2517
}
2518
2519
float max_std_dev = 0.0f;
2520
for (uint32_t i = 0; i < num_comps; i++)
2521
max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev());
2522
2523
float yl = clampf(max_std_dev / params.m_max_med_smooth_std_dev, 0.0f, 1.0f);
2524
//yl = powf(yl, 2.0f);
2525
2526
smooth_block_mse_scales(x, y) = lerp(params.m_med_smooth_max_mse_scale, smooth_block_mse_scales(x, y), yl);
2527
2528
if (params.m_debug_images)
2529
med_smooth_vis(x, y).set((int)std::round(yl * 255.0f));
2530
}
2531
2532
{
2533
tracked_stat_dbl comp_stats[4];
2534
2535
const int S = 5;
2536
for (int yd = -S; yd < S; yd++)
2537
{
2538
for (int xd = -S; xd < S; xd++)
2539
{
2540
const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);
2541
2542
comp_stats[0].update((float)p[0]);
2543
comp_stats[1].update((float)p[1]);
2544
comp_stats[2].update((float)p[2]);
2545
}
2546
}
2547
2548
float max_std_dev = 0.0f;
2549
for (uint32_t i = 0; i < num_comps; i++)
2550
max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev());
2551
2552
float yl = clampf(max_std_dev / params.m_max_ultra_smooth_std_dev, 0.0f, 1.0f);
2553
yl = powf(yl, 2.0f);
2554
2555
smooth_block_mse_scales(x, y) = lerp(params.m_ultra_smooth_max_mse_scale, smooth_block_mse_scales(x, y), yl);
2556
2557
if (params.m_debug_images)
2558
ultra_smooth_vis(x, y).set((int)std::round(yl * 255.0f));
2559
}
2560
2561
}
2562
}
2563
2564
if (params.m_debug_images)
2565
{
2566
save_png("dbg_smooth_vis.png", smooth_vis);
2567
save_png("dbg_med_smooth_vis.png", med_smooth_vis);
2568
save_png("dbg_ultra_smooth_vis.png", ultra_smooth_vis);
2569
2570
image vis_img(width, height);
2571
2572
float max_scale = 0.0f;
2573
for (uint32_t y = 0; y < height; y++)
2574
for (uint32_t x = 0; x < width; x++)
2575
max_scale = basisu::maximumf(max_scale, smooth_block_mse_scales(x, y));
2576
2577
for (uint32_t y = 0; y < height; y++)
2578
for (uint32_t x = 0; x < width; x++)
2579
vis_img(x, y).set((int)std::round(smooth_block_mse_scales(x, y) * 255.0f / max_scale));
2580
2581
save_png("scale_vis.png", vis_img);
2582
}
2583
2584
if (pUltra_smooth_img)
2585
*pUltra_smooth_img = ultra_smooth_vis;
2586
}
2587
2588
// Upper edge of the smoothstep() applied to the original pixel's I value (ITP element 0) by the dark
// adjustment in compute_pixel_mse_itp() and compute_pixel_delta_itp().
const float REALLY_DARK_I_THRESHOLD = 0.0625f;
2589
// Error multiplier used by compute_pixel_mse_itp() where the smoothstep is 0; it is lerped toward 1.0
// as the smoothstep approaches 1.
const float REALLY_DARK_MSE_ERR_SCALE = 128.0f;
2590
// Error multiplier used by compute_pixel_delta_itp() where the smoothstep is 0; it is lerped toward 1.0
// as the smoothstep approaches 1.
const float REALLY_DARK_DELTA_ITP_JND_SCALE = 5.0f;
2591
2592
static float compute_pixel_mse_itp(const vec3F& orig_pixel_itp, const vec3F& comp_pixel_itp, bool delta_itp_dark_adjustment)
2593
{
2594
float delta_i = orig_pixel_itp[0] - comp_pixel_itp[0];
2595
float delta_t = orig_pixel_itp[1] - comp_pixel_itp[1];
2596
float delta_p = orig_pixel_itp[2] - comp_pixel_itp[2];
2597
2598
float err = (delta_i * delta_i) + (delta_t * delta_t) + (delta_p * delta_p);
2599
2600
if (delta_itp_dark_adjustment)
2601
{
2602
// We have to process a large range of inputs, including extremely dark inputs.
2603
// Artifically amplify MSE on very dark pixels - otherwise they'll be overly compressed at higher lambdas.
2604
// This is to better handle very dark signals which could be explictly overexposed.
2605
float s = bu_math::smoothstep(0.0f, REALLY_DARK_I_THRESHOLD, orig_pixel_itp[0]);
2606
s = lerp(REALLY_DARK_MSE_ERR_SCALE, 1.0f, s);
2607
err *= s;
2608
}
2609
2610
return err;
2611
}
2612
2613
static float compute_block_mse_itp(uint32_t block_w, uint32_t block_h, const vec3F* pOrig_pixels_itp, const vec3F* pComp_pixels_itp, bool delta_itp_dark_adjustment)
2614
{
2615
float total_mse = 0.0f;
2616
2617
for (uint32_t y = 0; y < block_h; y++)
2618
{
2619
for (uint32_t x = 0; x < block_w; x++)
2620
{
2621
total_mse += compute_pixel_mse_itp(pOrig_pixels_itp[x + y * block_w], pComp_pixels_itp[x + y * block_w], delta_itp_dark_adjustment);
2622
} // x
2623
} // y
2624
2625
return total_mse * (1.0f / (float)(block_w * block_h));
2626
}
2627
2628
static float compute_block_ssim_itp(uint32_t block_w, uint32_t block_h, const vec3F* pOrig_pixels_itp, const vec3F* pComp_pixels_itp)
2629
{
2630
const uint32_t n = block_w * block_h;
2631
assert(n <= 36);
2632
2633
stats<float> x_stats[3], y_stats[3];
2634
comparative_stats<float> xy_cov[3];
2635
2636
for (uint32_t c = 0; c < 3; c++)
2637
{
2638
x_stats[c].calc_simplified(n, &pOrig_pixels_itp[0][c], 3);
2639
y_stats[c].calc_simplified(n, &pComp_pixels_itp[0][c], 3);
2640
}
2641
2642
for (uint32_t c = 0; c < 3; c++)
2643
xy_cov[c].calc_cov(n, &pOrig_pixels_itp[0][c], &pComp_pixels_itp[0][c], 3, 3, &x_stats[c], &y_stats[c]);
2644
2645
float ssim[3];
2646
const double d = 1.0f, k1 = .01f, k2 = .03f;
2647
2648
// weight mean error more highly to reduce blocking
2649
float ap = 1.5f, bp = 1.0f, cp = 1.0f;
2650
2651
const double s_c1 = square(k1 * d), s_c2 = square(k2 * d);
2652
const double s_c3(s_c2 * .5f);
2653
2654
for (uint32_t c = 0; c < 3; c++)
2655
{
2656
float lum = (float)((2.0f * x_stats[c].m_avg * y_stats[c].m_avg + s_c1) / (square(x_stats[c].m_avg) + square(y_stats[c].m_avg) + s_c1));
2657
lum = saturate(lum);
2658
2659
float con = (float)((2.0f * x_stats[c].m_std_dev * y_stats[c].m_std_dev + s_c2) / (x_stats[c].m_var + y_stats[c].m_var + s_c2));
2660
con = saturate(con);
2661
2662
float str = (float)((xy_cov[c].m_cov + s_c3) / (x_stats[c].m_std_dev * y_stats[c].m_std_dev + s_c3));
2663
str = saturate(str);
2664
2665
ssim[c] = powf(lum, ap) * powf(con, bp) * powf(str, cp);
2666
}
2667
2668
#if 0
2669
float final_ssim = (ssim[0] * .4f + ssim[1] * .3f + ssim[2] * .3f);
2670
#elif 1
2671
float final_ssim = ssim[0] * ssim[1] * ssim[2];
2672
#else
2673
const float LP = .75f;
2674
float final_ssim = ssim[0] * powf((ssim[1] + ssim[2]) * .5f, LP);
2675
#endif
2676
2677
return final_ssim;
2678
}
2679
2680
// delta ITP, 1.0 is JND (Rec. ITU-R BT.2124), modified for higher error at low light
2681
static float compute_pixel_delta_itp(const vec3F& a, const vec3F& b, const vec3F& orig, bool delta_itp_dark_adjustment)
2682
{
2683
float delta_i = a[0] - b[0];
2684
float delta_t = a[1] - b[1];
2685
float delta_p = a[2] - b[2];
2686
2687
float err = 720.0f * sqrtf((delta_i * delta_i) + (delta_t * delta_t) + (delta_p * delta_p));
2688
2689
float s = bu_math::smoothstep(0.0f, REALLY_DARK_I_THRESHOLD, orig[0]);
2690
2691
if (delta_itp_dark_adjustment)
2692
{
2693
// This is to better handle very dark signals which could be explictly overexposed.
2694
s = lerp(REALLY_DARK_DELTA_ITP_JND_SCALE, 1.0f, s);
2695
err *= s;
2696
}
2697
2698
return err;
2699
}
2700
2701
struct candidate_encoding
2702
{
2703
encoding_type m_encoding_type;
2704
2705
basist::half_float m_solid_color[3];
2706
2707
uint32_t m_run_len;
2708
2709
vec3F m_comp_pixels[MAX_BLOCK_H][MAX_BLOCK_W]; // [y][x]
2710
vec3F m_comp_pixels_itp[MAX_BLOCK_H][MAX_BLOCK_W]; // [y][x]
2711
2712
endpoint_mode m_endpoint_mode;
2713
block_mode m_block_mode;
2714
2715
bitwise_coder m_coder;
2716
2717
// The block to code, which may not be valid ASTC. This may have to be transcoded (by requantizing the weights/endpoints) before it's valid ASTC.
2718
// Note the endpoints may be coded endpoints OR transcoded endpoints, depending on the encoding type.
2719
astc_helpers::log_astc_block m_coded_log_blk;
2720
2721
// The block the decoder outputs.
2722
astc_helpers::log_astc_block m_decomp_log_blk;
2723
2724
int m_reuse_delta_index;
2725
2726
float m_t, m_d, m_bits;
2727
2728
candidate_encoding()
2729
{
2730
clear();
2731
}
2732
2733
candidate_encoding(const candidate_encoding &other)
2734
{
2735
*this = other;
2736
}
2737
2738
candidate_encoding(candidate_encoding&& other)
2739
{
2740
*this = std::move(other);
2741
}
2742
2743
candidate_encoding& operator=(const candidate_encoding& rhs)
2744
{
2745
if (this == &rhs)
2746
return *this;
2747
2748
m_encoding_type = rhs.m_encoding_type;
2749
memcpy(m_solid_color, rhs.m_solid_color, sizeof(m_solid_color));
2750
m_run_len = rhs.m_run_len;
2751
memcpy(m_comp_pixels, rhs.m_comp_pixels, sizeof(m_comp_pixels));
2752
m_endpoint_mode = rhs.m_endpoint_mode;
2753
m_block_mode = rhs.m_block_mode;
2754
m_coder = rhs.m_coder;
2755
m_coded_log_blk = rhs.m_coded_log_blk;
2756
m_decomp_log_blk = rhs.m_decomp_log_blk;
2757
m_reuse_delta_index = rhs.m_reuse_delta_index;
2758
2759
return *this;
2760
}
2761
2762
candidate_encoding& operator=(candidate_encoding&& rhs)
2763
{
2764
if (this == &rhs)
2765
return *this;
2766
2767
m_encoding_type = rhs.m_encoding_type;
2768
memcpy(m_solid_color, rhs.m_solid_color, sizeof(m_solid_color));
2769
m_run_len = rhs.m_run_len;
2770
memcpy(m_comp_pixels, rhs.m_comp_pixels, sizeof(m_comp_pixels));
2771
m_endpoint_mode = rhs.m_endpoint_mode;
2772
m_block_mode = rhs.m_block_mode;
2773
m_coder = std::move(rhs.m_coder);
2774
m_coded_log_blk = rhs.m_coded_log_blk;
2775
m_decomp_log_blk = rhs.m_decomp_log_blk;
2776
m_reuse_delta_index = rhs.m_reuse_delta_index;
2777
2778
return *this;
2779
}
2780
2781
void clear()
2782
{
2783
m_encoding_type = encoding_type::cInvalid;
2784
2785
clear_obj(m_solid_color);
2786
2787
m_run_len = 0;
2788
2789
clear_obj(m_comp_pixels);
2790
2791
m_endpoint_mode = endpoint_mode::cInvalid;
2792
m_block_mode = block_mode::cInvalid;
2793
2794
m_coder.restart();
2795
2796
m_coded_log_blk.clear();
2797
m_decomp_log_blk.clear();
2798
2799
m_t = 0;
2800
m_d = 0;
2801
m_bits = 0;
2802
2803
m_reuse_delta_index = 0;
2804
}
2805
};
2806
2807
bool decode_astc_block(uint32_t block_w, uint32_t block_h, astc_helpers::log_astc_block &log_blk, vec3F *pPixels)
2808
{
2809
assert((block_w <= 6) && (block_h <= 6));
2810
2811
half_vec4 decoded_pixels_half4[6 * 6]; // [y][x]
2812
bool status = astc_helpers::decode_block(log_blk, decoded_pixels_half4, block_w, block_h, astc_helpers::cDecodeModeHDR16);
2813
assert(status);
2814
2815
if (!status)
2816
return false;
2817
2818
for (uint32_t y = 0; y < block_h; y++)
2819
{
2820
for (uint32_t x = 0; x < block_w; x++)
2821
{
2822
pPixels[x + y * block_w].set(
2823
basist::half_to_float(decoded_pixels_half4[x + y * block_w][0]),
2824
basist::half_to_float(decoded_pixels_half4[x + y * block_w][1]),
2825
basist::half_to_float(decoded_pixels_half4[x + y * block_w][2]));
2826
} // x
2827
} //y
2828
2829
return true;
2830
}
2831
2832
// Returns true if decomp_blk can be packed into a physical ASTC block by astc_helpers::pack_astc_block().
// The packed result itself is discarded.
static inline bool validate_log_blk(const astc_helpers::log_astc_block &decomp_blk)
{
	astc_helpers::astc_block scratch_phys_blk;

	const bool packed_ok = astc_helpers::pack_astc_block(scratch_phys_blk, decomp_blk);
	return packed_ok;
}
2837
2838
// When nonzero, decode_file() reads a 16-bit marker before each decoding step and fails unless it equals 0xDEAD.
// Presumably a debugging aid that must match the encoder's setting - the encoder side is not visible here.
#define SYNC_MARKERS (0)
2839
2840
static bool decode_file(const uint8_vec& comp_data, vector2D<astc_helpers::astc_block>& decoded_blocks, uint32_t &width, uint32_t &height)
2841
{
2842
interval_timer tm;
2843
tm.start();
2844
2845
const uint32_t BLOCK_W = 6, BLOCK_H = 6;
2846
2847
width = 0;
2848
height = 0;
2849
2850
if (comp_data.size() <= 2*3)
2851
return false;
2852
2853
basist::bitwise_decoder decoder;
2854
if (!decoder.init(comp_data.data(), comp_data.size_u32()))
2855
return false;
2856
2857
if (decoder.get_bits(16) != 0xABCD)
2858
return false;
2859
2860
width = decoder.get_bits(16);
2861
height = decoder.get_bits(16);
2862
2863
if (!width || !height || (width > MAX_ASTC_HDR_6X6_DIM) || (height > MAX_ASTC_HDR_6X6_DIM))
2864
return false;
2865
2866
const uint32_t num_blocks_x = (width + BLOCK_W - 1) / BLOCK_W;
2867
const uint32_t num_blocks_y = (height + BLOCK_H - 1) / BLOCK_H;
2868
const uint32_t total_blocks = num_blocks_x * num_blocks_y;
2869
2870
decoded_blocks.resize(num_blocks_x, num_blocks_y);
2871
//memset(decoded_blocks.get_ptr(), 0, decoded_blocks.size_in_bytes());
2872
2873
vector2D<astc_helpers::log_astc_block> decoded_log_blocks(num_blocks_x, num_blocks_y);
2874
//memset(decoded_log_blocks.get_ptr(), 0, decoded_log_blocks.size_in_bytes());
2875
2876
uint32_t cur_bx = 0, cur_by = 0;
2877
uint32_t step_counter = 0;
2878
BASISU_NOTE_UNUSED(step_counter);
2879
2880
while (cur_by < num_blocks_y)
2881
{
2882
step_counter++;
2883
2884
//if ((cur_bx == 9) && (cur_by == 13))
2885
// printf("!");
2886
2887
#if SYNC_MARKERS
2888
uint32_t mk = decoder.get_bits(16);
2889
if (mk != 0xDEAD)
2890
{
2891
printf("!");
2892
assert(0);
2893
return false;
2894
}
2895
#endif
2896
if (decoder.get_bits_remaining() < 1)
2897
return false;
2898
2899
encoding_type et = encoding_type::cBlock;
2900
2901
uint32_t b0 = decoder.get_bits(1);
2902
if (!b0)
2903
{
2904
uint32_t b1 = decoder.get_bits(1);
2905
if (b1)
2906
et = encoding_type::cReuse;
2907
else
2908
{
2909
uint32_t b2 = decoder.get_bits(1);
2910
if (b2)
2911
et = encoding_type::cSolid;
2912
else
2913
et = encoding_type::cRun;
2914
}
2915
}
2916
2917
switch (et)
2918
{
2919
case encoding_type::cRun:
2920
{
2921
if (!cur_bx && !cur_by)
2922
return false;
2923
2924
const uint32_t run_len = decoder.decode_vlc(5) + 1;
2925
2926
uint32_t num_blocks_remaining = total_blocks - (cur_bx + cur_by * num_blocks_x);
2927
if (run_len > num_blocks_remaining)
2928
return false;
2929
2930
uint32_t prev_bx = cur_bx, prev_by = cur_by;
2931
2932
if (cur_bx)
2933
prev_bx--;
2934
else
2935
{
2936
prev_bx = num_blocks_x - 1;
2937
prev_by--;
2938
}
2939
2940
const astc_helpers::log_astc_block& prev_log_blk = decoded_log_blocks(prev_bx, prev_by);
2941
const astc_helpers::astc_block& prev_phys_blk = decoded_blocks(prev_bx, prev_by);
2942
2943
for (uint32_t i = 0; i < run_len; i++)
2944
{
2945
decoded_log_blocks(cur_bx, cur_by) = prev_log_blk;
2946
decoded_blocks(cur_bx, cur_by) = prev_phys_blk;
2947
2948
cur_bx++;
2949
if (cur_bx == num_blocks_x)
2950
{
2951
cur_bx = 0;
2952
cur_by++;
2953
}
2954
}
2955
2956
break;
2957
}
2958
case encoding_type::cSolid:
2959
{
2960
const basist::half_float rh = (basist::half_float)decoder.get_bits(15);
2961
const basist::half_float gh = (basist::half_float)decoder.get_bits(15);
2962
const basist::half_float bh = (basist::half_float)decoder.get_bits(15);
2963
2964
astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
2965
2966
log_blk.clear();
2967
log_blk.m_solid_color_flag_hdr = true;
2968
log_blk.m_solid_color[0] = rh;
2969
log_blk.m_solid_color[1] = gh;
2970
log_blk.m_solid_color[2] = bh;
2971
log_blk.m_solid_color[3] = basist::float_to_half(1.0f);
2972
2973
bool status = astc_helpers::pack_astc_block(decoded_blocks(cur_bx, cur_by), log_blk);
2974
if (!status)
2975
return false;
2976
2977
cur_bx++;
2978
if (cur_bx == num_blocks_x)
2979
{
2980
cur_bx = 0;
2981
cur_by++;
2982
}
2983
2984
break;
2985
}
2986
case encoding_type::cReuse:
2987
{
2988
if (!cur_bx && !cur_by)
2989
return false;
2990
2991
const uint32_t reuse_delta_index = decoder.get_bits(REUSE_XY_DELTA_BITS);
2992
2993
const int reuse_delta_x = g_reuse_xy_deltas[reuse_delta_index].m_x;
2994
const int reuse_delta_y = g_reuse_xy_deltas[reuse_delta_index].m_y;
2995
2996
const int prev_bx = cur_bx + reuse_delta_x, prev_by = cur_by + reuse_delta_y;
2997
if ((prev_bx < 0) || (prev_bx >= (int)num_blocks_x))
2998
return false;
2999
if (prev_by < 0)
3000
return false;
3001
3002
const astc_helpers::log_astc_block& prev_log_blk = decoded_log_blocks(prev_bx, prev_by);
3003
const astc_helpers::astc_block& prev_phys_blk = decoded_blocks(prev_bx, prev_by);
3004
3005
if (prev_log_blk.m_solid_color_flag_hdr)
3006
return false;
3007
3008
astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
3009
astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);
3010
3011
log_blk = prev_log_blk;
3012
3013
const uint32_t total_grid_weights = log_blk.m_grid_width * log_blk.m_grid_height * (log_blk.m_dual_plane ? 2 : 1);
3014
3015
bool status = basist::astc_6x6_hdr::decode_values(decoder, total_grid_weights, log_blk.m_weight_ise_range, log_blk.m_weights);
3016
if (!status)
3017
return false;
3018
3019
astc_helpers::log_astc_block decomp_blk;
3020
status = astc_helpers::unpack_block(&prev_phys_blk, decomp_blk, BLOCK_W, BLOCK_H);
3021
if (!status)
3022
return false;
3023
3024
uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
3025
basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, log_blk.m_weight_ise_range, transcode_weights, decomp_blk.m_weight_ise_range);
3026
3027
copy_weight_grid(log_blk.m_dual_plane, log_blk.m_grid_width, log_blk.m_grid_height, transcode_weights, decomp_blk);
3028
3029
status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);
3030
if (!status)
3031
return false;
3032
3033
cur_bx++;
3034
if (cur_bx == num_blocks_x)
3035
{
3036
cur_bx = 0;
3037
cur_by++;
3038
}
3039
3040
break;
3041
}
3042
case encoding_type::cBlock:
3043
{
3044
const block_mode bm = (block_mode)decoder.decode_truncated_binary((uint32_t)block_mode::cBMTotalModes);
3045
const endpoint_mode em = (endpoint_mode)decoder.decode_truncated_binary((uint32_t)endpoint_mode::cTotal);
3046
3047
switch (em)
3048
{
3049
case endpoint_mode::cUseLeft:
3050
case endpoint_mode::cUseUpper:
3051
{
3052
int neighbor_bx = cur_bx, neighbor_by = cur_by;
3053
3054
if (em == endpoint_mode::cUseLeft)
3055
neighbor_bx--;
3056
else
3057
neighbor_by--;
3058
3059
if ((neighbor_bx < 0) || (neighbor_by < 0))
3060
return false;
3061
3062
const astc_helpers::log_astc_block& neighbor_blk = decoded_log_blocks(neighbor_bx, neighbor_by);
3063
if (!neighbor_blk.m_color_endpoint_modes[0])
3064
return false;
3065
3066
const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm];
3067
const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem);
3068
3069
if (bmd.m_cem != neighbor_blk.m_color_endpoint_modes[0])
3070
return false;
3071
3072
astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
3073
astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);
3074
3075
log_blk.clear();
3076
log_blk.m_num_partitions = 1;
3077
log_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;
3078
log_blk.m_endpoint_ise_range = neighbor_blk.m_endpoint_ise_range;
3079
log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range;
3080
log_blk.m_grid_width = (uint8_t)bmd.m_grid_x;
3081
log_blk.m_grid_height = (uint8_t)bmd.m_grid_y;
3082
log_blk.m_dual_plane = (uint8_t)bmd.m_dp;
3083
log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
3084
3085
memcpy(log_blk.m_endpoints, neighbor_blk.m_endpoints, num_endpoint_values);
3086
3087
const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1);
3088
3089
bool status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights);
3090
if (!status)
3091
return false;
3092
3093
astc_helpers::log_astc_block decomp_blk;
3094
decomp_blk.clear();
3095
3096
decomp_blk.m_num_partitions = 1;
3097
decomp_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;
3098
decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range;
3099
decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range;
3100
decomp_blk.m_dual_plane = bmd.m_dp;
3101
decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
3102
3103
basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints);
3104
3105
uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
3106
basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range);
3107
3108
copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk);
3109
3110
status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);
3111
if (!status)
3112
return false;
3113
3114
cur_bx++;
3115
if (cur_bx == num_blocks_x)
3116
{
3117
cur_bx = 0;
3118
cur_by++;
3119
}
3120
3121
break;
3122
}
3123
case endpoint_mode::cUseLeftDelta:
3124
case endpoint_mode::cUseUpperDelta:
3125
{
3126
int neighbor_bx = cur_bx, neighbor_by = cur_by;
3127
3128
if (em == endpoint_mode::cUseLeftDelta)
3129
neighbor_bx--;
3130
else
3131
neighbor_by--;
3132
3133
if ((neighbor_bx < 0) || (neighbor_by < 0))
3134
return false;
3135
3136
const astc_helpers::log_astc_block& neighbor_blk = decoded_log_blocks(neighbor_bx, neighbor_by);
3137
if (!neighbor_blk.m_color_endpoint_modes[0])
3138
return false;
3139
3140
const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm];
3141
const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem);
3142
3143
if (bmd.m_cem != neighbor_blk.m_color_endpoint_modes[0])
3144
return false;
3145
3146
astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
3147
astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);
3148
3149
log_blk.clear();
3150
log_blk.m_num_partitions = 1;
3151
log_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;
3152
log_blk.m_dual_plane = bmd.m_dp;
3153
log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
3154
3155
log_blk.m_endpoint_ise_range = (uint8_t)bmd.m_endpoint_ise_range;
3156
basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, neighbor_blk.m_endpoint_ise_range, neighbor_blk.m_endpoints, bmd.m_endpoint_ise_range, log_blk.m_endpoints);
3157
3158
const int total_endpoint_delta_vals = 1 << NUM_ENDPOINT_DELTA_BITS;
3159
const int low_delta_limit = -(total_endpoint_delta_vals / 2); // high_delta_limit = (total_endpoint_delta_vals / 2) - 1;
3160
3161
const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range).m_ISE_to_rank;
3162
const auto& rank_to_ise = astc_helpers::g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range).m_rank_to_ISE;
3163
const int total_endpoint_levels = astc_helpers::get_ise_levels(log_blk.m_endpoint_ise_range);
3164
3165
for (uint32_t i = 0; i < num_endpoint_values; i++)
3166
{
3167
int cur_val = ise_to_rank[log_blk.m_endpoints[i]];
3168
3169
int delta = (int)decoder.get_bits(NUM_ENDPOINT_DELTA_BITS) + low_delta_limit;
3170
3171
cur_val += delta;
3172
if ((cur_val < 0) || (cur_val >= total_endpoint_levels))
3173
return false;
3174
3175
log_blk.m_endpoints[i] = rank_to_ise[cur_val];
3176
}
3177
3178
log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range;
3179
log_blk.m_grid_width = (uint8_t)bmd.m_grid_x;
3180
log_blk.m_grid_height = (uint8_t)bmd.m_grid_y;
3181
3182
const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1);
3183
3184
bool status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights);
3185
if (!status)
3186
return false;
3187
3188
astc_helpers::log_astc_block decomp_blk;
3189
decomp_blk.clear();
3190
3191
decomp_blk.m_num_partitions = 1;
3192
decomp_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;
3193
decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range;
3194
decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range;
3195
decomp_blk.m_dual_plane = (uint8_t)bmd.m_dp;
3196
decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
3197
3198
basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints);
3199
3200
uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
3201
basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range);
3202
3203
copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk);
3204
3205
status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);
3206
if (!status)
3207
return false;
3208
3209
cur_bx++;
3210
if (cur_bx == num_blocks_x)
3211
{
3212
cur_bx = 0;
3213
cur_by++;
3214
}
3215
3216
break;
3217
}
3218
case endpoint_mode::cRaw:
3219
{
3220
const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm];
3221
3222
const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem);
3223
3224
astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
3225
astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);
3226
3227
log_blk.clear();
3228
log_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions;
3229
3230
for (uint32_t p = 0; p < bmd.m_num_partitions; p++)
3231
log_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem;
3232
3233
log_blk.m_endpoint_ise_range = (uint8_t)bmd.m_endpoint_ise_range;
3234
log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range;
3235
3236
log_blk.m_grid_width = (uint8_t)bmd.m_grid_x;
3237
log_blk.m_grid_height = (uint8_t)bmd.m_grid_y;
3238
log_blk.m_dual_plane = (uint8_t)bmd.m_dp;
3239
log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
3240
3241
if (bmd.m_num_partitions == 2)
3242
{
3243
const uint32_t unique_partition_index = decoder.decode_truncated_binary(NUM_UNIQUE_PARTITIONS2);
3244
log_blk.m_partition_id = (uint16_t)g_part2_unique_index_to_seed[unique_partition_index];
3245
}
3246
else if (bmd.m_num_partitions == 3)
3247
{
3248
const uint32_t unique_partition_index = decoder.decode_truncated_binary(NUM_UNIQUE_PARTITIONS3);
3249
log_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_partition_index];
3250
}
3251
3252
bool status = decode_values(decoder, num_endpoint_values * bmd.m_num_partitions, bmd.m_endpoint_ise_range, log_blk.m_endpoints);
3253
if (!status)
3254
return false;
3255
3256
const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1);
3257
3258
status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights);
3259
if (!status)
3260
return false;
3261
3262
astc_helpers::log_astc_block decomp_blk;
3263
decomp_blk.clear();
3264
3265
decomp_blk.m_dual_plane = bmd.m_dp;
3266
decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
3267
decomp_blk.m_partition_id = log_blk.m_partition_id;
3268
3269
decomp_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions;
3270
3271
for (uint32_t p = 0; p < bmd.m_num_partitions; p++)
3272
decomp_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem;
3273
3274
decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range;
3275
decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range;
3276
3277
for (uint32_t p = 0; p < bmd.m_num_partitions; p++)
3278
basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, bmd.m_endpoint_ise_range, log_blk.m_endpoints + num_endpoint_values * p, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints + num_endpoint_values * p);
3279
3280
uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
3281
basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range);
3282
3283
copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk);
3284
3285
status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);
3286
if (!status)
3287
return false;
3288
3289
cur_bx++;
3290
if (cur_bx == num_blocks_x)
3291
{
3292
cur_bx = 0;
3293
cur_by++;
3294
}
3295
3296
break;
3297
}
3298
default:
3299
{
3300
assert(0);
3301
return false;
3302
}
3303
}
3304
3305
break;
3306
}
3307
default:
3308
{
3309
assert(0);
3310
return false;
3311
}
3312
}
3313
}
3314
3315
if (decoder.get_bits(16) != 0xA742)
3316
{
3317
fmt_error_printf("End marker not found!\n");
3318
return false;
3319
}
3320
3321
//fmt_printf("Total decode_file() time: {} secs\n", tm.get_elapsed_secs());
3322
3323
return true;
3324
}
3325
3326
static bool unpack_physical_astc_block(const void* pBlock, uint32_t block_width, uint32_t block_height, vec4F* pPixels)
3327
{
3328
astc_helpers::log_astc_block log_blk;
3329
if (!astc_helpers::unpack_block(pBlock, log_blk, block_width, block_height))
3330
return false;
3331
3332
basist::half_float half_block[MAX_BLOCK_W * MAX_BLOCK_H][4];
3333
if (!astc_helpers::decode_block(log_blk, half_block, block_width, block_height, astc_helpers::cDecodeModeHDR16))
3334
return false;
3335
3336
const uint32_t total_block_pixels = block_width * block_height;
3337
for (uint32_t p = 0; p < total_block_pixels; p++)
3338
{
3339
pPixels[p][0] = basist::half_to_float(half_block[p][0]);
3340
pPixels[p][1] = basist::half_to_float(half_block[p][1]);
3341
pPixels[p][2] = basist::half_to_float(half_block[p][2]);
3342
pPixels[p][3] = basist::half_to_float(half_block[p][3]);
3343
}
3344
3345
return true;
3346
}
3347
3348
static bool unpack_physical_astc_block_google(const void* pBlock, uint32_t block_width, uint32_t block_height, vec4F* pPixels)
3349
{
3350
return basisu_astc::astc::decompress_hdr((float *)pPixels, (uint8_t*)pBlock, block_width, block_height);
3351
}
3352
3353
static bool pack_bc6h_image(const imagef &src_img, vector2D<basist::bc6h_block> &bc6h_blocks, imagef *pPacked_bc6h_img, const fast_bc6h_params &enc_params)
3354
{
3355
const uint32_t width = src_img.get_width();
3356
const uint32_t height = src_img.get_height();
3357
3358
if (pPacked_bc6h_img)
3359
pPacked_bc6h_img->resize(width, height);
3360
3361
interval_timer tm;
3362
double total_enc_time = 0.0f;
3363
3364
const uint32_t num_blocks_x = src_img.get_block_width(4);
3365
const uint32_t num_blocks_y = src_img.get_block_height(4);
3366
3367
bc6h_blocks.resize(num_blocks_x, num_blocks_y);
3368
3369
for (uint32_t by = 0; by < num_blocks_y; by++)
3370
{
3371
for (uint32_t bx = 0; bx < num_blocks_x; bx++)
3372
{
3373
// Extract source image block
3374
vec4F block_pixels[4][4]; // [y][x]
3375
src_img.extract_block_clamped(&block_pixels[0][0], bx * 4, by * 4, 4, 4);
3376
3377
basist::half_float half_pixels[16 * 3]; // [y][x]
3378
3379
for (uint32_t y = 0; y < 4; y++)
3380
{
3381
for (uint32_t x = 0; x < 4; x++)
3382
{
3383
for (uint32_t c = 0; c < 3; c++)
3384
{
3385
float v = block_pixels[y][x][c];
3386
3387
basist::half_float h = basist::float_to_half(v);
3388
3389
half_pixels[(x + y * 4) * 3 + c] = h;
3390
3391
} // c
3392
3393
} // x
3394
} // y
3395
3396
basist::bc6h_block& bc6h_blk = bc6h_blocks(bx, by);
3397
3398
tm.start();
3399
3400
basist::astc_6x6_hdr::fast_encode_bc6h(half_pixels, &bc6h_blk, enc_params);
3401
3402
total_enc_time += tm.get_elapsed_secs();
3403
3404
if (pPacked_bc6h_img)
3405
{
3406
basist::half_float unpacked_blk[16 * 3];
3407
bool status = unpack_bc6h(&bc6h_blk, unpacked_blk, false);
3408
assert(status);
3409
if (!status)
3410
{
3411
fmt_error_printf("unpack_bc6h() failed\n");
3412
return false;
3413
}
3414
3415
for (uint32_t y = 0; y < 4; y++)
3416
{
3417
for (uint32_t x = 0; x < 4; x++)
3418
{
3419
vec4F p;
3420
3421
for (uint32_t c = 0; c < 3; c++)
3422
{
3423
float v = basist::half_to_float(unpacked_blk[(x + y * 4) * 3 + c]);
3424
p[c] = v;
3425
3426
} // c
3427
3428
p[3] = 1.0f;
3429
3430
pPacked_bc6h_img->set_clipped(bx * 4 + x, by * 4 + y, p);
3431
} // x
3432
} // y
3433
}
3434
3435
} // bx
3436
} // by
3437
3438
//fmt_printf("Total BC6H encode time: {}\n", total_enc_time);
3439
3440
return true;
3441
}
3442
3443
static float dist_to_line_squared(const vec3F& p, const vec3F &line_org, const vec3F &line_dir)
3444
{
3445
vec3F q(p - line_org);
3446
vec3F v(q - q.dot(line_dir) * line_dir);
3447
return v.dot(v);
3448
}
3449
3450
static void estimate_partitions_mode7_and_11(
3451
uint32_t num_parts, // 2 or 3 partitions
3452
uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, // list of all unique, canonicalized patterns
3453
uint32_t num_pats_to_examine, const uint32_t* pUnique_pat_indices_to_examine, // indices of pats to examine
3454
const vec3F *pHalf_pixels_as_floats, // block's half pixel values casted to floats
3455
const astc_hdr_codec_base_options& coptions, // options
3456
uint32_t num_desired_pats,
3457
int *pDesired_pat_indices_mode11, int *pDesired_pat_indices_mode7) // output indices
3458
{
3459
BASISU_NOTE_UNUSED(coptions);
3460
BASISU_NOTE_UNUSED(num_unique_pats);
3461
3462
const uint32_t BLOCK_W = 6, BLOCK_H = 6, MAX_PARTS = 3; // BLOCK_T = 6 * 6
3463
assert(num_parts <= MAX_PARTS);
3464
3465
struct candidate_res
3466
{
3467
float m_total_sq_dist;
3468
uint32_t m_index;
3469
bool operator< (const candidate_res& rhs) const { return m_total_sq_dist < rhs.m_total_sq_dist; }
3470
};
3471
3472
const uint32_t MAX_CANDIDATES = 1024;
3473
assert(num_desired_pats && (num_desired_pats <= MAX_CANDIDATES));
3474
3475
candidate_res mode11_candidates[MAX_CANDIDATES];
3476
candidate_res mode7_candidates[MAX_CANDIDATES];
3477
3478
const vec3F grayscale_axis(0.5773502691f);
3479
3480
for (uint32_t examine_iter = 0; examine_iter < num_pats_to_examine; examine_iter++)
3481
{
3482
const uint32_t unique_part_index = pUnique_pat_indices_to_examine[examine_iter];
3483
assert(unique_part_index < num_unique_pats);
3484
3485
const partition_pattern_vec* pPat = &pUnique_pats[unique_part_index];
3486
3487
vec3F part_means[MAX_PARTS];
3488
uint32_t part_total_texels[MAX_PARTS] = { 0 };
3489
3490
for (uint32_t i = 0; i < num_parts; i++)
3491
part_means[i].clear();
3492
3493
for (uint32_t y = 0; y < BLOCK_H; y++)
3494
{
3495
for (uint32_t x = 0; x < BLOCK_W; x++)
3496
{
3497
const uint32_t part_index = (*pPat)(x, y);
3498
assert(part_index < num_parts);
3499
3500
part_means[part_index] += pHalf_pixels_as_floats[x + y * BLOCK_W];
3501
part_total_texels[part_index]++;
3502
3503
} // x
3504
} // y
3505
3506
for (uint32_t i = 0; i < num_parts; i++)
3507
{
3508
assert(part_total_texels[i]);
3509
part_means[i] /= (float)part_total_texels[i];
3510
}
3511
3512
float part_cov[MAX_PARTS][6];
3513
memset(part_cov, 0, sizeof(part_cov));
3514
3515
for (uint32_t y = 0; y < BLOCK_H; y++)
3516
{
3517
for (uint32_t x = 0; x < BLOCK_W; x++)
3518
{
3519
const uint32_t part_index = (*pPat)(x, y);
3520
assert(part_index < num_parts);
3521
3522
const vec3F p(pHalf_pixels_as_floats[x + y * BLOCK_W] - part_means[part_index]);
3523
3524
const float r = p[0], g = p[1], b = p[2];
3525
3526
part_cov[part_index][0] += r * r;
3527
part_cov[part_index][1] += r * g;
3528
part_cov[part_index][2] += r * b;
3529
part_cov[part_index][3] += g * g;
3530
part_cov[part_index][4] += g * b;
3531
part_cov[part_index][5] += b * b;
3532
3533
} // x
3534
} // y
3535
3536
// For each partition compute the total variance of all channels.
3537
float total_variance[MAX_PARTS];
3538
for (uint32_t part_index = 0; part_index < num_parts; part_index++)
3539
total_variance[part_index] = part_cov[part_index][0] + part_cov[part_index][3] + part_cov[part_index][5];
3540
3541
vec3F part_axis[MAX_PARTS];
3542
float mode11_eigenvalue_est[MAX_PARTS]; // For each partition, compute the variance along the principle axis
3543
float mode7_eigenvalue_est[MAX_PARTS]; // For each partition, compute the variance along the principle axis
3544
3545
for (uint32_t part_index = 0; part_index < num_parts; part_index++)
3546
{
3547
float* pCov = &part_cov[part_index][0];
3548
3549
float xr = .9f, xg = 1.0f, xb = .7f;
3550
3551
const uint32_t NUM_POWER_ITERS = 4;
3552
for (uint32_t iter = 0; iter < NUM_POWER_ITERS; iter++)
3553
{
3554
float r = xr * pCov[0] + xg * pCov[1] + xb * pCov[2];
3555
float g = xr * pCov[1] + xg * pCov[3] + xb * pCov[4];
3556
float b = xr * pCov[2] + xg * pCov[4] + xb * pCov[5];
3557
3558
float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b));
3559
3560
if (m >= 1e-10f)
3561
{
3562
m = 1.0f / m;
3563
3564
r *= m;
3565
g *= m;
3566
b *= m;
3567
}
3568
3569
xr = r;
3570
xg = g;
3571
xb = b;
3572
}
3573
3574
float len_sq = xr * xr + xg * xg + xb * xb;
3575
3576
if (len_sq < 1e-10f)
3577
{
3578
xr = grayscale_axis[0];
3579
xg = grayscale_axis[0];
3580
xb = grayscale_axis[0];
3581
}
3582
else
3583
{
3584
len_sq = 1.0f / sqrtf(len_sq);
3585
3586
xr *= len_sq;
3587
xg *= len_sq;
3588
xb *= len_sq;
3589
}
3590
3591
{
3592
// Transform the principle axis by the covariance matrix, which will scale the vector by its eigenvalue (the variance of the dataset projected onto the principle axis).
3593
float r = xr * pCov[0] + xg * pCov[1] + xb * pCov[2];
3594
float g = xr * pCov[1] + xg * pCov[3] + xb * pCov[4];
3595
float b = xr * pCov[2] + xg * pCov[4] + xb * pCov[5];
3596
3597
// Estimate the principle eigenvalue by computing the magnitude of the transformed vector.
3598
// The result is the variance along the principle axis.
3599
//float z1 = sqrtf(r * r + g * g + b * b); // this works with the principle axis
3600
//float z2 = r * xr + g * xg + b * xb; // compute length projected along xr,xg,xb
3601
3602
mode11_eigenvalue_est[part_index] = r * xr + g * xg + b * xb;
3603
}
3604
3605
{
3606
const float yrgb = grayscale_axis[0];
3607
3608
// Transform the grayscale axis by the covariance matrix, which will scale the vector by the eigenvalue (which is the variance of the dataset projected onto this vector).
3609
float r = yrgb * pCov[0] + yrgb * pCov[1] + yrgb * pCov[2];
3610
float g = yrgb * pCov[1] + yrgb * pCov[3] + yrgb * pCov[4];
3611
float b = yrgb * pCov[2] + yrgb * pCov[4] + yrgb * pCov[5];
3612
3613
mode7_eigenvalue_est[part_index] = r * yrgb + g * yrgb + b * yrgb;
3614
}
3615
3616
} // part_index
3617
3618
// Compute the total variance (squared error) of the other 2 axes by subtracting the total variance of all channels by the variance of the principle axis.
3619
// TODO: Could also compute the ratio of the principle axis's variance vs. the total variance.
3620
float mode11_total_sq_dist_to_line_alt = 0.0f;
3621
for (uint32_t part_index = 0; part_index < num_parts; part_index++)
3622
{
3623
float d = maximum(0.0f, total_variance[part_index] - mode11_eigenvalue_est[part_index]);
3624
mode11_total_sq_dist_to_line_alt += d;
3625
}
3626
3627
{
3628
#if 0
3629
// TODO: This total distance can be computed rapidly. First compute the total variance of each channel (sum the diag entries of the covar matrix),
3630
// then compute the principle eigenvalue, and subtract. The result is the variance of the projection distances.
3631
float total_sq_dist_to_line = 0.0f;
3632
for (uint32_t i = 0; i < BLOCK_T; i++)
3633
{
3634
const uint32_t part_index = (*pPat)[i];
3635
assert(part_index < num_parts);
3636
3637
total_sq_dist_to_line += dist_to_line_squared(pHalf_pixels_as_floats[i], part_means[part_index], part_axis[part_index]);
3638
}
3639
3640
mode11_candidates[examine_iter].m_total_sq_dist = total_sq_dist_to_line;
3641
#else
3642
mode11_candidates[examine_iter].m_total_sq_dist = mode11_total_sq_dist_to_line_alt;
3643
#endif
3644
mode11_candidates[examine_iter].m_index = unique_part_index;
3645
}
3646
3647
{
3648
float mode7_total_sq_dist_to_line_alt = 0.0f;
3649
for (uint32_t part_index = 0; part_index < num_parts; part_index++)
3650
{
3651
float d = maximum(0.0f, total_variance[part_index] - mode7_eigenvalue_est[part_index]);
3652
mode7_total_sq_dist_to_line_alt += d;
3653
}
3654
3655
mode7_candidates[examine_iter].m_total_sq_dist = mode7_total_sq_dist_to_line_alt;
3656
mode7_candidates[examine_iter].m_index = unique_part_index;
3657
}
3658
3659
} // examine_iter
3660
3661
std::sort(&mode11_candidates[0], &mode11_candidates[num_pats_to_examine]);
3662
std::sort(&mode7_candidates[0], &mode7_candidates[num_pats_to_examine]);
3663
3664
for (uint32_t i = 0; i < num_desired_pats; i++)
3665
pDesired_pat_indices_mode11[i] = mode11_candidates[i].m_index;
3666
3667
for (uint32_t i = 0; i < num_desired_pats; i++)
3668
pDesired_pat_indices_mode7[i] = mode7_candidates[i].m_index;
3669
}
3670
3671
static void estimate_partitions_mode7(
3672
uint32_t num_parts, // 2 or 3 partitions
3673
uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, // list of all unique, canonicalized patterns
3674
uint32_t num_pats_to_examine, const uint32_t* pUnique_pat_indices_to_examine, // indices of pats to examine
3675
const vec3F* pHalf_pixels_as_floats, // block's half pixel values casted to floats
3676
const astc_hdr_codec_base_options& coptions, // options
3677
uint32_t num_desired_pats, uint32_t* pDesired_pat_indices) // output indices
3678
{
3679
BASISU_NOTE_UNUSED(coptions);
3680
BASISU_NOTE_UNUSED(num_unique_pats);
3681
3682
const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = 6 * 6, MAX_PARTS = 3;
3683
assert(num_parts <= MAX_PARTS);
3684
3685
struct candidate_res
3686
{
3687
float m_total_sq_dist;
3688
uint32_t m_index;
3689
bool operator< (const candidate_res& rhs) const { return m_total_sq_dist < rhs.m_total_sq_dist; }
3690
};
3691
3692
const uint32_t MAX_CANDIDATES = 1024;
3693
assert(num_desired_pats && (num_desired_pats <= MAX_CANDIDATES));
3694
3695
candidate_res candidates[MAX_CANDIDATES];
3696
3697
for (uint32_t examine_iter = 0; examine_iter < num_pats_to_examine; examine_iter++)
3698
{
3699
const uint32_t unique_part_index = pUnique_pat_indices_to_examine[examine_iter];
3700
assert(unique_part_index < num_unique_pats);
3701
3702
const partition_pattern_vec* pPat = &pUnique_pats[unique_part_index];
3703
3704
vec3F part_means[MAX_PARTS];
3705
uint32_t part_total_texels[MAX_PARTS] = { 0 };
3706
3707
for (uint32_t i = 0; i < num_parts; i++)
3708
part_means[i].clear();
3709
3710
for (uint32_t y = 0; y < BLOCK_H; y++)
3711
{
3712
for (uint32_t x = 0; x < BLOCK_W; x++)
3713
{
3714
const uint32_t part_index = (*pPat)(x, y);
3715
assert(part_index < num_parts);
3716
3717
part_means[part_index] += pHalf_pixels_as_floats[x + y * BLOCK_W];
3718
part_total_texels[part_index]++;
3719
3720
} // x
3721
} // y
3722
3723
for (uint32_t i = 0; i < num_parts; i++)
3724
{
3725
assert(part_total_texels[i]);
3726
part_means[i] /= (float)part_total_texels[i];
3727
}
3728
3729
vec3F part_axis(0.5773502691f);
3730
3731
// TODO: This total distance can be computed rapidly. First compute the total variance of each channel (sum the diag entries of the covar matrix),
3732
// then compute the principle eigenvalue, and subtract. The result is the variance of the projection distances.
3733
float total_sq_dist_to_line = 0.0f;
3734
for (uint32_t i = 0; i < BLOCK_T; i++)
3735
{
3736
const uint32_t part_index = (*pPat)[i];
3737
assert(part_index < num_parts);
3738
3739
total_sq_dist_to_line += dist_to_line_squared(pHalf_pixels_as_floats[i], part_means[part_index], part_axis);
3740
}
3741
3742
candidates[examine_iter].m_total_sq_dist = total_sq_dist_to_line;
3743
3744
candidates[examine_iter].m_index = unique_part_index;
3745
3746
} // examine_iter
3747
3748
std::sort(&candidates[0], &candidates[num_pats_to_examine]);
3749
3750
for (uint32_t i = 0; i < num_desired_pats; i++)
3751
pDesired_pat_indices[i] = candidates[i].m_index;
3752
}
3753
3754
static float calc_deblocking_penalty_itp(
3755
uint32_t bx, uint32_t by, uint32_t width, uint32_t height,
3756
const imagef& pass_src_img_itp, const candidate_encoding& candidate)
3757
{
3758
float total_deblock_penalty = 0.0f;
3759
3760
float total_orig_mse = 0.0f, total_comp_mse = 0.0f;
3761
uint32_t total_c = 0;
3762
3763
for (uint32_t b = 0; b < 4; b++)
3764
{
3765
for (uint32_t i = 0; i < 6; i++)
3766
{
3767
int ox = 0, oy = 0, qx = 0, qy = 0;
3768
3769
switch (b)
3770
{
3771
case 0:
3772
ox = bx * 6 + i; oy = (by - 1) * 6 + 5;
3773
qx = bx * 6 + i; qy = by * 6;
3774
break;
3775
case 1:
3776
ox = bx * 6 + i; oy = (by + 1) * 6;
3777
qx = bx * 6 + i; qy = by * 6 + 5;
3778
break;
3779
case 2:
3780
ox = (bx - 1) * 6 + 5; oy = by * 6 + i;
3781
qx = bx * 6; qy = by * 6 + i;
3782
break;
3783
case 3:
3784
ox = (bx + 1) * 6; oy = by * 6 + i;
3785
qx = bx * 6 + 5; qy = by * 6 + i;
3786
break;
3787
}
3788
3789
if ((ox < 0) || (oy < 0) || (ox >= (int)width) || (oy >= (int)height))
3790
continue;
3791
3792
const vec3F& o_pixel_itp = pass_src_img_itp(ox, oy);
3793
const vec3F& q_pixel_itp = pass_src_img_itp(qx, qy);
3794
3795
const vec3F &d_pixel_itp = candidate.m_comp_pixels_itp[qy - by * 6][qx - bx * 6]; // compressed block
3796
3797
vec3F orig_delta_v(o_pixel_itp - q_pixel_itp);
3798
total_orig_mse += square(orig_delta_v[0]) + square(orig_delta_v[1]) + square(orig_delta_v[2]);
3799
3800
vec3F d_delta_v(o_pixel_itp - d_pixel_itp);
3801
total_comp_mse += square(d_delta_v[0]) + square(d_delta_v[1]) + square(d_delta_v[2]);
3802
3803
total_c++;
3804
}
3805
}
3806
3807
if (total_c)
3808
{
3809
total_orig_mse /= (float)total_c;
3810
total_comp_mse /= (float)total_c;
3811
3812
if (total_orig_mse)
3813
{
3814
total_deblock_penalty = fabsf((total_comp_mse - total_orig_mse) / total_orig_mse);
3815
}
3816
}
3817
3818
return total_deblock_penalty;
3819
}
3820
3821
static bool calc_strip_size(
3822
float lambda,
3823
uint32_t num_blocks_y, uint32_t total_threads, bool force_one_strip,
3824
uint32_t& res_total_strips, uint32_t& res_rows_per_strip, astc_hdr_6x6_global_config &global_cfg)
3825
{
3826
uint32_t total_strips = 1;
3827
3828
if (lambda == 0.0f)
3829
{
3830
if (!force_one_strip)
3831
{
3832
total_strips = total_threads;
3833
}
3834
}
3835
else
3836
{
3837
const uint32_t MIN_DESIRED_STRIPS = 8;
3838
const uint32_t MAX_TARGET_STRIPS = 32;
3839
const uint32_t TARGET_ASTC_6X6_ROWS_PER_STRIP = 12;
3840
3841
if (!force_one_strip)
3842
{
3843
total_strips = maximum<uint32_t>(1, num_blocks_y / TARGET_ASTC_6X6_ROWS_PER_STRIP);
3844
3845
if (num_blocks_y >= MIN_DESIRED_STRIPS * 2)
3846
total_strips = maximum(total_strips, MIN_DESIRED_STRIPS);
3847
}
3848
3849
total_strips = minimum(total_strips, MAX_TARGET_STRIPS);
3850
}
3851
3852
uint32_t rows_per_strip = 0;
3853
if (total_strips <= 1)
3854
{
3855
rows_per_strip = num_blocks_y;
3856
}
3857
else
3858
{
3859
rows_per_strip = (num_blocks_y / total_strips) & ~1;
3860
3861
if (rows_per_strip < 2)
3862
rows_per_strip = 2;// num_blocks_y;
3863
}
3864
3865
assert((rows_per_strip == num_blocks_y) || ((rows_per_strip & 1) == 0));
3866
3867
total_strips = (num_blocks_y + rows_per_strip - 1) / rows_per_strip;
3868
3869
if (global_cfg.m_debug_output)
3870
{
3871
fmt_printf("num_blocks_y: {}, total_threads : {}, Total strips : {}\n", num_blocks_y, total_threads, total_strips);
3872
fmt_printf("ASTC 6x6 block rows per strip: {}\n", rows_per_strip);
3873
fmt_printf("ASTC 6x6 block rows on final strip: {}\n", num_blocks_y - (total_strips - 1) * rows_per_strip);
3874
}
3875
3876
uint32_t total_rows = 0;
3877
for (uint32_t strip_index = 0; strip_index < total_strips; strip_index++)
3878
{
3879
uint32_t strip_first_by = strip_index * rows_per_strip;
3880
uint32_t strip_last_by = minimum<uint32_t>(strip_first_by + rows_per_strip - 1, num_blocks_y);
3881
3882
if (strip_index == (total_strips - 1))
3883
strip_last_by = num_blocks_y - 1;
3884
3885
uint32_t num_strip_block_rows = (strip_last_by - strip_first_by) + 1;
3886
total_rows += num_strip_block_rows;
3887
3888
if (global_cfg.m_debug_output)
3889
fmt_printf("Strip row: {}, total block rows: {}\n", strip_index, num_strip_block_rows);
3890
}
3891
3892
if (total_rows != num_blocks_y)
3893
{
3894
fmt_error_printf("Strip calc failed\n");
3895
return false;
3896
}
3897
3898
res_total_strips = total_strips;
3899
res_rows_per_strip = rows_per_strip;
3900
3901
return true;
3902
}
3903
3904
static void convet_rgb_image_to_itp(const imagef &src_img, imagef &dst_img, const astc_hdr_6x6_global_config& cfg)
3905
{
3906
const uint32_t width = src_img.get_width(), height = src_img.get_height();
3907
3908
dst_img.resize(width, height);
3909
3910
for (uint32_t y = 0; y < height; y++)
3911
{
3912
for (uint32_t x = 0; x < width; x++)
3913
{
3914
vec3F src_rgb(src_img(x, y));
3915
3916
vec3F src_itp;
3917
linear_rgb_to_itp(src_rgb, src_itp, cfg);
3918
3919
dst_img(x, y) = src_itp;
3920
}
3921
}
3922
}
3923
3924
// Block footprint in texels, and the resulting texel count per block.
const uint32_t BLOCK_W = 6;
const uint32_t BLOCK_H = 6;
const uint32_t NUM_BLOCK_PIXELS = BLOCK_W * BLOCK_H;

// Penalties, by name for solid / reuse / run block encodings.
// NOTE(review): how these are applied is not visible in this part of the file - confirm at the use sites.
const float SOLID_PENALTY = 4.0f;
const float REUSE_PENALTY = 1.0f;
const float RUN_PENALTY = 10.0f;

// Error metric weights and tuning thresholds.
// NOTE(review): the meaning of each value is defined by the code that consumes it - confirm at the use sites.
const float MSE_WEIGHT = 300000.0f;
const float SSIM_WEIGHT = 200.0f;
const float TWO_LEVEL_PENALTY = 1.425f;
const float SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM = 0.04f;
const float SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM = 0.04f;

// MSE penalty factors, by name for small weight grids on complex blocks; the factor shrinks as the grid grows.
const float COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY = 1.5f;
const float COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY = 1.25f;
const float COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY = 1.15f;
3939
3940
struct uastc_hdr_6x6_debug_state
3941
{
3942
uint32_t m_encoding_type_hist[(uint32_t)encoding_type::cTotal] = { 0 };
3943
uint32_t m_endpoint_mode_hist[(uint32_t)endpoint_mode::cTotal] = { 0 };
3944
uint32_t m_block_mode_hist[(uint32_t)block_mode::cBMTotalModes] = { 0 };
3945
uint64_t m_block_mode_total_bits[(uint32_t)block_mode::cBMTotalModes] = { 0 };
3946
3947
basisu::vector< basisu::stats<float> > m_block_mode_comp_stats[(uint32_t)block_mode::cBMTotalModes][3];
3948
basisu::vector< basisu::comparative_stats<float> > m_block_mode_comparative_stats[(uint32_t)block_mode::cBMTotalModes][3];
3949
3950
std::atomic<uint32_t> m_total_gaussian1_blocks;
3951
std::atomic<uint32_t> m_total_gaussian2_blocks;
3952
std::atomic<uint32_t> m_total_filter_horizontal;
3953
std::atomic<uint32_t> m_detail_stats[5];
3954
std::atomic<uint32_t> m_total_mode7_skips;
3955
3956
std::atomic<uint32_t> m_total_blocks_compressed;
3957
3958
std::atomic<uint32_t> m_total_candidates_considered;
3959
std::atomic<uint32_t> m_max_candidates_considered;
3960
3961
std::atomic<uint32_t> m_total_part2_stats[4];
3962
std::atomic<uint32_t> m_dp_stats[5];
3963
3964
std::atomic<uint32_t> m_reuse_num_parts[4];
3965
std::atomic<uint32_t> m_reuse_total_dp;
3966
3967
imagef m_stat_vis;
3968
std::mutex m_stat_vis_mutex;
3969
3970
image m_part_vis;
3971
image m_mode_vis;
3972
image m_mode_vis2;
3973
image m_grid_vis;
3974
image m_enc_vis;
3975
std::mutex m_vis_image_mutex;
3976
3977
std::atomic<uint32_t> m_comp_level_hist[ASTC_HDR_6X6_MAX_COMP_LEVEL + 1];
3978
3979
std::atomic<uint32_t> m_total_jnd_replacements;
3980
3981
std::mutex m_stats_mutex;
3982
3983
uastc_hdr_6x6_debug_state()
3984
{
3985
for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)
3986
{
3987
for (uint32_t j = 0; j < 3; j++)
3988
{
3989
m_block_mode_comp_stats[i][j].reserve(512);
3990
m_block_mode_comparative_stats[i][j].reserve(512);
3991
}
3992
}
3993
}
3994
3995
void init(uint32_t width, uint32_t height)
3996
{
3997
m_stat_vis.resize(width, height);
3998
m_part_vis.resize(width, height);
3999
m_mode_vis.resize(width, height);
4000
m_mode_vis2.resize(width, height);
4001
m_grid_vis.resize(width, height);
4002
m_enc_vis.resize(width, height);
4003
4004
basisu::clear_obj(m_encoding_type_hist);
4005
basisu::clear_obj(m_endpoint_mode_hist);
4006
basisu::clear_obj(m_block_mode_hist);
4007
basisu::clear_obj(m_block_mode_total_bits);
4008
4009
for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)
4010
{
4011
for (uint32_t j = 0; j < 3; j++)
4012
{
4013
m_block_mode_comp_stats[i][j].clear();
4014
m_block_mode_comparative_stats[i][j].clear();
4015
}
4016
}
4017
4018
m_total_gaussian1_blocks.store(0);
4019
m_total_gaussian2_blocks.store(0);
4020
m_total_filter_horizontal.store(0);
4021
for (uint32_t i = 0; i < std::size(m_detail_stats); i++)
4022
m_detail_stats[i].store(0);
4023
m_total_mode7_skips.store(0);
4024
4025
for (uint32_t i = 0; i < std::size(m_comp_level_hist); i++)
4026
m_comp_level_hist[i].store(0);
4027
4028
m_total_blocks_compressed.store(0);
4029
4030
m_total_candidates_considered.store(0);
4031
m_max_candidates_considered.store(0);
4032
4033
for (uint32_t i = 0; i < std::size(m_total_part2_stats); i++)
4034
m_total_part2_stats[i].store(0);
4035
4036
for (uint32_t i = 0; i < std::size(m_dp_stats); i++)
4037
m_dp_stats[i].store(0);
4038
4039
for (uint32_t i = 0; i < std::size(m_reuse_num_parts); i++)
4040
m_reuse_num_parts[i] .store(0);
4041
4042
m_reuse_total_dp.store(0);
4043
4044
m_total_jnd_replacements.store(0);
4045
}
4046
4047
void print(uint32_t total_blocks) const
4048
{
4049
fmt_printf("Total blocks: {}\n", total_blocks);
4050
fmt_printf("Total JND replacements: {} {3.2}%\n", m_total_jnd_replacements, (float)m_total_jnd_replacements * 100.0f / (float)total_blocks);
4051
fmt_printf("Comp level histogram: {} {} {} {} {}\n", m_comp_level_hist[0], m_comp_level_hist[1], m_comp_level_hist[2], m_comp_level_hist[3], m_comp_level_hist[4]);
4052
fmt_printf("Total gaussian 1 blocks: {} {3.2}%\n", m_total_gaussian1_blocks, (float)m_total_gaussian1_blocks * 100.0f / (float)total_blocks);
4053
fmt_printf("Total gaussian 2 blocks: {} {3.2}%\n", m_total_gaussian2_blocks, (float)m_total_gaussian2_blocks * 100.0f / (float)total_blocks);
4054
fmt_printf("Total filter horizontal: {} {3.2}%\n", m_total_filter_horizontal, (float)m_total_filter_horizontal * 100.0f / (float)total_blocks);
4055
fmt_printf("Detail stats: Detailed block low grid skip: {}, Blurry block skip: {}, Very blurry block skip: {}, NH:{} H:{}\n", m_detail_stats[0], m_detail_stats[1], m_detail_stats[2], m_detail_stats[3], m_detail_stats[4]);
4056
fmt_printf("Total mode7 skips: {}\n", m_total_mode7_skips);
4057
4058
fmt_printf("Total candidates: {}, {} avg per block\n", m_total_candidates_considered, (float)m_total_candidates_considered / (float)total_blocks);
4059
fmt_printf("Max ever candidates: {}\n", m_max_candidates_considered);
4060
4061
fmt_printf("Part2/3 stats: {} {} {} {}\n", m_total_part2_stats[0], m_total_part2_stats[1], m_total_part2_stats[2], m_total_part2_stats[3]);
4062
fmt_printf("Dual plane stats: {} {} {} {} {}\n", m_dp_stats[0], m_dp_stats[1], m_dp_stats[2], m_dp_stats[3], m_dp_stats[4]);
4063
fmt_printf("Reuse total dual plane: {}\n", m_reuse_total_dp);
4064
fmt_printf("Reuse part stats: {} {} {}\n", m_reuse_num_parts[1], m_reuse_num_parts[2], m_reuse_num_parts[3]);
4065
4066
fmt_printf("\nEncoding type histogram:\n");
4067
for (uint32_t i = 0; i < std::size(m_encoding_type_hist); i++)
4068
fmt_printf("{}: {}\n", i, m_encoding_type_hist[i]);
4069
4070
fmt_printf("\nEndpoint mode histogram:\n");
4071
for (uint32_t i = 0; i < std::size(m_endpoint_mode_hist); i++)
4072
fmt_printf("{}: {}\n", i, m_endpoint_mode_hist[i]);
4073
4074
fmt_printf("\nBlock mode histogram:\n");
4075
4076
uint32_t total_dp = 0, total_sp = 0;
4077
uint32_t total_mode11 = 0, total_mode7 = 0;
4078
uint32_t part_hist[3] = { 0 };
4079
uint32_t part2_mode7_total = 0, part2_mode11_total = 0;
4080
uint32_t total_used_modes = 0;
4081
for (uint32_t i = 0; i < std::size(m_block_mode_hist); i++)
4082
{
4083
const auto& bm_desc = g_block_mode_descs[i];
4084
4085
const uint32_t total_uses = m_block_mode_hist[i];
4086
4087
if (bm_desc.m_dp)
4088
total_dp += total_uses;
4089
else
4090
total_sp += total_uses;
4091
4092
if (bm_desc.m_cem == 7)
4093
total_mode7 += total_uses;
4094
else
4095
total_mode11 += total_uses;
4096
4097
part_hist[bm_desc.m_num_partitions - 1] += total_uses;
4098
4099
if (bm_desc.m_num_partitions == 2)
4100
{
4101
if (bm_desc.m_cem == 7)
4102
part2_mode7_total += total_uses;
4103
else
4104
{
4105
assert(bm_desc.m_cem == 11);
4106
part2_mode11_total += total_uses;
4107
}
4108
}
4109
4110
float avg_std_dev = 0.0f;
4111
float avg_cross_correlations[3] = { 0 };
4112
4113
if (m_block_mode_comp_stats[i][0].size())
4114
{
4115
const uint32_t num_uses = m_block_mode_comp_stats[i][0].size_u32();
4116
4117
for (uint32_t j = 0; j < num_uses; j++)
4118
avg_std_dev += (float)maximum(m_block_mode_comp_stats[i][0][j].m_std_dev, m_block_mode_comp_stats[i][1][j].m_std_dev, m_block_mode_comp_stats[i][2][j].m_std_dev);
4119
avg_std_dev /= (float)num_uses;
4120
4121
for (uint32_t j = 0; j < num_uses; j++)
4122
{
4123
avg_cross_correlations[0] += fabsf((float)m_block_mode_comparative_stats[i][0][j].m_pearson);
4124
avg_cross_correlations[1] += fabsf((float)m_block_mode_comparative_stats[i][1][j].m_pearson);
4125
avg_cross_correlations[2] += fabsf((float)m_block_mode_comparative_stats[i][2][j].m_pearson);
4126
}
4127
4128
avg_cross_correlations[0] /= (float)num_uses;
4129
avg_cross_correlations[1] /= (float)num_uses;
4130
avg_cross_correlations[2] /= (float)num_uses;
4131
}
4132
4133
fmt_printf("{ 2}: uses: { 6}, cem: {}, dp: {} chan: {}, parts: {}, grid: {}x{}, endpoint levels: {}, weight levels: {}, Avg bits: {}, Avg Max Std Dev: {}, RG: {} RB: {} GB: {}\n", i, total_uses,
4134
bm_desc.m_cem,
4135
bm_desc.m_dp, bm_desc.m_dp_channel,
4136
bm_desc.m_num_partitions,
4137
bm_desc.m_grid_x, bm_desc.m_grid_y,
4138
astc_helpers::get_ise_levels(bm_desc.m_endpoint_ise_range),
4139
astc_helpers::get_ise_levels(bm_desc.m_weight_ise_range),
4140
total_uses ? ((double)m_block_mode_total_bits[i] / total_uses) : 0.0f,
4141
avg_std_dev, avg_cross_correlations[0], avg_cross_correlations[1], avg_cross_correlations[2]);
4142
4143
if (total_uses)
4144
total_used_modes++;
4145
}
4146
4147
fmt_printf("Total used modes: {}\n", total_used_modes);
4148
4149
fmt_printf("Total single plane: {}, total dual plane: {}\n", total_sp, total_dp);
4150
fmt_printf("Total mode 11: {}, mode 7: {}\n", total_mode11, total_mode7);
4151
fmt_printf("Partition histogram: {} {} {}\n", part_hist[0], part_hist[1], part_hist[2]);
4152
fmt_printf("2 subset mode 7 uses: {}, mode 11 uses: {}\n", part2_mode7_total, part2_mode11_total);
4153
}
4154
};
4155
4156
struct uastc_hdr_6x6_encode_state
4157
{
4158
astc_hdr_codec_base_options master_coptions;
4159
4160
imagef src_img;
4161
4162
imagef src_img_filtered1;
4163
imagef src_img_filtered2;
4164
4165
imagef src_img_itp;
4166
imagef src_img_filtered1_itp;
4167
imagef src_img_filtered2_itp;
4168
4169
vector2D<float> smooth_block_mse_scales;
4170
4171
imagef packed_img;
4172
4173
basisu::vector<bitwise_coder> strip_bits;
4174
4175
basisu::vector2D<astc_helpers::astc_block> final_astc_blocks;
4176
4177
vector2D<candidate_encoding> coded_blocks;
4178
};
4179
4180
static bool compress_strip_task(
4181
uint32_t strip_index, uint32_t total_strips, uint32_t strip_first_by, uint32_t strip_last_by,
4182
uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t total_blocks, uint32_t width, uint32_t height,
4183
astc_hdr_6x6_global_config &global_cfg, uastc_hdr_6x6_debug_state &debug_state, uastc_hdr_6x6_encode_state &enc_state)
4184
{
4185
BASISU_NOTE_UNUSED(num_blocks_y);
4186
BASISU_NOTE_UNUSED(total_strips);
4187
4188
vec3F prev_comp_pixels[BLOCK_H][BLOCK_W]; // [y][x]
4189
basisu::clear_obj(prev_comp_pixels);
4190
4191
uint32_t prev_run_len = 0;
4192
4193
bitwise_coder prev_encoding;
4194
candidate_encoding prev_candidate_encoding; // the previous candidate written, which may have been a run extension
4195
candidate_encoding prev_non_run_candidate_encoding; // the previous *non-run* candidate written
4196
4197
bitwise_coder& strip_coded_bits = enc_state.strip_bits[strip_index];
4198
4199
const uint32_t CANDIDATES_TO_RESERVE = 1536;
4200
4201
basisu::vector<candidate_encoding> candidates;
4202
candidates.reserve(CANDIDATES_TO_RESERVE);
4203
4204
for (uint32_t by = strip_first_by; by <= strip_last_by; by++)
4205
{
4206
const bool has_upper_neighbor = by > strip_first_by;
4207
4208
for (uint32_t bx = 0; bx < num_blocks_x; bx++)
4209
{
4210
//if ((bx == 1) && (by == 2))
4211
// basisu::fmt_printf("!");
4212
4213
for (uint32_t outer_pass = 0; outer_pass < 3; outer_pass++)
4214
{
4215
const bool has_left_neighbor = bx > 0;
4216
//const bool has_prev = has_left_neighbor || has_upper_neighbor;
4217
4218
// Select either the original source image, or the Gaussian filtered version.
4219
// From here the encoder *must* use these 2 sources.
4220
const imagef& pass_src_img = (outer_pass == 2) ? enc_state.src_img_filtered2 :
4221
((outer_pass == 1) ? enc_state.src_img_filtered1 : enc_state.src_img);
4222
4223
const imagef& pass_src_img_itp = (outer_pass == 2) ? enc_state.src_img_filtered2_itp :
4224
((outer_pass == 1) ? enc_state.src_img_filtered1_itp : enc_state.src_img_itp);
4225
4226
// Extract source image block
4227
vec4F block_pixels[BLOCK_H][BLOCK_W]; // [y][x]
4228
pass_src_img.extract_block_clamped(&block_pixels[0][0], bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H);
4229
4230
vec4F block_pixels_itp[BLOCK_H][BLOCK_W]; // [y][x]
4231
pass_src_img_itp.extract_block_clamped(&block_pixels_itp[0][0], bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H);
4232
4233
half_vec3 half_pixels[BLOCK_H][BLOCK_W]; // [y][x] half-float values
4234
vec3F half_pixels_as_floats[BLOCK_H][BLOCK_W]; // [y][x] half float values, integer bits as floats
4235
vec4F block_pixels_q16[BLOCK_H][BLOCK_W]; // [y][x], q16 space for low-level ASTC encoding
4236
vec3F block_pixels_as_itp[BLOCK_H][BLOCK_W]; // [y][x] input converted to itp space, for faster error calculations
4237
4238
bool is_grayscale = true;
4239
4240
candidates.resize(0);
4241
4242
float block_ly = BIG_FLOAT_VAL, block_hy = 0.0f, block_avg_y = 0.0f;
4243
4244
for (uint32_t y = 0; y < BLOCK_H; y++)
4245
{
4246
for (uint32_t x = 0; x < BLOCK_W; x++)
4247
{
4248
vec3F rgb_input;
4249
4250
for (uint32_t c = 0; c < 3; c++)
4251
{
4252
float v = block_pixels[y][x][c];
4253
4254
rgb_input[c] = v;
4255
4256
const basist::half_float h = basisu::fast_float_to_half_no_clamp_neg_nan_or_inf(v);
4257
assert(h == basist::float_to_half(v));
4258
4259
half_pixels[y][x][c] = h;
4260
4261
block_pixels_q16[y][x][c] = (float)half_to_qlog16(h);
4262
4263
half_pixels_as_floats[y][x][c] = (float)h;
4264
4265
} // c
4266
4267
float py = rgb_input.dot(vec3F(REC_709_R, REC_709_G, REC_709_B));
4268
if (py < block_ly)
4269
block_ly = py;
4270
if (py > block_hy)
4271
block_hy = py;
4272
block_avg_y += py;
4273
4274
//linear_rgb_to_itp(rgb_input, block_pixels_as_itp[y][x]);
4275
4276
block_pixels_as_itp[y][x] = block_pixels_itp[y][x];
4277
4278
block_pixels_q16[y][x][3] = 0.0f;
4279
4280
if ((half_pixels[y][x][0] != half_pixels[y][x][1]) || (half_pixels[y][x][0] != half_pixels[y][x][2]))
4281
is_grayscale = false;
4282
4283
} // x
4284
} // y
4285
4286
block_avg_y *= (1.0f / (float)NUM_BLOCK_PIXELS);
4287
4288
encode_astc_block_stats enc_block_stats;
4289
enc_block_stats.init(NUM_BLOCK_PIXELS, &block_pixels_q16[0][0]);
4290
4291
vec4F x_filtered[6][6], y_filtered[6][6];
4292
4293
filter_block(3, 6, (vec4F*)block_pixels, (vec4F*)x_filtered); // filter rows (horizontal)
4294
filter_block(6, 3, (vec4F*)block_pixels, (vec4F*)y_filtered); // filter cols (vertically)
4295
4296
const float filtered_x_err = diff_blocks((vec4F*)block_pixels, (vec4F*)x_filtered);
4297
const float filtered_y_err = diff_blocks((vec4F*)block_pixels, (vec4F*)y_filtered);
4298
const bool filter_horizontally = filtered_x_err < filtered_y_err;
4299
4300
//const float block_mag_gradient_mag = block_max_gradient_mag(bx, by);
4301
4302
if (filter_horizontally)
4303
debug_state.m_total_filter_horizontal.fetch_add(1, std::memory_order_relaxed);
4304
4305
vec3F lowpass_filtered[6][6];
4306
filter_block(3, 3, &half_pixels_as_floats[0][0], &lowpass_filtered[0][0]);
4307
float lowpass_std_dev = sub_and_compute_std_dev(&lowpass_filtered[0][0], &half_pixels_as_floats[0][0]);
4308
4309
const bool very_detailed_block = lowpass_std_dev > 350.0f;
4310
const bool very_blurry_block = lowpass_std_dev < 30.0f;
4311
const bool super_blurry_block = lowpass_std_dev < 15.0f;
4312
4313
basisu::stats<float> half_comp_stats[3];
4314
for (uint32_t c = 0; c < 3; c++)
4315
half_comp_stats[c].calc(NUM_BLOCK_PIXELS, &half_pixels_as_floats[0][0][c], 3);
4316
4317
const float SINGLE_PART_HALF_THRESH = 256.0f;
4318
const float COMPLEX_HALF_THRESH = 1024.0f;
4319
// HACK HACK
4320
const float VERY_COMPLEX_HALF_THRESH = 1400.0f; // 1536.0f;
4321
4322
const float max_std_dev = (float)maximum(half_comp_stats[0].m_std_dev, half_comp_stats[1].m_std_dev, half_comp_stats[2].m_std_dev);
4323
4324
const bool very_simple_block = (max_std_dev < SINGLE_PART_HALF_THRESH);
4325
const bool complex_block = (max_std_dev > COMPLEX_HALF_THRESH);
4326
const bool very_complex_block = (max_std_dev > VERY_COMPLEX_HALF_THRESH);
4327
4328
// Dynamically choose a comp_level for this block.
4329
astc_hdr_codec_base_options coptions(enc_state.master_coptions);
4330
uint32_t comp_level = global_cfg.m_master_comp_level;
4331
4332
if (very_complex_block)
4333
comp_level = global_cfg.m_highest_comp_level;
4334
else if (complex_block)
4335
comp_level = (global_cfg.m_master_comp_level + global_cfg.m_highest_comp_level + 1) / 2;
4336
4337
debug_state.m_comp_level_hist[comp_level].fetch_add(1, std::memory_order_relaxed);
4338
4339
bool any_2subset_enabled = false, any_2subset_mode11_enabled = false, any_2subset_mode7_enabled = false, any_3subset_enabled = false;
4340
BASISU_NOTE_UNUSED(any_2subset_mode11_enabled);
4341
4342
for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)
4343
{
4344
if (comp_level == 0)
4345
{
4346
if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL0) == 0)
4347
continue;
4348
}
4349
else if (comp_level == 1)
4350
{
4351
if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL1) == 0)
4352
continue;
4353
}
4354
else if (comp_level == 2)
4355
{
4356
if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL2) == 0)
4357
continue;
4358
}
4359
4360
if (g_block_mode_descs[i].m_num_partitions == 2)
4361
{
4362
any_2subset_enabled = true;
4363
4364
if (g_block_mode_descs[i].m_cem == 7)
4365
{
4366
any_2subset_mode7_enabled = true;
4367
}
4368
else
4369
{
4370
assert(g_block_mode_descs[i].m_cem == 11);
4371
any_2subset_mode11_enabled = true;
4372
}
4373
}
4374
else if (g_block_mode_descs[i].m_num_partitions == 3)
4375
any_3subset_enabled = true;
4376
}
4377
4378
coptions.m_mode7_full_s_optimization = (comp_level >= 2);
4379
4380
const bool uber_mode_flag = (comp_level >= 3);
4381
coptions.m_allow_uber_mode = uber_mode_flag;
4382
4383
coptions.m_ultra_quant = (comp_level >= 4);
4384
4385
coptions.m_take_first_non_clamping_mode11_submode = (comp_level <= 2);
4386
coptions.m_take_first_non_clamping_mode7_submode = (comp_level <= 2);
4387
4388
coptions.m_disable_weight_plane_optimization = (comp_level >= 2);
4389
4390
// -------------------
4391
4392
uint32_t total_used_block_chans = 0;
4393
for (uint32_t i = 0; i < 3; i++)
4394
total_used_block_chans += (half_comp_stats[i].m_range > 0.0f);
4395
4396
const bool is_solid_block = (total_used_block_chans == 0);
4397
4398
basisu::comparative_stats<float> half_cross_chan_stats[3];
4399
4400
// R vs. G
4401
half_cross_chan_stats[0].calc_pearson(NUM_BLOCK_PIXELS,
4402
&half_pixels_as_floats[0][0][0], &half_pixels_as_floats[0][0][1],
4403
3, 3,
4404
&half_comp_stats[0], &half_comp_stats[1]);
4405
4406
// R vs. B
4407
half_cross_chan_stats[1].calc_pearson(NUM_BLOCK_PIXELS,
4408
&half_pixels_as_floats[0][0][0], &half_pixels_as_floats[0][0][2],
4409
3, 3,
4410
&half_comp_stats[0], &half_comp_stats[2]);
4411
4412
// G vs. B
4413
half_cross_chan_stats[2].calc_pearson(NUM_BLOCK_PIXELS,
4414
&half_pixels_as_floats[0][0][1], &half_pixels_as_floats[0][0][2],
4415
3, 3,
4416
&half_comp_stats[1], &half_comp_stats[2]);
4417
4418
const float rg_corr = fabsf((float)half_cross_chan_stats[0].m_pearson);
4419
const float rb_corr = fabsf((float)half_cross_chan_stats[1].m_pearson);
4420
const float gb_corr = fabsf((float)half_cross_chan_stats[2].m_pearson);
4421
4422
float min_corr = BIG_FLOAT_VAL, max_corr = -BIG_FLOAT_VAL;
4423
for (uint32_t i = 0; i < 3; i++)
4424
{
4425
if (half_comp_stats[i].m_range > 0.0f)
4426
{
4427
const float c = fabsf((float)half_cross_chan_stats[i].m_pearson);
4428
min_corr = minimum(min_corr, c);
4429
max_corr = maximum(max_corr, c);
4430
}
4431
}
4432
4433
bool use_single_subset_mode7 = true;
4434
if (comp_level <= 1)
4435
{
4436
// TODO: could also compute angle between principal axis and the grayscale axis.
4437
// TODO: Transform grayscale axis by covar matrix, compute variance vs. total variance
4438
const float MODE7_MIN_CHAN_CORR = .5f;
4439
const float MODE7_PCA_ANGLE_THRESH = .9f;
4440
use_single_subset_mode7 = is_grayscale || is_solid_block || (min_corr >= MODE7_MIN_CHAN_CORR);
4441
4442
if (use_single_subset_mode7)
4443
{
4444
float cos_ang = fabsf(enc_block_stats.m_axis_q16.dot(vec3F(0.5773502691f)));
4445
if (cos_ang < MODE7_PCA_ANGLE_THRESH)
4446
use_single_subset_mode7 = false;
4447
}
4448
}
4449
4450
const float STRONG_CORR_THRESH = (comp_level <= 1) ? .5f : ((comp_level <= 3) ? .75f : .9f);
4451
4452
int desired_dp_chan = -1;
4453
if (total_used_block_chans <= 1)
4454
{
4455
// no need for dual plane (except possibly 2x2 weight grids for RDO)
4456
}
4457
else
4458
{
4459
if (min_corr >= STRONG_CORR_THRESH)
4460
{
4461
// all channel pairs strongly correlated, no need for dual plane
4462
debug_state.m_dp_stats[0].fetch_add(1, std::memory_order_relaxed);
4463
}
4464
else
4465
{
4466
if (total_used_block_chans == 2)
4467
{
4468
if (half_comp_stats[0].m_range == 0.0f)
4469
{
4470
// r unused, check for strong gb correlation
4471
if (gb_corr < STRONG_CORR_THRESH)
4472
desired_dp_chan = 1;
4473
}
4474
else if (half_comp_stats[1].m_range == 0.0f)
4475
{
4476
// g unused, check for strong rb correlation
4477
if (rb_corr < STRONG_CORR_THRESH)
4478
desired_dp_chan = 0;
4479
}
4480
else
4481
{
4482
// b unused, check for strong rg correlation
4483
if (rg_corr < STRONG_CORR_THRESH)
4484
desired_dp_chan = 0;
4485
}
4486
}
4487
else
4488
{
4489
assert(total_used_block_chans == 3);
4490
4491
// see if rg/rb is weakly correlated vs. gb
4492
if ((rg_corr < gb_corr) && (rb_corr < gb_corr))
4493
desired_dp_chan = 0;
4494
// see if gr/gb is weakly correlated vs. rb
4495
else if ((rg_corr < rb_corr) && (gb_corr < rb_corr))
4496
desired_dp_chan = 1;
4497
// assume b is weakest
4498
else
4499
desired_dp_chan = 2;
4500
}
4501
4502
if (desired_dp_chan == -1)
4503
debug_state.m_dp_stats[1].fetch_add(1, std::memory_order_relaxed);
4504
else
4505
debug_state.m_dp_stats[2 + desired_dp_chan].fetch_add(1, std::memory_order_relaxed);
4506
}
4507
}
4508
4509
// 2x2 is special for RDO at higher lambdas - always pick a preferred channel.
4510
int desired_dp_chan_2x2 = 0;
4511
if (total_used_block_chans == 2)
4512
{
4513
if (half_comp_stats[0].m_range == 0.0f)
4514
desired_dp_chan_2x2 = 1;
4515
}
4516
else if (total_used_block_chans == 3)
4517
{
4518
// see if rg/rb is weakly correlated vs. gb
4519
if ((rg_corr < gb_corr) && (rb_corr < gb_corr))
4520
desired_dp_chan_2x2 = 0;
4521
// see if gr/gb is weakly correlated vs. rb
4522
else if ((rg_corr < rb_corr) && (gb_corr < rb_corr))
4523
desired_dp_chan_2x2 = 1;
4524
// assume b is weakest
4525
else
4526
desired_dp_chan_2x2 = 2;
4527
}
4528
4529
// Gather all candidate encodings
4530
bool status = false;
4531
4532
// ---- Run candidate
4533
if ((global_cfg.m_use_runs) && (has_left_neighbor || has_upper_neighbor))
4534
{
4535
candidate_encoding candidate;
4536
candidate.m_coder.reserve(24);
4537
4538
candidate.m_encoding_type = encoding_type::cRun;
4539
4540
candidate.m_decomp_log_blk = prev_non_run_candidate_encoding.m_decomp_log_blk;
4541
candidate.m_coded_log_blk = prev_non_run_candidate_encoding.m_coded_log_blk;
4542
4543
memcpy(candidate.m_comp_pixels, prev_comp_pixels, sizeof(prev_comp_pixels));
4544
4545
if (!prev_run_len)
4546
{
4547
candidate.m_coder.put_bits(RUN_CODE, RUN_CODE_LEN);
4548
candidate.m_coder.put_vlc(0, 5);
4549
}
4550
else
4551
{
4552
// extend current run - compute the # of new bits needed for the extension.
4553
4554
uint32_t prev_run_bits = prev_encoding.get_total_bits_u32();
4555
assert(prev_run_bits > 0);
4556
4557
// We're not actually going to code this, because the previously emitted run code will be extended.
4558
bitwise_coder temp_coder;
4559
temp_coder.put_bits(RUN_CODE, RUN_CODE_LEN);
4560
temp_coder.put_vlc((prev_run_len + 1) - 1, 5);
4561
4562
uint32_t cur_run_bits = temp_coder.get_total_bits_u32();
4563
assert(cur_run_bits >= prev_run_bits);
4564
4565
uint32_t total_new_bits = cur_run_bits - prev_run_bits;
4566
if (total_new_bits > 0)
4567
candidate.m_coder.put_bits(0, total_new_bits); // dummy bits
4568
}
4569
4570
candidate.m_run_len = prev_run_len + 1;
4571
4572
candidates.emplace_back(std::move(candidate));
4573
}
4574
4575
// ---- Reuse candidate
4576
if ((!is_solid_block) && (global_cfg.m_lambda > 0.0f))
4577
{
4578
for (uint32_t reuse_delta_index = 0; reuse_delta_index < global_cfg.m_num_reuse_xy_deltas; reuse_delta_index++)
4579
{
4580
const int reuse_delta_x = g_reuse_xy_deltas[reuse_delta_index].m_x;
4581
const int reuse_delta_y = g_reuse_xy_deltas[reuse_delta_index].m_y;
4582
4583
const int reuse_bx = bx + reuse_delta_x, reuse_by = by + reuse_delta_y;
4584
if ((reuse_bx < 0) || (reuse_bx >= (int)num_blocks_x))
4585
continue;
4586
if (reuse_by < (int)strip_first_by)
4587
break;
4588
4589
const candidate_encoding& prev_candidate = enc_state.coded_blocks(reuse_bx, reuse_by);
4590
4591
// TODO - support this.
4592
if (prev_candidate.m_encoding_type == encoding_type::cSolid)
4593
continue;
4594
assert((prev_candidate.m_encoding_type == encoding_type::cBlock) || (prev_candidate.m_encoding_type == encoding_type::cReuse));
4595
4596
candidate_encoding candidate;
4597
candidate.m_coder.reserve(24);
4598
astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
4599
astc_helpers::log_astc_block& decomp_log_blk = candidate.m_decomp_log_blk;
4600
4601
const astc_helpers::log_astc_block& prev_coded_log_blk = prev_candidate.m_coded_log_blk;
4602
4603
const uint32_t grid_x = prev_coded_log_blk.m_grid_width, grid_y = prev_coded_log_blk.m_grid_height;
4604
const bool dual_plane = prev_candidate.m_coded_log_blk.m_dual_plane;
4605
const uint32_t num_grid_samples = grid_x * grid_y;
4606
const uint32_t num_endpoint_vals = get_num_endpoint_vals(prev_coded_log_blk.m_color_endpoint_modes[0]);
4607
4608
coded_log_blk = prev_candidate.m_coded_log_blk;
4609
decomp_log_blk = prev_candidate.m_decomp_log_blk;
4610
4611
if (prev_coded_log_blk.m_num_partitions == 1)
4612
{
4613
// Now encode the block using the transcoded endpoints
4614
basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];
4615
4616
if (prev_coded_log_blk.m_color_endpoint_modes[0] == 7)
4617
{
4618
status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints, &decoded_half[0][0], nullptr,
4619
astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
4620
}
4621
else
4622
{
4623
status = get_astc_hdr_mode_11_block_colors(coded_log_blk.m_endpoints, &decoded_half[0][0], nullptr,
4624
astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
4625
}
4626
assert(status);
4627
4628
uint8_t trial_weights0[BLOCK_W * BLOCK_H], trial_weights1[BLOCK_W * BLOCK_H];
4629
uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
4630
4631
if (dual_plane)
4632
{
4633
eval_selectors_dual_plane(prev_candidate.m_coded_log_blk.m_color_component_selector,
4634
BLOCK_W * BLOCK_H, trial_weights0, trial_weights1, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
4635
4636
downsample_ise_weights_dual_plane(
4637
coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,
4638
BLOCK_W, BLOCK_H,
4639
grid_x, grid_y,
4640
trial_weights0, trial_weights1, coded_log_blk.m_weights);
4641
4642
basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * 2, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);
4643
}
4644
else
4645
{
4646
eval_selectors(BLOCK_W * BLOCK_H, trial_weights0, coded_log_blk.m_weight_ise_range, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
4647
4648
downsample_ise_weights(
4649
coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,
4650
BLOCK_W, BLOCK_H,
4651
grid_x, grid_y,
4652
trial_weights0, coded_log_blk.m_weights);
4653
4654
basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);
4655
}
4656
4657
// Create the block the decoder would transcode into.
4658
copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk);
4659
}
4660
else if (prev_coded_log_blk.m_num_partitions == 2)
4661
{
4662
assert(!dual_plane);
4663
4664
const int unique_pat_index = g_part2_seed_to_unique_index[coded_log_blk.m_partition_id];
4665
assert((unique_pat_index >= 0) && (unique_pat_index < (int)NUM_UNIQUE_PARTITIONS2));
4666
4667
const partition_pattern_vec& pat_vec = g_partitions2[unique_pat_index];
4668
4669
vec4F part_pixels_q16[2][64];
4670
half_vec3 part_half_pixels[2][64];
4671
uint32_t part_total_pixels[2] = { 0 };
4672
4673
for (uint32_t y = 0; y < BLOCK_H; y++)
4674
{
4675
for (uint32_t x = 0; x < BLOCK_W; x++)
4676
{
4677
const uint32_t part_index = pat_vec[x + y * 6];
4678
4679
uint32_t l = part_total_pixels[part_index];
4680
4681
part_pixels_q16[part_index][l] = block_pixels_q16[y][x];
4682
part_half_pixels[part_index][l] = half_pixels[y][x];
4683
4684
part_total_pixels[part_index] = l + 1;
4685
} // x
4686
} // y
4687
4688
uint8_t blk_weights[2][BLOCK_W * BLOCK_H];
4689
4690
for (uint32_t part_index = 0; part_index < 2; part_index++)
4691
{
4692
basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];
4693
4694
if (prev_coded_log_blk.m_color_endpoint_modes[0] == 7)
4695
{
4696
status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr,
4697
astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
4698
}
4699
else
4700
{
4701
status = get_astc_hdr_mode_11_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr,
4702
astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
4703
}
4704
assert(status);
4705
4706
eval_selectors(part_total_pixels[part_index], blk_weights[part_index], coded_log_blk.m_weight_ise_range,
4707
(basist::half_float*)&part_half_pixels[part_index][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
4708
4709
} // part_index
4710
4711
uint8_t ise_weights[BLOCK_W * BLOCK_H];
4712
4713
uint32_t src_pixel_index[2] = { 0, 0 };
4714
for (uint32_t y = 0; y < BLOCK_H; y++)
4715
{
4716
for (uint32_t x = 0; x < BLOCK_W; x++)
4717
{
4718
const uint32_t part_index = pat_vec[x + y * 6];
4719
4720
ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];
4721
src_pixel_index[part_index]++;
4722
} // x
4723
} // y
4724
4725
downsample_ise_weights(
4726
coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,
4727
BLOCK_W, BLOCK_H,
4728
grid_x, grid_y,
4729
ise_weights, coded_log_blk.m_weights);
4730
4731
// Transcode these codable weights to ASTC weights.
4732
uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H];
4733
basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);
4734
4735
// Create the block the decoder would transcode into.
4736
copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk);
4737
}
4738
else if (prev_coded_log_blk.m_num_partitions == 3)
4739
{
4740
assert(!dual_plane);
4741
4742
const int unique_pat_index = g_part3_seed_to_unique_index[coded_log_blk.m_partition_id];
4743
assert((unique_pat_index >= 0) && (unique_pat_index < (int)NUM_UNIQUE_PARTITIONS3));
4744
4745
const partition_pattern_vec& pat = g_partitions3[unique_pat_index];
4746
4747
vec4F part_pixels_q16[3][64];
4748
half_vec3 part_half_pixels[3][64];
4749
uint32_t part_total_pixels[3] = { 0 };
4750
4751
for (uint32_t y = 0; y < BLOCK_H; y++)
4752
{
4753
for (uint32_t x = 0; x < BLOCK_W; x++)
4754
{
4755
const uint32_t part_index = pat.m_parts[x + y * BLOCK_W];
4756
4757
uint32_t l = part_total_pixels[part_index];
4758
4759
part_pixels_q16[part_index][l] = block_pixels_q16[y][x];
4760
part_half_pixels[part_index][l] = half_pixels[y][x];
4761
4762
part_total_pixels[part_index] = l + 1;
4763
} // x
4764
} // y
4765
4766
uint8_t blk_weights[3][BLOCK_W * BLOCK_H];
4767
4768
for (uint32_t part_index = 0; part_index < 3; part_index++)
4769
{
4770
basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];
4771
4772
status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr,
4773
astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
4774
assert(status);
4775
4776
eval_selectors(part_total_pixels[part_index], blk_weights[part_index], coded_log_blk.m_weight_ise_range,
4777
(basist::half_float*)&part_half_pixels[part_index][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
4778
4779
} // part_index
4780
4781
uint8_t ise_weights[BLOCK_W * BLOCK_H];
4782
4783
uint32_t src_pixel_index[3] = { 0 };
4784
for (uint32_t y = 0; y < BLOCK_H; y++)
4785
{
4786
for (uint32_t x = 0; x < BLOCK_W; x++)
4787
{
4788
const uint32_t part_index = pat.m_parts[x + y * BLOCK_W];
4789
4790
ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];
4791
src_pixel_index[part_index]++;
4792
} // x
4793
} // y
4794
4795
downsample_ise_weights(
4796
coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,
4797
BLOCK_W, BLOCK_H,
4798
grid_x, grid_y,
4799
ise_weights, coded_log_blk.m_weights);
4800
4801
// Transcode these codable weights to ASTC weights.
4802
uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H];
4803
basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);
4804
4805
// Create the block the decoder would transcode into.
4806
copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk);
4807
}
4808
4809
if (!validate_log_blk(decomp_log_blk))
4810
{
4811
fmt_error_printf("pack_astc_block() failed\n");
4812
return false;
4813
}
4814
4815
status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_log_blk, &candidate.m_comp_pixels[0][0]);
4816
if (!status)
4817
{
4818
fmt_error_printf("decode_astc_block() failed\n");
4819
return false;
4820
}
4821
4822
candidate.m_coder.put_bits(REUSE_CODE, REUSE_CODE_LEN);
4823
candidate.m_coder.put_bits(reuse_delta_index, REUSE_XY_DELTA_BITS);
4824
encode_values(candidate.m_coder, num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range);
4825
4826
candidate.m_encoding_type = encoding_type::cReuse;
4827
candidate.m_block_mode = prev_candidate.m_block_mode;
4828
candidate.m_endpoint_mode = prev_candidate.m_endpoint_mode;
4829
candidate.m_reuse_delta_index = reuse_delta_index;
4830
4831
candidates.emplace_back(std::move(candidate));
4832
4833
} // reuse_delta_index
4834
}
4835
4836
// ---- Solid candidate
4837
if (global_cfg.m_use_solid_blocks)
4838
{
4839
candidate_encoding candidate;
4840
candidate.m_coder.reserve(24);
4841
4842
// solid
4843
candidate.m_encoding_type = encoding_type::cSolid;
4844
4845
float r = 0.0f, g = 0.0f, b = 0.0f;
4846
const float LOG_BIAS = .125f;
4847
bool solid_block = true;
4848
for (uint32_t y = 0; y < BLOCK_H; y++)
4849
{
4850
for (uint32_t x = 0; x < BLOCK_W; x++)
4851
{
4852
if ((block_pixels[0][0][0] != block_pixels[y][x][0]) ||
4853
(block_pixels[0][0][1] != block_pixels[y][x][1]) ||
4854
(block_pixels[0][0][2] != block_pixels[y][x][2]))
4855
{
4856
solid_block = false;
4857
}
4858
4859
r += log2f(block_pixels[y][x][0] + LOG_BIAS);
4860
g += log2f(block_pixels[y][x][1] + LOG_BIAS);
4861
b += log2f(block_pixels[y][x][2] + LOG_BIAS);
4862
}
4863
}
4864
4865
if (solid_block)
4866
{
4867
r = block_pixels[0][0][0];
4868
g = block_pixels[0][0][1];
4869
b = block_pixels[0][0][2];
4870
}
4871
else
4872
{
4873
r = maximum<float>(0.0f, powf(2.0f, r * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS);
4874
g = maximum<float>(0.0f, powf(2.0f, g * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS);
4875
b = maximum<float>(0.0f, powf(2.0f, b * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS);
4876
4877
r = minimum<float>(r, basist::MAX_HALF_FLOAT);
4878
g = minimum<float>(g, basist::MAX_HALF_FLOAT);
4879
b = minimum<float>(b, basist::MAX_HALF_FLOAT);
4880
}
4881
4882
basist::half_float rh = float_to_half_non_neg_no_nan_inf(r), gh = float_to_half_non_neg_no_nan_inf(g), bh = float_to_half_non_neg_no_nan_inf(b);
4883
4884
candidate.m_solid_color[0] = rh;
4885
candidate.m_solid_color[1] = gh;
4886
candidate.m_solid_color[2] = bh;
4887
4888
candidate.m_coder.put_bits(SOLID_CODE, SOLID_CODE_LEN);
4889
4890
candidate.m_coder.put_bits(rh, 15);
4891
candidate.m_coder.put_bits(gh, 15);
4892
candidate.m_coder.put_bits(bh, 15);
4893
4894
vec3F cp(basist::half_to_float(rh), basist::half_to_float(gh), basist::half_to_float(bh));
4895
4896
for (uint32_t y = 0; y < BLOCK_H; y++)
4897
for (uint32_t x = 0; x < BLOCK_W; x++)
4898
candidate.m_comp_pixels[y][x] = cp;
4899
4900
astc_helpers::log_astc_block& log_blk = candidate.m_coded_log_blk;
4901
4902
log_blk.clear();
4903
log_blk.m_solid_color_flag_hdr = true;
4904
log_blk.m_solid_color[0] = rh;
4905
log_blk.m_solid_color[1] = gh;
4906
log_blk.m_solid_color[2] = bh;
4907
log_blk.m_solid_color[3] = basist::float_to_half(1.0f);
4908
4909
candidate.m_decomp_log_blk = log_blk;
4910
4911
candidates.emplace_back(std::move(candidate));
4912
}
4913
4914
if ((!is_solid_block) || (!global_cfg.m_use_solid_blocks))
4915
{
4916
static uint8_t s_parts2_normal[5] = { 0, 2, 4, 6, 8 };
4917
static uint8_t s_parts3_normal[5] = { 0, 0, 4, 6, 8 };
4918
4919
static uint8_t s_parts2_complex[5] = { 0, 4, 8, 10, 16 };
4920
static uint8_t s_parts3_complex[5] = { 0, 0, 8, 10, 16 };
4921
4922
static uint8_t s_parts2_very_complex[5] = { 0, 8, 12, 14, 20 };
4923
static uint8_t s_parts3_very_complex[5] = { 0, 0, 12, 14, 20 };
4924
4925
uint32_t total_parts2 = 0, total_parts3 = 0;
4926
4927
assert(comp_level < 5);
4928
if ((very_simple_block) && (comp_level <= 3))
4929
{
4930
// Block's std dev is so low that 2-3 subsets are unlikely to help much
4931
total_parts2 = 0;
4932
total_parts3 = 0;
4933
4934
debug_state.m_total_part2_stats[0].fetch_add(1, std::memory_order_relaxed);
4935
}
4936
else if (very_complex_block)
4937
{
4938
total_parts2 = s_parts2_very_complex[comp_level];
4939
total_parts3 = s_parts3_very_complex[comp_level];
4940
4941
if (global_cfg.m_extra_patterns_flag)
4942
{
4943
total_parts2 += (comp_level == 4) ? 30 : 20;
4944
total_parts3 += (comp_level == 4) ? 30 : 20;
4945
}
4946
4947
debug_state.m_total_part2_stats[2].fetch_add(1, std::memory_order_relaxed);
4948
}
4949
else if (complex_block)
4950
{
4951
total_parts2 = s_parts2_complex[comp_level];
4952
total_parts3 = s_parts3_complex[comp_level];
4953
4954
if (global_cfg.m_extra_patterns_flag)
4955
{
4956
total_parts2 += (comp_level == 4) ? 15 : 10;
4957
total_parts3 += (comp_level == 4) ? 15 : 10;
4958
}
4959
4960
debug_state.m_total_part2_stats[3].fetch_add(1, std::memory_order_relaxed);
4961
}
4962
else
4963
{
4964
// moderate complexity - use defaults
4965
total_parts2 = s_parts2_normal[comp_level];
4966
total_parts3 = s_parts3_normal[comp_level];
4967
4968
if (global_cfg.m_extra_patterns_flag)
4969
{
4970
total_parts2 += 5;
4971
total_parts3 += 5;
4972
}
4973
4974
debug_state.m_total_part2_stats[1].fetch_add(1, std::memory_order_relaxed);
4975
}
4976
4977
if (!any_2subset_enabled)
4978
total_parts2 = 0;
4979
4980
if (!any_3subset_enabled)
4981
total_parts3 = 0;
4982
4983
int best_parts2_mode11[NUM_UNIQUE_PARTITIONS2], best_parts2_mode7[NUM_UNIQUE_PARTITIONS2];
4984
bool has_estimated_parts2 = false;
4985
4986
if (total_parts2)
4987
{
4988
if (global_cfg.m_brute_force_partition_matching)
4989
{
4990
int candidate_pats2[NUM_UNIQUE_PARTITIONS2];
4991
for (uint32_t i = 0; i < NUM_UNIQUE_PARTITIONS2; i++)
4992
candidate_pats2[i] = i;
4993
4994
if (any_2subset_enabled)
4995
{
4996
estimate_partitions_mode7_and_11(
4997
2,
4998
NUM_UNIQUE_PARTITIONS2, g_partitions2,
4999
NUM_UNIQUE_PARTITIONS2, (uint32_t*)candidate_pats2,
5000
&half_pixels_as_floats[0][0],
5001
coptions,
5002
total_parts2, best_parts2_mode11, best_parts2_mode7);
5003
}
5004
5005
has_estimated_parts2 = true;
5006
}
5007
else
5008
{
5009
if (comp_level >= 1)
5010
{
5011
const uint32_t MAX_CANDIDATES2 = 48;
5012
int candidate_pats2[MAX_CANDIDATES2 * 2];
5013
5014
uint32_t num_candidate_pats2 = maximum((total_parts2 * 3) / 2, very_complex_block ? MAX_CANDIDATES2 : (MAX_CANDIDATES2 / 2));
5015
num_candidate_pats2 = minimum<uint32_t>(num_candidate_pats2, (uint32_t)std::size(candidate_pats2));
5016
5017
has_estimated_parts2 = estimate_partition2_6x6((basist::half_float(*)[3])half_pixels, candidate_pats2, num_candidate_pats2);
5018
5019
if (has_estimated_parts2)
5020
{
5021
estimate_partitions_mode7_and_11(
5022
2,
5023
NUM_UNIQUE_PARTITIONS2, g_partitions2,
5024
num_candidate_pats2, (uint32_t*)candidate_pats2,
5025
&half_pixels_as_floats[0][0],
5026
coptions,
5027
total_parts2, best_parts2_mode11, best_parts2_mode7);
5028
}
5029
}
5030
else
5031
{
5032
has_estimated_parts2 = estimate_partition2_6x6((basist::half_float(*)[3])half_pixels, best_parts2_mode11, total_parts2);
5033
5034
if ((has_estimated_parts2) && (any_2subset_mode7_enabled))
5035
memcpy(best_parts2_mode7, best_parts2_mode11, total_parts2 * sizeof(best_parts2_mode7[0]));
5036
}
5037
}
5038
}
5039
5040
int best_parts3[NUM_UNIQUE_PARTITIONS3];
5041
bool has_estimated_parts3 = false;
5042
5043
if (total_parts3)
5044
{
5045
#if 0
5046
has_estimated_parts3 = estimate_partition3_6x6((basist::half_float(*)[3])half_pixels, best_parts3, total_parts3);
5047
#elif 1
5048
if (global_cfg.m_brute_force_partition_matching)
5049
{
5050
int candidate_pats3[NUM_UNIQUE_PARTITIONS3];
5051
for (uint32_t i = 0; i < NUM_UNIQUE_PARTITIONS3; i++)
5052
candidate_pats3[i] = i;
5053
5054
estimate_partitions_mode7(
5055
3,
5056
NUM_UNIQUE_PARTITIONS3, g_partitions3,
5057
NUM_UNIQUE_PARTITIONS3, (uint32_t*)candidate_pats3,
5058
&half_pixels_as_floats[0][0],
5059
coptions,
5060
total_parts3, (uint32_t*)best_parts3);
5061
5062
has_estimated_parts3 = true;
5063
}
5064
else
5065
{
5066
const uint32_t MAX_CANDIDATES3 = 48;
5067
int candidate_pats3[MAX_CANDIDATES3 * 2];
5068
5069
uint32_t num_candidate_pats3 = maximum((total_parts3 * 3) / 2, very_complex_block ? MAX_CANDIDATES3 : (MAX_CANDIDATES3 / 2));
5070
num_candidate_pats3 = minimum<uint32_t>(num_candidate_pats3, (uint32_t)std::size(candidate_pats3));
5071
5072
has_estimated_parts3 = estimate_partition3_6x6((basist::half_float(*)[3])half_pixels, candidate_pats3, num_candidate_pats3);
5073
5074
if (has_estimated_parts3)
5075
{
5076
estimate_partitions_mode7(
5077
3,
5078
NUM_UNIQUE_PARTITIONS3, g_partitions3,
5079
num_candidate_pats3, (uint32_t*)candidate_pats3,
5080
&half_pixels_as_floats[0][0],
5081
coptions,
5082
total_parts3, (uint32_t*)best_parts3);
5083
}
5084
}
5085
#endif
5086
}
5087
5088
const opt_mode_t mode11_opt_mode = complex_block ? cWeightedLeastSquares : cOrdinaryLeastSquares;
5089
5090
// ---- Encoded block candidate
5091
for (uint32_t block_mode_iter = 0; block_mode_iter < (uint32_t)block_mode::cBMTotalModes; block_mode_iter++)
5092
{
5093
const block_mode bm = (block_mode)block_mode_iter;
5094
5095
if (comp_level == 0)
5096
{
5097
if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL0) == 0)
5098
continue;
5099
}
5100
else if (comp_level == 1)
5101
{
5102
if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL1) == 0)
5103
continue;
5104
}
5105
else if (comp_level == 2)
5106
{
5107
if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL2) == 0)
5108
continue;
5109
}
5110
5111
if (global_cfg.m_block_stat_optimizations_flag)
5112
{
5113
if ((comp_level <= 3) && (g_block_mode_descs[block_mode_iter].m_dp))
5114
{
5115
if ((global_cfg.m_lambda > 0.0f) && (!complex_block) && (g_block_mode_descs[block_mode_iter].m_grid_x == 2) && (g_block_mode_descs[block_mode_iter].m_grid_y == 2))
5116
{
5117
if (g_block_mode_descs[block_mode_iter].m_dp_channel != desired_dp_chan_2x2)
5118
continue;
5119
}
5120
else
5121
{
5122
if (g_block_mode_descs[block_mode_iter].m_dp_channel != desired_dp_chan)
5123
continue;
5124
}
5125
}
5126
5127
if (comp_level <= 3)
5128
{
5129
const uint32_t grid_x = g_block_mode_descs[block_mode_iter].m_grid_x;
5130
const uint32_t grid_y = g_block_mode_descs[block_mode_iter].m_grid_y;
5131
5132
if (!g_block_mode_descs[block_mode_iter].m_dp)
5133
{
5134
// Minor gain (.5-1% less candidates)
5135
if (very_detailed_block)
5136
{
5137
if (grid_x * grid_y <= 12)
5138
{
5139
debug_state.m_detail_stats[0].fetch_add(1, std::memory_order_relaxed);
5140
continue;
5141
}
5142
}
5143
5144
// Major gains (10-25% less candidates)
5145
if (very_blurry_block)
5146
{
5147
if ((grid_x > 4) || (grid_y > 4) || (g_block_mode_descs[block_mode_iter].m_num_partitions > 1))
5148
{
5149
debug_state.m_detail_stats[1].fetch_add(1, std::memory_order_relaxed);
5150
continue;
5151
}
5152
}
5153
if (super_blurry_block)
5154
{
5155
if ((grid_x > 3) || (grid_y > 3) || (g_block_mode_descs[block_mode_iter].m_num_partitions > 1))
5156
{
5157
debug_state.m_detail_stats[2].fetch_add(1, std::memory_order_relaxed);
5158
continue;
5159
}
5160
}
5161
}
5162
5163
if (grid_x != grid_y)
5164
{
5165
if (grid_x < grid_y)
5166
{
5167
if (!filter_horizontally)
5168
{
5169
debug_state.m_detail_stats[3].fetch_add(1, std::memory_order_relaxed);
5170
continue;
5171
}
5172
}
5173
else
5174
{
5175
if (filter_horizontally)
5176
{
5177
debug_state.m_detail_stats[4].fetch_add(1, std::memory_order_relaxed);
5178
continue;
5179
}
5180
}
5181
}
5182
}
5183
5184
if (global_cfg.m_lambda == 0.0f)
5185
{
5186
// Rarely useful if lambda=0
5187
if ((g_block_mode_descs[block_mode_iter].m_grid_x == 2) && (g_block_mode_descs[block_mode_iter].m_grid_y == 2))
5188
continue;
5189
}
5190
} // block_stat_optimizations_flag
5191
5192
if ((!use_single_subset_mode7) &&
5193
(g_block_mode_descs[block_mode_iter].m_cem == 7) &&
5194
(g_block_mode_descs[block_mode_iter].m_num_partitions == 1))
5195
{
5196
debug_state.m_total_mode7_skips.fetch_add(1, std::memory_order_relaxed);
5197
continue;
5198
}
5199
5200
for (uint32_t endpoint_mode_iter = 0; endpoint_mode_iter < (uint32_t)endpoint_mode::cTotal; endpoint_mode_iter++)
5201
{
5202
if (global_cfg.m_lambda == 0.0f)
5203
{
5204
// No use trying anything else
5205
if (endpoint_mode_iter != (uint32_t)endpoint_mode::cRaw)
5206
continue;
5207
}
5208
5209
if (global_cfg.m_disable_delta_endpoint_usage)
5210
{
5211
if ((endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpperDelta) || (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseLeftDelta))
5212
continue;
5213
}
5214
5215
if (!global_cfg.m_favor_higher_compression)
5216
{
5217
if (comp_level == 0)
5218
{
5219
if (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpperDelta)
5220
continue;
5221
}
5222
5223
if (comp_level <= 1)
5224
{
5225
if ((endpoint_mode_iter == (uint32_t)endpoint_mode::cUseLeft) || (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpper))
5226
continue;
5227
}
5228
}
5229
5230
const endpoint_mode em = (endpoint_mode)endpoint_mode_iter;
5231
5232
switch (em)
5233
{
5234
case endpoint_mode::cUseLeft:
5235
case endpoint_mode::cUseUpper:
5236
{
5237
const block_mode_desc& local_md = g_block_mode_descs[block_mode_iter];
5238
const uint32_t cem = local_md.m_cem;
5239
5240
if (local_md.m_num_partitions > 1)
5241
break;
5242
5243
if ((em == endpoint_mode::cUseLeft) && (!has_left_neighbor))
5244
break;
5245
else if ((em == endpoint_mode::cUseUpper) && (!has_upper_neighbor))
5246
break;
5247
5248
candidate_encoding candidate;
5249
candidate.m_coder.reserve(24);
5250
astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
5251
5252
int nx = bx, ny = by;
5253
if (em == endpoint_mode::cUseLeft)
5254
nx--;
5255
else
5256
ny--;
5257
5258
const candidate_encoding& neighbor_blk = enc_state.coded_blocks(nx, ny);
5259
if (neighbor_blk.m_encoding_type == encoding_type::cSolid)
5260
break;
5261
assert((neighbor_blk.m_encoding_type == encoding_type::cBlock) || (neighbor_blk.m_encoding_type == encoding_type::cReuse));
5262
5263
const block_mode_desc& neighbor_md = g_block_mode_descs[(uint32_t)neighbor_blk.m_block_mode];
5264
5265
if (neighbor_md.m_cem != cem)
5266
break;
5267
5268
assert(neighbor_blk.m_coded_log_blk.m_color_endpoint_modes[0] == cem);
5269
5270
const uint32_t grid_x = local_md.m_grid_x, grid_y = local_md.m_grid_y;
5271
const bool dual_plane = local_md.m_dp;
5272
const uint32_t num_grid_samples = grid_x * grid_y;
5273
const uint32_t num_endpoint_vals = get_num_endpoint_vals(local_md.m_cem);
5274
5275
coded_log_blk.m_grid_width = (uint8_t)grid_x;
5276
coded_log_blk.m_grid_height = (uint8_t)grid_y;
5277
coded_log_blk.m_dual_plane = (uint8_t)dual_plane;
5278
coded_log_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;
5279
coded_log_blk.m_num_partitions = 1;
5280
coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)neighbor_md.m_cem;
5281
coded_log_blk.m_weight_ise_range = (uint8_t)local_md.m_weight_ise_range;
5282
5283
// We're not explicitly writing any endpoints, just reusing existing ones. So copy the neighbor's endpoints unchanged (so no loss).
5284
coded_log_blk.m_endpoint_ise_range = neighbor_blk.m_coded_log_blk.m_endpoint_ise_range;
5285
memcpy(coded_log_blk.m_endpoints, neighbor_blk.m_coded_log_blk.m_endpoints, num_endpoint_vals);
5286
5287
uint8_t transcode_endpoints[basist::NUM_MODE11_ENDPOINTS];
5288
5289
// Requantize the neighbor's endpoints to whatever we'll have to transcode into to make a valid ASTC encoding.
5290
basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem,
5291
neighbor_blk.m_coded_log_blk.m_endpoint_ise_range, neighbor_blk.m_coded_log_blk.m_endpoints,
5292
local_md.m_transcode_endpoint_ise_range, transcode_endpoints);
5293
5294
// Now encode the block using the transcoded endpoints
5295
basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];
5296
5297
if (cem == 7)
5298
{
5299
status = get_astc_hdr_mode_7_block_colors(transcode_endpoints, &decoded_half[0][0], nullptr,
5300
astc_helpers::get_ise_levels(local_md.m_weight_ise_range), local_md.m_weight_ise_range, local_md.m_transcode_endpoint_ise_range);
5301
}
5302
else
5303
{
5304
status = get_astc_hdr_mode_11_block_colors(transcode_endpoints, &decoded_half[0][0], nullptr,
5305
astc_helpers::get_ise_levels(local_md.m_weight_ise_range), local_md.m_weight_ise_range, local_md.m_transcode_endpoint_ise_range);
5306
}
5307
if (!status)
5308
break;
5309
5310
uint8_t trial_weights0[BLOCK_W * BLOCK_H], trial_weights1[BLOCK_W * BLOCK_H];
5311
if (dual_plane)
5312
{
5313
eval_selectors_dual_plane(local_md.m_dp_channel, BLOCK_W * BLOCK_H, trial_weights0, trial_weights1, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(local_md.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
5314
5315
downsample_ise_weights_dual_plane(
5316
local_md.m_weight_ise_range, local_md.m_weight_ise_range,
5317
BLOCK_W, BLOCK_H,
5318
grid_x, grid_y,
5319
trial_weights0, trial_weights1, coded_log_blk.m_weights);
5320
}
5321
else
5322
{
5323
eval_selectors(BLOCK_W * BLOCK_H, trial_weights0, local_md.m_weight_ise_range, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(local_md.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);
5324
5325
downsample_ise_weights(
5326
local_md.m_weight_ise_range, local_md.m_weight_ise_range,
5327
BLOCK_W, BLOCK_H,
5328
grid_x, grid_y,
5329
trial_weights0, coded_log_blk.m_weights);
5330
}
5331
5332
// Transcode these codable weights to ASTC weights.
5333
uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
5334
basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, local_md.m_weight_ise_range, transcode_weights, local_md.m_transcode_weight_ise_range);
5335
5336
// Create the block the decoder would transcode into.
5337
astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
5338
decomp_blk.clear();
5339
5340
decomp_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem;
5341
decomp_blk.m_dual_plane = local_md.m_dp;
5342
decomp_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;
5343
decomp_blk.m_num_partitions = 1;
5344
decomp_blk.m_endpoint_ise_range = (uint8_t)local_md.m_transcode_endpoint_ise_range;
5345
decomp_blk.m_weight_ise_range = (uint8_t)local_md.m_transcode_weight_ise_range;
5346
5347
memcpy(decomp_blk.m_endpoints, transcode_endpoints, num_endpoint_vals);
5348
5349
copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk);
5350
5351
if (!validate_log_blk(decomp_blk))
5352
{
5353
fmt_error_printf("pack_astc_block() failed\n");
5354
return false;
5355
}
5356
5357
status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
5358
if (!status)
5359
{
5360
fmt_error_printf("decode_astc_block() failed\n");
5361
return false;
5362
}
5363
5364
candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
5365
code_block(candidate.m_coder, candidate.m_coded_log_blk, (block_mode)block_mode_iter, em, nullptr);
5366
5367
candidate.m_encoding_type = encoding_type::cBlock;
5368
candidate.m_endpoint_mode = em;
5369
candidate.m_block_mode = bm;
5370
5371
candidates.emplace_back(std::move(candidate));
5372
5373
break;
5374
}
5375
case endpoint_mode::cUseLeftDelta:
5376
case endpoint_mode::cUseUpperDelta:
5377
{
5378
const block_mode_desc& local_md = g_block_mode_descs[block_mode_iter];
5379
const uint32_t cem = local_md.m_cem;
5380
5381
if (local_md.m_num_partitions > 1)
5382
break;
5383
5384
if ((em == endpoint_mode::cUseLeftDelta) && (!has_left_neighbor))
5385
break;
5386
else if ((em == endpoint_mode::cUseUpperDelta) && (!has_upper_neighbor))
5387
break;
5388
5389
candidate_encoding candidate;
5390
candidate.m_coder.reserve(24);
5391
astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
5392
5393
int nx = bx, ny = by;
5394
if (em == endpoint_mode::cUseLeftDelta)
5395
nx--;
5396
else
5397
ny--;
5398
5399
const candidate_encoding& neighbor_blk = enc_state.coded_blocks(nx, ny);
5400
if (neighbor_blk.m_encoding_type == encoding_type::cSolid)
5401
break;
5402
assert((neighbor_blk.m_encoding_type == encoding_type::cBlock) || (neighbor_blk.m_encoding_type == encoding_type::cReuse));
5403
5404
const block_mode_desc& neighbor_md = g_block_mode_descs[(uint32_t)neighbor_blk.m_block_mode];
5405
5406
if (neighbor_md.m_cem != cem)
5407
break;
5408
5409
assert(neighbor_md.m_cem == local_md.m_cem);
5410
5411
const uint32_t grid_x = local_md.m_grid_x, grid_y = local_md.m_grid_y;
5412
const bool dual_plane = local_md.m_dp;
5413
const uint32_t num_grid_samples = grid_x * grid_y;
5414
const uint32_t num_endpoint_vals = get_num_endpoint_vals(local_md.m_cem);
5415
5416
// Dequantize neighbor's endpoints to ISE 20
5417
uint8_t neighbor_endpoints_ise20[basist::NUM_MODE11_ENDPOINTS];
5418
basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem,
5419
neighbor_blk.m_coded_log_blk.m_endpoint_ise_range, neighbor_blk.m_coded_log_blk.m_endpoints,
5420
astc_helpers::BISE_256_LEVELS, neighbor_endpoints_ise20);
5421
5422
// Requantize neighbor's endpoints to our local desired coding ISE range
5423
uint8_t neighbor_endpoints_coding_ise_local[basist::NUM_MODE11_ENDPOINTS];
5424
basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem, astc_helpers::BISE_256_LEVELS, neighbor_endpoints_ise20, local_md.m_endpoint_ise_range, neighbor_endpoints_coding_ise_local);
5425
5426
uint8_t blk_endpoints[basist::NUM_MODE11_ENDPOINTS];
5427
uint8_t blk_weights0[NUM_BLOCK_PIXELS], blk_weights1[NUM_BLOCK_PIXELS];
5428
5429
// Now try to encode the current block using the neighbor's endpoints submode.
5430
double err = 0.0f;
5431
uint32_t best_submode = 0;
5432
5433
if (cem == 7)
5434
{
5435
int maj_index, submode_index;
5436
decode_cem_7_config(neighbor_endpoints_ise20, submode_index, maj_index);
5437
5438
int first_submode = submode_index, last_submode = submode_index;
5439
5440
err = encode_astc_hdr_block_mode_7(
5441
NUM_BLOCK_PIXELS,
5442
(basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16,
5443
local_md.m_weight_ise_range,
5444
best_submode,
5445
BIG_FLOAT_VAL,
5446
blk_endpoints, blk_weights0,
5447
coptions,
5448
local_md.m_endpoint_ise_range,
5449
first_submode, last_submode,
5450
&enc_block_stats);
5451
}
5452
else
5453
{
5454
int maj_index, submode_index;
5455
decode_cem_11_config(neighbor_endpoints_ise20, submode_index, maj_index);
5456
5457
int first_submode = -1, last_submode = -1;
5458
if (maj_index == 3)
5459
{
5460
// direct
5461
}
5462
else
5463
{
5464
first_submode = submode_index;
5465
last_submode = submode_index;
5466
}
5467
5468
if (dual_plane)
5469
{
5470
err = encode_astc_hdr_block_mode_11_dual_plane(
5471
NUM_BLOCK_PIXELS,
5472
(basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16,
5473
local_md.m_dp_channel,
5474
local_md.m_weight_ise_range,
5475
best_submode,
5476
BIG_FLOAT_VAL,
5477
blk_endpoints, blk_weights0, blk_weights1,
5478
coptions,
5479
false,
5480
local_md.m_endpoint_ise_range,
5481
false, //uber_mode_flag,
5482
false,
5483
first_submode, last_submode, true);
5484
}
5485
else
5486
{
5487
err = encode_astc_hdr_block_mode_11(
5488
NUM_BLOCK_PIXELS,
5489
(basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16,
5490
local_md.m_weight_ise_range,
5491
best_submode,
5492
BIG_FLOAT_VAL,
5493
blk_endpoints, blk_weights0,
5494
coptions,
5495
false,
5496
local_md.m_endpoint_ise_range,
5497
false, //uber_mode_flag,
5498
false,
5499
first_submode, last_submode, true,
5500
mode11_opt_mode,
5501
&enc_block_stats);
5502
}
5503
}
5504
5505
if (err == BIG_FLOAT_VAL)
5506
break;
5507
5508
uint8_t endpoint_deltas[basist::NUM_MODE11_ENDPOINTS];
5509
5510
// TODO: For now, just try 5 bits for each endpoint. Can tune later.
5511
// This isn't right, it's computing the deltas in ISE space.
5512
//const uint32_t NUM_ENDPOINT_DELTA_BITS = 5;
5513
const int total_endpoint_delta_vals = 1 << NUM_ENDPOINT_DELTA_BITS;
5514
const int low_delta_limit = -(total_endpoint_delta_vals / 2), high_delta_limit = (total_endpoint_delta_vals / 2) - 1;
5515
5516
const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(local_md.m_endpoint_ise_range).m_ISE_to_rank;
5517
5518
bool all_deltas_in_limits = true;
5519
for (uint32_t i = 0; i < num_endpoint_vals; i++)
5520
{
5521
int endpoint_delta = (int)ise_to_rank[blk_endpoints[i]] - (int)ise_to_rank[neighbor_endpoints_coding_ise_local[i]];
5522
5523
if ((endpoint_delta < low_delta_limit) || (endpoint_delta > high_delta_limit))
5524
all_deltas_in_limits = false;
5525
5526
endpoint_deltas[i] = (uint8_t)(endpoint_delta + -low_delta_limit);
5527
}
5528
5529
if (all_deltas_in_limits)
5530
{
5531
coded_log_blk.m_grid_width = (uint8_t)grid_x;
5532
coded_log_blk.m_grid_height = (uint8_t)grid_y;
5533
coded_log_blk.m_dual_plane = (uint8_t)dual_plane;
5534
coded_log_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;
5535
coded_log_blk.m_num_partitions = 1;
5536
coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem;
5537
coded_log_blk.m_weight_ise_range = (uint8_t)local_md.m_weight_ise_range;
5538
coded_log_blk.m_endpoint_ise_range = (uint8_t)local_md.m_endpoint_ise_range;
5539
5540
memcpy(coded_log_blk.m_endpoints, blk_endpoints, num_endpoint_vals);
5541
5542
uint8_t transcode_endpoints[basist::NUM_MODE11_ENDPOINTS];
5543
uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
5544
5545
basist::astc_6x6_hdr::requantize_ise_endpoints(local_md.m_cem, local_md.m_endpoint_ise_range, blk_endpoints, local_md.m_transcode_endpoint_ise_range, transcode_endpoints);
5546
5547
if (dual_plane)
5548
{
5549
downsample_ise_weights_dual_plane(
5550
local_md.m_weight_ise_range, local_md.m_weight_ise_range,
5551
BLOCK_W, BLOCK_H,
5552
grid_x, grid_y,
5553
blk_weights0, blk_weights1,
5554
coded_log_blk.m_weights);
5555
}
5556
else
5557
{
5558
downsample_ise_weights(
5559
local_md.m_weight_ise_range, local_md.m_weight_ise_range,
5560
BLOCK_W, BLOCK_H,
5561
grid_x, grid_y,
5562
blk_weights0, coded_log_blk.m_weights);
5563
}
5564
5565
basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, local_md.m_weight_ise_range, transcode_weights, local_md.m_transcode_weight_ise_range);
5566
5567
// Create the block the decoder would transcode into.
5568
5569
astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
5570
decomp_blk.clear();
5571
5572
decomp_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem;
5573
decomp_blk.m_dual_plane = local_md.m_dp;
5574
decomp_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;
5575
decomp_blk.m_num_partitions = 1;
5576
decomp_blk.m_endpoint_ise_range = (uint8_t)local_md.m_transcode_endpoint_ise_range;
5577
decomp_blk.m_weight_ise_range = (uint8_t)local_md.m_transcode_weight_ise_range;
5578
5579
memcpy(decomp_blk.m_endpoints, transcode_endpoints, num_endpoint_vals);
5580
5581
copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk);
5582
5583
if (!validate_log_blk(decomp_blk))
5584
{
5585
fmt_error_printf("pack_astc_block() failed\n");
5586
return false;
5587
}
5588
5589
status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
5590
if (!status)
5591
{
5592
fmt_error_printf("decode_astc_block() failed\n");
5593
return false;
5594
}
5595
5596
candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
5597
code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, endpoint_deltas);
5598
5599
candidate.m_encoding_type = encoding_type::cBlock;
5600
candidate.m_endpoint_mode = em;
5601
candidate.m_block_mode = bm;
5602
5603
candidates.emplace_back(std::move(candidate));
5604
}
5605
5606
break;
5607
}
5608
case endpoint_mode::cRaw:
5609
{
5610
//if (candidates.size() == 339)
5611
// fmt_printf("!");
5612
5613
const auto& mode_desc = g_block_mode_descs[(uint32_t)bm];
5614
const uint32_t cem = mode_desc.m_cem;
5615
//const uint32_t num_endpoint_vals = get_num_endpoint_vals(cem);
5616
const bool dual_plane = mode_desc.m_dp;
5617
5618
if ((global_cfg.m_disable_twothree_subsets) && (mode_desc.m_num_partitions >= 2))
5619
break;
5620
5621
if (mode_desc.m_num_partitions == 3)
5622
{
5623
assert(!dual_plane);
5624
5625
if (!has_estimated_parts3)
5626
break;
5627
5628
assert(mode_desc.m_weight_ise_range == mode_desc.m_transcode_weight_ise_range);
5629
assert(mode_desc.m_endpoint_ise_range == mode_desc.m_transcode_endpoint_ise_range);
5630
5631
trial_result res;
5632
5633
status = encode_block_3_subsets(
5634
res,
5635
cem,
5636
mode_desc.m_grid_x, mode_desc.m_grid_y,
5637
mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range,
5638
&half_pixels[0][0], (vec4F*)block_pixels_q16,
5639
coptions,
5640
uber_mode_flag,
5641
best_parts3, total_parts3, comp_level, mode11_opt_mode);
5642
5643
if (!status)
5644
break;
5645
5646
assert(res.m_valid);
5647
5648
candidate_encoding candidate;
5649
candidate.m_coder.reserve(24);
5650
astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
5651
5652
coded_log_blk = res.m_log_blk;
5653
5654
astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
5655
decomp_blk = res.m_log_blk;
5656
5657
if (!validate_log_blk(decomp_blk))
5658
{
5659
fmt_error_printf("pack_astc_block() failed\n");
5660
return false;
5661
}
5662
5663
status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
5664
if (!status)
5665
{
5666
fmt_error_printf("decode_astc_block() failed\n");
5667
return false;
5668
}
5669
5670
candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
5671
code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr);
5672
5673
candidate.m_encoding_type = encoding_type::cBlock;
5674
candidate.m_endpoint_mode = em;
5675
candidate.m_block_mode = bm;
5676
5677
candidates.emplace_back(std::move(candidate));
5678
}
5679
else if (mode_desc.m_num_partitions == 2)
5680
{
5681
assert(!dual_plane);
5682
5683
if (!has_estimated_parts2)
5684
break;
5685
5686
assert(mode_desc.m_weight_ise_range == mode_desc.m_transcode_weight_ise_range);
5687
assert(mode_desc.m_endpoint_ise_range == mode_desc.m_transcode_endpoint_ise_range);
5688
5689
for (uint32_t est_part_iter = 0; est_part_iter < total_parts2; est_part_iter++)
5690
{
5691
trial_result results[2];
5692
5693
assert(((cem == 11) && any_2subset_mode11_enabled) || ((cem == 7) && any_2subset_mode7_enabled));
5694
5695
status = encode_block_2_subsets(
5696
results,
5697
mode_desc.m_grid_x, mode_desc.m_grid_y,
5698
mode_desc.m_cem,
5699
mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range,
5700
&half_pixels[0][0], (vec4F*)block_pixels_q16,
5701
coptions,
5702
uber_mode_flag,
5703
(cem == 11) ? best_parts2_mode11[est_part_iter] : best_parts2_mode7[est_part_iter],
5704
comp_level,
5705
mode11_opt_mode,
5706
true);
5707
5708
if (!status)
5709
continue;
5710
5711
for (uint32_t r_iter = 0; r_iter < 2; r_iter++)
5712
{
5713
const trial_result& res = results[r_iter];
5714
5715
if (!res.m_valid)
5716
continue;
5717
5718
candidate_encoding candidate;
5719
candidate.m_coder.reserve(24);
5720
astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
5721
5722
coded_log_blk = res.m_log_blk;
5723
5724
astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
5725
decomp_blk = res.m_log_blk;
5726
5727
if (!validate_log_blk(decomp_blk))
5728
{
5729
fmt_error_printf("pack_astc_block() failed\n");
5730
return false;
5731
}
5732
5733
status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
5734
if (!status)
5735
{
5736
fmt_error_printf("decode_astc_block() failed\n");
5737
return false;
5738
}
5739
5740
candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
5741
code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr);
5742
5743
candidate.m_encoding_type = encoding_type::cBlock;
5744
candidate.m_endpoint_mode = em;
5745
candidate.m_block_mode = bm;
5746
5747
candidates.emplace_back(std::move(candidate));
5748
5749
} // r_iter
5750
}
5751
}
5752
else
5753
{
5754
// 1 subset
5755
uint8_t blk_weights0[BLOCK_W * BLOCK_H], blk_weights1[BLOCK_W * BLOCK_H];
5756
uint32_t best_submode = 0;
5757
5758
candidate_encoding candidate;
5759
candidate.m_coder.reserve(24);
5760
astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
5761
5762
const uint32_t grid_x = mode_desc.m_grid_x, grid_y = mode_desc.m_grid_y;
5763
const uint32_t num_grid_samples = grid_x * grid_y;
5764
5765
const half_vec3* pBlock_pixels_half = &half_pixels[0][0];
5766
const vec4F* pBlock_pixels_q16 = &block_pixels_q16[0][0];
5767
5768
const uint32_t num_grid_samples_dp = num_grid_samples * (dual_plane ? 2 : 1);
5769
5770
uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
5771
5772
coded_log_blk.m_grid_width = (uint8_t)grid_x;
5773
coded_log_blk.m_grid_height = (uint8_t)grid_y;
5774
coded_log_blk.m_dual_plane = (uint8_t)dual_plane;
5775
coded_log_blk.m_color_component_selector = (uint8_t)mode_desc.m_dp_channel;
5776
coded_log_blk.m_num_partitions = 1;
5777
coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)mode_desc.m_cem;
5778
coded_log_blk.m_weight_ise_range = (uint8_t)mode_desc.m_weight_ise_range;
5779
coded_log_blk.m_endpoint_ise_range = (uint8_t)mode_desc.m_endpoint_ise_range;
5780
5781
if ((cem == 11) && (!dual_plane) && ((grid_x < BLOCK_W) || (grid_y < BLOCK_H)))
5782
{
5783
double e = encode_astc_hdr_block_downsampled_mode_11(
5784
BLOCK_W, BLOCK_H, grid_x, grid_y,
5785
mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range,
5786
NUM_BLOCK_PIXELS, (basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,
5787
BIG_FLOAT_VAL,
5788
FIRST_MODE11_SUBMODE_INDEX, MAX_MODE11_SUBMODE_INDEX, false, mode11_opt_mode,
5789
coded_log_blk.m_endpoints, coded_log_blk.m_weights, best_submode,
5790
coptions,
5791
&enc_block_stats);
5792
5793
if (e == BIG_FLOAT_VAL)
5794
break;
5795
}
5796
else
5797
{
5798
if (cem == 7)
5799
{
5800
assert(!dual_plane);
5801
5802
double e = encode_astc_hdr_block_mode_7(
5803
NUM_BLOCK_PIXELS,
5804
(basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,
5805
mode_desc.m_weight_ise_range,
5806
best_submode,
5807
BIG_FLOAT_VAL,
5808
coded_log_blk.m_endpoints,
5809
blk_weights0,
5810
coptions,
5811
mode_desc.m_endpoint_ise_range,
5812
0, MAX_MODE7_SUBMODE_INDEX,
5813
&enc_block_stats);
5814
BASISU_NOTE_UNUSED(e);
5815
}
5816
else
5817
{
5818
double e;
5819
5820
if (dual_plane)
5821
{
5822
e = encode_astc_hdr_block_mode_11_dual_plane(
5823
NUM_BLOCK_PIXELS,
5824
(basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,
5825
mode_desc.m_dp_channel,
5826
mode_desc.m_weight_ise_range,
5827
best_submode,
5828
BIG_FLOAT_VAL,
5829
coded_log_blk.m_endpoints,
5830
blk_weights0, blk_weights1,
5831
coptions,
5832
false,
5833
mode_desc.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false);
5834
}
5835
else
5836
{
5837
e = encode_astc_hdr_block_mode_11(
5838
NUM_BLOCK_PIXELS,
5839
(basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,
5840
mode_desc.m_weight_ise_range,
5841
best_submode,
5842
BIG_FLOAT_VAL,
5843
coded_log_blk.m_endpoints,
5844
blk_weights0,
5845
coptions,
5846
false,
5847
mode_desc.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false,
5848
mode11_opt_mode,
5849
&enc_block_stats);
5850
}
5851
5852
if (e == BIG_FLOAT_VAL)
5853
break;
5854
}
5855
5856
if (dual_plane)
5857
{
5858
downsample_ise_weights_dual_plane(
5859
mode_desc.m_weight_ise_range, mode_desc.m_weight_ise_range,
5860
BLOCK_W, BLOCK_H,
5861
grid_x, grid_y,
5862
blk_weights0, blk_weights1,
5863
coded_log_blk.m_weights);
5864
}
5865
else
5866
{
5867
downsample_ise_weights(
5868
mode_desc.m_weight_ise_range, mode_desc.m_weight_ise_range,
5869
BLOCK_W, BLOCK_H,
5870
grid_x, grid_y,
5871
blk_weights0, coded_log_blk.m_weights);
5872
5873
if ((comp_level >= MIN_REFINE_LEVEL) && ((grid_x < BLOCK_W) || (grid_y < BLOCK_H)))
5874
{
5875
bool refine_status = refine_endpoints(cem,
5876
mode_desc.m_endpoint_ise_range, coded_log_blk.m_endpoints,
5877
6, 6, mode_desc.m_grid_x, mode_desc.m_grid_y,
5878
coded_log_blk.m_weights, mode_desc.m_weight_ise_range,
5879
BLOCK_W * BLOCK_H,
5880
(basist::half_float(*)[3])pBlock_pixels_half, (vec4F*)pBlock_pixels_q16,
5881
nullptr,
5882
coptions, mode11_opt_mode);
5883
BASISU_NOTE_UNUSED(refine_status);
5884
}
5885
}
5886
}
5887
5888
basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples_dp, coded_log_blk.m_weights, mode_desc.m_weight_ise_range, transcode_weights, mode_desc.m_transcode_weight_ise_range);
5889
5890
// Create the block the decoder would transcode into.
5891
astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
5892
decomp_blk.clear();
5893
5894
decomp_blk.m_color_endpoint_modes[0] = (uint8_t)mode_desc.m_cem;
5895
decomp_blk.m_dual_plane = mode_desc.m_dp;
5896
decomp_blk.m_color_component_selector = (uint8_t)mode_desc.m_dp_channel;
5897
decomp_blk.m_num_partitions = 1;
5898
decomp_blk.m_endpoint_ise_range = (uint8_t)mode_desc.m_transcode_endpoint_ise_range;
5899
decomp_blk.m_weight_ise_range = (uint8_t)mode_desc.m_transcode_weight_ise_range;
5900
5901
basist::astc_6x6_hdr::requantize_ise_endpoints(mode_desc.m_cem, mode_desc.m_endpoint_ise_range, coded_log_blk.m_endpoints, mode_desc.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints);
5902
5903
copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk);
5904
5905
if (!validate_log_blk(decomp_blk))
5906
{
5907
fmt_error_printf("pack_astc_block() failed\n");
5908
return false;
5909
}
5910
5911
status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
5912
if (!status)
5913
{
5914
fmt_error_printf("decode_astc_block() failed\n");
5915
return false;
5916
}
5917
5918
candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
5919
code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr);
5920
5921
candidate.m_encoding_type = encoding_type::cBlock;
5922
candidate.m_endpoint_mode = em;
5923
candidate.m_block_mode = bm;
5924
5925
candidates.emplace_back(std::move(candidate));
5926
}
5927
5928
break;
5929
}
5930
default:
5931
assert(0);
5932
fmt_debug_printf("Invalid endpoint mode\n");
5933
return false;
5934
5935
} // switch (em)
5936
5937
} // endpoint_mode_iter
5938
5939
} // block_mode_iter
5940
5941
} // is_solid_block
5942
5943
//------------------------------------------------
5944
5945
debug_state.m_total_candidates_considered.fetch_add(candidates.size_u32(), std::memory_order_relaxed);
5946
atomic_max(debug_state.m_max_candidates_considered, candidates.size_u32());
5947
5948
for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
5949
{
5950
auto& candidate = candidates[candidate_iter];
5951
5952
for (uint32_t y = 0; y < BLOCK_H; y++)
5953
for (uint32_t x = 0; x < BLOCK_W; x++)
5954
linear_rgb_to_itp(candidate.m_comp_pixels[y][x], candidate.m_comp_pixels_itp[y][x], global_cfg);
5955
}
5956
5957
// Find best overall candidate
5958
double best_t = BIG_FLOAT_VAL;
5959
int best_candidate_index = -1;
5960
5961
float best_d_ssim = BIG_FLOAT_VAL;
5962
5963
if (global_cfg.m_lambda == 0.0f)
5964
{
5965
for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
5966
{
5967
const auto& candidate = candidates[candidate_iter];
5968
5969
float candidate_d_ssim = 1.0f - compute_block_ssim_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0]);
5970
5971
if (candidate_d_ssim < best_d_ssim)
5972
best_d_ssim = candidate_d_ssim;
5973
5974
candidate_d_ssim *= SSIM_WEIGHT;
5975
5976
float candidate_mse = MSE_WEIGHT * compute_block_mse_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0], global_cfg.m_delta_itp_dark_adjustment);
5977
5978
candidate_mse += candidate_d_ssim;
5979
5980
float total_deblock_penalty = 0.0f;
5981
if (global_cfg.m_deblocking_flag)
5982
{
5983
total_deblock_penalty = calc_deblocking_penalty_itp(bx, by, width, height, pass_src_img_itp, candidate) * global_cfg.m_deblock_penalty_weight;
5984
}
5985
candidate_mse += total_deblock_penalty * SSIM_WEIGHT;
5986
5987
if ((candidate.m_encoding_type == encoding_type::cBlock) || (candidate.m_encoding_type == encoding_type::cReuse))
5988
{
5989
// Bias the encoder away from 2 level blocks on complex blocks
5990
// TODO: Perhaps only do this on large or non-interpolated grids
5991
if (complex_block)
5992
{
5993
if (candidate.m_coded_log_blk.m_weight_ise_range == astc_helpers::BISE_2_LEVELS)
5994
{
5995
candidate_mse *= TWO_LEVEL_PENALTY;
5996
}
5997
}
5998
5999
// Bias the encoder away from smaller weight grids if the block is very complex
6000
// TODO: Use the DCT to compute an approximation of the block energy/variance retained vs. lost by downsampling.
6001
if (complex_block)
6002
{
6003
if ((candidate.m_coded_log_blk.m_grid_width == 2) && (candidate.m_coded_log_blk.m_grid_height == 2))
6004
candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY;
6005
else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 3)
6006
candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY;
6007
else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 4)
6008
candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY;
6009
}
6010
}
6011
6012
float candidate_t = candidate_mse;
6013
6014
if (candidate_t < best_t)
6015
{
6016
best_t = candidate_t;
6017
best_candidate_index = candidate_iter;
6018
}
6019
6020
} // candidate_iter
6021
6022
if (global_cfg.m_gaussian1_fallback && (outer_pass == 0) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM))
6023
{
6024
debug_state.m_total_gaussian1_blocks.fetch_add(1, std::memory_order_relaxed);
6025
continue;
6026
}
6027
6028
const float block_y_contrast_ratio = block_hy / (block_ly + .00000125f);
6029
6030
if (global_cfg.m_gaussian2_fallback && (comp_level >= 1) && (outer_pass == 1) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM) &&
6031
(block_hy >= 18.0f) && (block_y_contrast_ratio > 150.0f) &&
6032
(block_avg_y >= 1.5f))
6033
{
6034
debug_state.m_total_gaussian2_blocks.fetch_add(1, std::memory_order_relaxed);
6035
continue;
6036
}
6037
}
6038
else
6039
{
6040
assert(enc_state.smooth_block_mse_scales.get_width() > 0);
6041
6042
// Compute block's perceptual weighting
6043
float perceptual_scale = 0.0f;
6044
for (uint32_t y = 0; y < BLOCK_H; y++)
6045
for (uint32_t x = 0; x < BLOCK_W; x++)
6046
perceptual_scale = basisu::maximumf(perceptual_scale, enc_state.smooth_block_mse_scales.at_clamped(bx * BLOCK_W + x, by * BLOCK_H + y));
6047
6048
// Very roughly normalize the computed distortion vs. bits.
6049
perceptual_scale *= 10.0f;
6050
6051
for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
6052
{
6053
auto& candidate = candidates[candidate_iter];
6054
6055
float d_ssim = 1.0f - compute_block_ssim_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0]);
6056
6057
if (d_ssim < best_d_ssim)
6058
best_d_ssim = (float)d_ssim;
6059
6060
d_ssim *= SSIM_WEIGHT;
6061
6062
float candidate_mse = MSE_WEIGHT * compute_block_mse_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0], global_cfg.m_delta_itp_dark_adjustment);
6063
6064
candidate_mse += d_ssim;
6065
6066
float total_deblock_penalty = 0.0f;
6067
if (global_cfg.m_deblocking_flag)
6068
{
6069
total_deblock_penalty = calc_deblocking_penalty_itp(bx, by, width, height, pass_src_img_itp, candidate) * global_cfg.m_deblock_penalty_weight;
6070
}
6071
candidate_mse += total_deblock_penalty * SSIM_WEIGHT;
6072
6073
if ((candidate.m_encoding_type == encoding_type::cBlock) || (candidate.m_encoding_type == encoding_type::cReuse))
6074
{
6075
// Bias the encoder away from 2 level blocks on complex blocks
6076
if (complex_block)
6077
{
6078
if (candidate.m_coded_log_blk.m_weight_ise_range == astc_helpers::BISE_2_LEVELS)
6079
{
6080
candidate_mse *= TWO_LEVEL_PENALTY;
6081
}
6082
}
6083
6084
// Bias the encoder away from smaller weight grids if the block is very complex
6085
if (complex_block)
6086
{
6087
if ((candidate.m_coded_log_blk.m_grid_width == 2) && (candidate.m_coded_log_blk.m_grid_height == 2))
6088
candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY;
6089
else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 3)
6090
candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY;
6091
else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 4)
6092
candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY;
6093
}
6094
}
6095
6096
float mode_penalty = 1.0f;
6097
if (candidate.m_encoding_type == encoding_type::cSolid)
6098
mode_penalty *= SOLID_PENALTY;
6099
else if (candidate.m_encoding_type == encoding_type::cReuse)
6100
mode_penalty *= REUSE_PENALTY;
6101
else if (candidate.m_encoding_type == encoding_type::cRun)
6102
mode_penalty *= (complex_block ? RUN_PENALTY * 2.0f : RUN_PENALTY);
6103
6104
float candidate_bits = (float)candidate.m_coder.get_total_bits();
6105
float candidate_d = candidate_mse * mode_penalty;
6106
6107
const float D_POWER = 2.0f;
6108
float candidate_t = perceptual_scale * powf(candidate_d, D_POWER) + candidate_bits * (global_cfg.m_lambda * 1000.0f);
6109
6110
candidate.m_t = candidate_t;
6111
candidate.m_d = candidate_d;
6112
candidate.m_bits = candidate_bits;
6113
6114
if (candidate_t < best_t)
6115
{
6116
best_t = candidate_t;
6117
best_candidate_index = candidate_iter;
6118
}
6119
6120
} // candidate_iter
6121
6122
if (global_cfg.m_gaussian1_fallback && (outer_pass == 0) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM))
6123
{
6124
debug_state.m_total_gaussian1_blocks.fetch_add(1, std::memory_order_relaxed);
6125
continue;
6126
}
6127
6128
const float block_y_contrast_ratio = block_hy / (block_ly + .00000125f);
6129
6130
if (global_cfg.m_gaussian2_fallback && (comp_level >= 1) && (outer_pass == 1) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM) &&
6131
(block_hy >= 18.0f) && (block_y_contrast_ratio > 150.0f) &&
6132
(block_avg_y >= 1.5f))
6133
{
6134
debug_state.m_total_gaussian2_blocks.fetch_add(1, std::memory_order_relaxed);
6135
continue;
6136
}
6137
6138
if (global_cfg.m_rdo_candidate_diversity_boost)
6139
{
6140
// candidate diversity boosting - consider candidates along/near the Pareto front
6141
const candidate_encoding& comp_candidate = candidates[best_candidate_index];
6142
6143
float best_d = BIG_FLOAT_VAL;
6144
6145
for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
6146
{
6147
const auto& candidate = candidates[candidate_iter];
6148
6149
if (candidate.m_bits <= comp_candidate.m_bits * global_cfg.m_rdo_candidate_diversity_boost_bit_window_weight)
6150
{
6151
if (candidate.m_d < best_d)
6152
{
6153
best_d = candidate.m_d;
6154
best_candidate_index = candidate_iter;
6155
}
6156
}
6157
}
6158
}
6159
6160
// candidate JND optimization - if there's a cheaper to code candidate that is nearly equivalent visually to the best candidate chosen, choose that
6161
if (global_cfg.m_jnd_optimization)
6162
{
6163
const candidate_encoding& cur_comp_candidate = candidates[best_candidate_index];
6164
6165
float new_best_candidate_bits = BIG_FLOAT_VAL;
6166
int new_best_candidate_index = -1;
6167
6168
for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
6169
{
6170
if ((int)candidate_iter == best_candidate_index)
6171
continue;
6172
6173
const auto& candidate = candidates[candidate_iter];
6174
6175
if (candidate.m_bits >= cur_comp_candidate.m_bits)
6176
continue;
6177
6178
float max_delta_itp = 0.0f;
6179
for (uint32_t y = 0; y < BLOCK_H; y++)
6180
{
6181
for (uint32_t x = 0; x < BLOCK_W; x++)
6182
{
6183
float delta_itp = compute_pixel_delta_itp(cur_comp_candidate.m_comp_pixels_itp[y][x], candidate.m_comp_pixels_itp[y][x], block_pixels_as_itp[y][x], global_cfg.m_delta_itp_dark_adjustment);
6184
max_delta_itp = maximum(max_delta_itp, delta_itp);
6185
6186
if (max_delta_itp >= global_cfg.m_jnd_delta_itp_thresh)
6187
goto skip;
6188
}
6189
}
6190
6191
skip:
6192
if (max_delta_itp >= global_cfg.m_jnd_delta_itp_thresh)
6193
continue;
6194
6195
if (candidate.m_bits < new_best_candidate_bits)
6196
{
6197
new_best_candidate_bits = candidate.m_bits;
6198
new_best_candidate_index = candidate_iter;
6199
}
6200
}
6201
6202
if (new_best_candidate_index != -1)
6203
{
6204
best_candidate_index = new_best_candidate_index;
6205
debug_state.m_total_jnd_replacements.fetch_add(1, std::memory_order_relaxed);
6206
}
6207
}
6208
6209
} // if (lambda == 0.0f)
6210
6211
if (global_cfg.m_debug_images)
6212
{
6213
std::lock_guard<std::mutex> lck(debug_state.m_stat_vis_mutex);
6214
debug_state.m_stat_vis.fill_box(bx * 6, by * 6, 6, 6, vec4F(best_d_ssim, max_std_dev, lowpass_std_dev, 1.0f));
6215
}
6216
6217
if (best_candidate_index < 0)
6218
{
6219
assert(best_candidate_index >= 0);
6220
fmt_error_printf("No candidates!\n");
6221
return false;
6222
}
6223
6224
const auto& best_candidate = candidates[best_candidate_index];
6225
6226
assert(best_candidate.m_encoding_type != encoding_type::cInvalid);
6227
6228
if (best_candidate.m_encoding_type == encoding_type::cRun)
6229
{
6230
if (!prev_run_len)
6231
{
6232
if (prev_encoding.get_total_bits())
6233
{
6234
#if SYNC_MARKERS
6235
strip_coded_bits.put_bits(0xDEAD, 16);
6236
#endif
6237
6238
strip_coded_bits.append(prev_encoding);
6239
}
6240
6241
assert(best_candidate.m_coder.get_total_bits());
6242
6243
prev_encoding = best_candidate.m_coder;
6244
6245
prev_run_len = 1;
6246
}
6247
else
6248
{
6249
prev_run_len++;
6250
6251
const uint32_t prev_run_bits = prev_encoding.get_total_bits_u32();
6252
assert(prev_run_bits);
6253
BASISU_NOTE_UNUSED(prev_run_bits);
6254
6255
const uint32_t num_dummy_bits = best_candidate.m_coder.get_total_bits_u32();
6256
BASISU_NOTE_UNUSED(num_dummy_bits);
6257
6258
// Rewrite the previous encoding to extend the run length.
6259
prev_encoding.restart();
6260
prev_encoding.put_bits(RUN_CODE, RUN_CODE_LEN);
6261
prev_encoding.put_vlc(prev_run_len - 1, 5);
6262
6263
assert(prev_encoding.get_total_bits() == prev_run_bits + num_dummy_bits);
6264
}
6265
}
6266
else
6267
{
6268
if (prev_encoding.get_total_bits())
6269
{
6270
#if SYNC_MARKERS
6271
strip_coded_bits.put_bits(0xDEAD, 16);
6272
#endif
6273
6274
strip_coded_bits.append(prev_encoding);
6275
}
6276
6277
prev_encoding = best_candidate.m_coder;
6278
prev_run_len = 0;
6279
}
6280
6281
memcpy(prev_comp_pixels, best_candidate.m_comp_pixels, sizeof(vec3F) * BLOCK_W * BLOCK_H);
6282
6283
prev_candidate_encoding = best_candidate;
6284
6285
if (best_candidate.m_encoding_type != encoding_type::cRun)
6286
prev_non_run_candidate_encoding = best_candidate;
6287
6288
{
6289
std::lock_guard<std::mutex> lck(debug_state.m_stats_mutex);
6290
6291
debug_state.m_encoding_type_hist[(uint32_t)best_candidate.m_encoding_type]++;
6292
6293
if (best_candidate.m_encoding_type == encoding_type::cBlock)
6294
{
6295
debug_state.m_endpoint_mode_hist[(uint32_t)best_candidate.m_endpoint_mode]++;
6296
}
6297
6298
if ((best_candidate.m_encoding_type == encoding_type::cReuse) || (best_candidate.m_encoding_type == encoding_type::cBlock))
6299
{
6300
const uint32_t bm_index = (uint32_t)best_candidate.m_block_mode;
6301
assert(bm_index < (uint32_t)block_mode::cBMTotalModes);
6302
6303
debug_state.m_block_mode_hist[bm_index]++;
6304
debug_state.m_block_mode_total_bits[bm_index] += best_candidate.m_coder.get_total_bits();
6305
6306
for (uint32_t i = 0; i < 3; i++)
6307
{
6308
debug_state.m_block_mode_comp_stats[bm_index][i].push_back(half_comp_stats[i]);
6309
debug_state.m_block_mode_comparative_stats[bm_index][i].push_back(half_cross_chan_stats[i]);
6310
}
6311
}
6312
6313
if (best_candidate.m_encoding_type == encoding_type::cReuse)
6314
{
6315
debug_state.m_reuse_num_parts[best_candidate.m_coded_log_blk.m_num_partitions].fetch_add(1, std::memory_order_relaxed);
6316
6317
if (best_candidate.m_coded_log_blk.m_dual_plane)
6318
debug_state.m_reuse_total_dp.fetch_add(1, std::memory_order_relaxed);
6319
}
6320
}
6321
6322
enc_state.coded_blocks(bx, by) = prev_non_run_candidate_encoding;
6323
6324
// Update decoded image
6325
vec4F decoded_float_pixels[BLOCK_H][BLOCK_W];
6326
for (uint32_t y = 0; y < BLOCK_H; y++)
6327
for (uint32_t x = 0; x < BLOCK_W; x++)
6328
decoded_float_pixels[y][x] = best_candidate.m_comp_pixels[y][x];
6329
6330
enc_state.packed_img.set_block_clipped((vec4F*)decoded_float_pixels, bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H);
6331
6332
status = astc_helpers::pack_astc_block(enc_state.final_astc_blocks(bx, by), best_candidate.m_decomp_log_blk, nullptr, nullptr);
6333
if (!status)
6334
{
6335
fmt_error_printf("Failed packing block\n");
6336
return false;
6337
}
6338
6339
const uint32_t r = debug_state.m_total_blocks_compressed.fetch_add(1, std::memory_order_relaxed);
6340
if ((r & 2047) == 2047)
6341
{
6342
if (global_cfg.m_status_output)
6343
{
6344
basisu::fmt_printf("{} of {} total blocks compressed, {3.2}%\n", r, total_blocks, (r * 100.0f) / total_blocks);
6345
}
6346
}
6347
6348
if ((global_cfg.m_debug_images) &&
6349
((best_candidate.m_encoding_type != encoding_type::cRun) && (best_candidate.m_encoding_type != encoding_type::cSolid)))
6350
{
6351
std::lock_guard<std::mutex> lck(debug_state.m_vis_image_mutex);
6352
6353
if (best_candidate.m_decomp_log_blk.m_num_partitions == 2)
6354
{
6355
const int part2_unique_index = g_part2_seed_to_unique_index[best_candidate.m_decomp_log_blk.m_partition_id];
6356
assert((part2_unique_index >= 0) && (part2_unique_index < (int)NUM_UNIQUE_PARTITIONS2));
6357
6358
const partition_pattern_vec& pat = g_partitions2[part2_unique_index];
6359
6360
for (uint32_t y = 0; y < 6; y++)
6361
{
6362
for (uint32_t x = 0; x < 6; x++)
6363
{
6364
const uint32_t p = pat[x + y * 6];
6365
debug_state.m_part_vis.set_clipped(bx * 6 + x, by * 6 + y, color_rgba(p ? 100 : 0, 128, p ? 100 : 0, 255));
6366
} // x
6367
} // y
6368
}
6369
else if (best_candidate.m_decomp_log_blk.m_num_partitions == 3)
6370
{
6371
//part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(0, 0, 255, 255));
6372
6373
const int part3_unique_index = g_part3_seed_to_unique_index[best_candidate.m_decomp_log_blk.m_partition_id];
6374
assert((part3_unique_index >= 0) && (part3_unique_index < (int)NUM_UNIQUE_PARTITIONS3));
6375
6376
const partition_pattern_vec& pat = g_partitions3[part3_unique_index];
6377
6378
for (uint32_t y = 0; y < 6; y++)
6379
{
6380
for (uint32_t x = 0; x < 6; x++)
6381
{
6382
const uint32_t p = pat[x + y * 6];
6383
color_rgba c(0, 0, 150, 255);
6384
if (p == 1)
6385
c.set(100, 0, 150, 255);
6386
else if (p == 2)
6387
c.set(0, 100, 150, 255);
6388
debug_state.m_part_vis.set_clipped(bx * 6 + x, by * 6 + y, c);
6389
} // x
6390
} // y
6391
}
6392
else if (best_candidate.m_decomp_log_blk.m_dual_plane)
6393
{
6394
debug_state.m_part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(255, 0, 255, 255));
6395
}
6396
else
6397
{
6398
debug_state.m_part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(255, 0, 0, 255));
6399
}
6400
6401
color_rgba c;
6402
c.set((best_candidate.m_coded_log_blk.m_grid_width * best_candidate.m_coded_log_blk.m_grid_height * 255 + 18) / 36);
6403
debug_state.m_grid_vis.fill_box(bx * 6, by * 6, 6, 6, c);
6404
6405
c.set(0, 0, 0, 255);
6406
if (complex_block)
6407
c[0] = 255;
6408
6409
if (very_complex_block)
6410
c[1] = 255;
6411
6412
if (outer_pass == 2)
6413
c[2] = 255;
6414
else if (outer_pass == 1)
6415
c[2] = 128;
6416
6417
debug_state.m_mode_vis.fill_box(bx * 6, by * 6, 6, 6, c);
6418
6419
c.set(0, 255, 0, 255);
6420
if (best_candidate.m_coded_log_blk.m_color_endpoint_modes[0] == 7)
6421
c.set(255, 0, 0, 255);
6422
debug_state.m_mode_vis2.fill_box(bx * 6, by * 6, 6, 6, c);
6423
6424
switch (best_candidate.m_encoding_type)
6425
{
6426
case encoding_type::cRun:
6427
c.set(0, 0, 0, 255);
6428
break;
6429
case encoding_type::cSolid:
6430
c.set(128, 128, 128, 255); // dark grey
6431
break;
6432
case encoding_type::cReuse:
6433
c.set(255, 255, 0, 255); // yellow
6434
break;
6435
case encoding_type::cBlock:
6436
{
6437
switch (best_candidate.m_endpoint_mode)
6438
{
6439
case endpoint_mode::cRaw:
6440
c.set(255, 0, 0, 255); // red
6441
break;
6442
case endpoint_mode::cUseLeft:
6443
c.set(0, 0, 255, 255); // blue
6444
break;
6445
case endpoint_mode::cUseUpper:
6446
c.set(0, 0, 192, 255); // darker blue
6447
break;
6448
case endpoint_mode::cUseLeftDelta:
6449
c.set(0, 255, 0, 255); // green
6450
break;
6451
case endpoint_mode::cUseUpperDelta:
6452
c.set(0, 192, 0, 255); // darker green
6453
break;
6454
default:
6455
break;
6456
}
6457
6458
break;
6459
}
6460
default:
6461
break;
6462
}
6463
6464
if (filtered_x_err < filtered_y_err)
6465
c[3] = 0;
6466
else
6467
c[3] = 255;
6468
6469
debug_state.m_enc_vis.fill_box(bx * 6, by * 6, 6, 6, c);
6470
}
6471
6472
break;
6473
6474
} // outer_pass
6475
6476
} // bx
6477
6478
} // by
6479
6480
if (prev_encoding.get_total_bits())
6481
{
6482
#if SYNC_MARKERS
6483
strip_coded_bits.put_bits(0xDEAD, 16);
6484
#endif
6485
6486
strip_coded_bits.append(prev_encoding);
6487
}
6488
6489
return true;
6490
}
6491
6492
// Set to true by global_init() after its init_*() calls have completed.
// compress_photo() asserts on this flag and returns false if it is still unset.
// NOTE(review): this is a plain bool with no synchronization visible here -
// presumably global_init() is expected to run before any concurrent use; confirm.
bool g_initialized = false;
6493
6494
void global_init()
6495
{
6496
if (g_initialized)
6497
return;
6498
6499
interval_timer tm;
6500
tm.start();
6501
6502
init_pq_tables();
6503
6504
init_partitions2_6x6();
6505
init_partitions3_6x6();
6506
6507
init_contrib_lists();
6508
6509
g_initialized = true;
6510
6511
//fmt_printf("astc_6x6_hdr::global_init() total time: {}\n", tm.get_elapsed_secs());
6512
}
6513
6514
bool compress_photo(const basisu::imagef &orig_src_img, const astc_hdr_6x6_global_config &orig_global_cfg, job_pool *pJob_pool,
6515
basisu::uint8_vec& intermediate_tex_data, basisu::uint8_vec& astc_tex_data, result_metrics& metrics)
6516
{
6517
assert(g_initialized);
6518
if (!g_initialized)
6519
return false;
6520
6521
assert(pJob_pool);
6522
6523
if (orig_global_cfg.m_debug_output)
6524
{
6525
fmt_debug_printf("------ astc_6x6_hdr::compress_photo:\n");
6526
fmt_debug_printf("Source image dimensions: {}x{}\n", orig_src_img.get_width(), orig_src_img.get_height());
6527
fmt_debug_printf("Job pool total threads: {}\n", (uint64_t)pJob_pool->get_total_threads());
6528
orig_global_cfg.print();
6529
}
6530
6531
if (!orig_src_img.get_width() || !orig_src_img.get_height())
6532
{
6533
assert(false);
6534
fmt_error_printf("compress_photo: Invalid source image\n");
6535
return false;
6536
}
6537
6538
astc_hdr_6x6_global_config global_cfg(orig_global_cfg);
6539
6540
uastc_hdr_6x6_encode_state enc_state;
6541
enc_state.master_coptions.m_q_log_bias = Q_LOG_BIAS_6x6;
6542
enc_state.src_img = orig_src_img;
6543
6544
//src_img.crop(256, 256);
6545
6546
const uint32_t width = enc_state.src_img.get_width();
6547
const uint32_t height = enc_state.src_img.get_height();
6548
const uint32_t num_blocks_x = enc_state.src_img.get_block_width(BLOCK_W);
6549
const uint32_t num_blocks_y = enc_state.src_img.get_block_height(BLOCK_H);
6550
const uint32_t total_blocks = num_blocks_x * num_blocks_y;
6551
6552
for (uint32_t y = 0; y < height; y++)
6553
{
6554
for (uint32_t x = 0; x < width; x++)
6555
{
6556
for (uint32_t c = 0; c < 3; c++)
6557
{
6558
float f = enc_state.src_img(x, y)[c];
6559
6560
if (std::isinf(f) || std::isnan(f) || (f < 0.0f))
6561
f = 0;
6562
else if (f > basist::ASTC_HDR_MAX_VAL)
6563
f = basist::ASTC_HDR_MAX_VAL;
6564
6565
enc_state.src_img(x, y)[c] = f;
6566
6567
} // c
6568
6569
} // x
6570
} // y
6571
6572
if (global_cfg.m_debug_images)
6573
{
6574
write_exr((global_cfg.m_debug_image_prefix + "orig.exr").c_str(), enc_state.src_img, 3, 0);
6575
}
6576
6577
image src_img_compressed;
6578
tonemap_image_compressive2(src_img_compressed, enc_state.src_img);
6579
6580
if (global_cfg.m_debug_images)
6581
{
6582
save_png(global_cfg.m_debug_image_prefix + "compressive_tone_map.png", src_img_compressed);
6583
}
6584
6585
smooth_map_params rp;
6586
rp.m_debug_images = global_cfg.m_debug_images;
6587
6588
if (global_cfg.m_lambda != 0.0f)
6589
{
6590
if (global_cfg.m_status_output)
6591
fmt_printf("Creating RDO perceptual weighting maps\n");
6592
6593
create_smooth_maps2(enc_state.smooth_block_mse_scales, src_img_compressed, rp);
6594
}
6595
6596
if (global_cfg.m_status_output)
6597
fmt_printf("Blurring image\n");
6598
6599
enc_state.src_img_filtered1.resize(width, height);
6600
image_resample(enc_state.src_img, enc_state.src_img_filtered1, "gaussian", global_cfg.m_gaussian1_strength); //1.45f);
6601
6602
enc_state.src_img_filtered2.resize(width, height);
6603
image_resample(enc_state.src_img, enc_state.src_img_filtered2, "gaussian", global_cfg.m_gaussian2_strength); //1.83f);
6604
6605
if (global_cfg.m_debug_images)
6606
{
6607
write_exr((global_cfg.m_debug_image_prefix + "blurred1.exr").c_str(), enc_state.src_img_filtered1, 3, 0);
6608
write_exr((global_cfg.m_debug_image_prefix + "blurred2.exr").c_str(), enc_state.src_img_filtered2, 3, 0);
6609
}
6610
6611
if (global_cfg.m_status_output)
6612
fmt_printf("Transforming to ITP\n");
6613
6614
enc_state.src_img_itp.resize(width, height);
6615
convet_rgb_image_to_itp(enc_state.src_img, enc_state.src_img_itp, global_cfg);
6616
6617
enc_state.src_img_filtered1_itp.resize(width, height);
6618
convet_rgb_image_to_itp(enc_state.src_img_filtered1, enc_state.src_img_filtered1_itp, global_cfg);
6619
6620
enc_state.src_img_filtered2_itp.resize(width, height);
6621
convet_rgb_image_to_itp(enc_state.src_img_filtered2, enc_state.src_img_filtered2_itp, global_cfg);
6622
6623
if (global_cfg.m_lambda == 0.0f)
6624
global_cfg.m_favor_higher_compression = false;
6625
6626
uint32_t total_strips = 0, rows_per_strip = 0;
6627
if (!calc_strip_size(global_cfg.m_lambda, num_blocks_y, (uint32_t)pJob_pool->get_total_threads(), global_cfg.m_force_one_strip, total_strips, rows_per_strip, global_cfg))
6628
{
6629
fmt_error_printf("compress_photo: Failed computing strip sizes\n");
6630
return false;
6631
}
6632
6633
if (global_cfg.m_debug_output)
6634
fmt_printf("lambda: {}, comp_level: {}, highest_comp_level: {}, extra patterns: {}\n", global_cfg.m_lambda, global_cfg.m_master_comp_level, global_cfg.m_highest_comp_level, global_cfg.m_extra_patterns_flag);
6635
6636
enc_state.coded_blocks.resize(num_blocks_x, num_blocks_y);
6637
6638
bitwise_coder coded_bits;
6639
6640
coded_bits.put_bits(0xABCD, 16);
6641
coded_bits.put_bits(width, 16);
6642
coded_bits.put_bits(height, 16);
6643
6644
enc_state.packed_img.resize(width, height);
6645
6646
enc_state.strip_bits.resize(total_strips);
6647
6648
enc_state.final_astc_blocks.resize(num_blocks_x, num_blocks_y);
6649
6650
uastc_hdr_6x6_debug_state debug_state;
6651
6652
if (global_cfg.m_debug_images)
6653
debug_state.init(width, height);
6654
else
6655
debug_state.init(0, 0);
6656
6657
interval_timer tm;
6658
tm.start();
6659
6660
std::atomic_bool any_failed_flag;
6661
any_failed_flag.store(false);
6662
6663
for (uint32_t strip_index = 0; strip_index < total_strips; strip_index++)
6664
{
6665
const uint32_t strip_first_by = strip_index * rows_per_strip;
6666
6667
uint32_t strip_last_by = minimum<uint32_t>(strip_first_by + rows_per_strip - 1, num_blocks_y);
6668
if (strip_index == (total_strips - 1))
6669
strip_last_by = num_blocks_y - 1;
6670
6671
pJob_pool->add_job([&any_failed_flag, &global_cfg, &debug_state, &enc_state,
6672
strip_index, total_strips, strip_first_by, strip_last_by,
6673
num_blocks_x, num_blocks_y, total_blocks, width, height]
6674
{
6675
if (!any_failed_flag)
6676
{
6677
bool status = compress_strip_task(
6678
strip_index, total_strips, strip_first_by, strip_last_by,
6679
num_blocks_x, num_blocks_y, total_blocks, width, height,
6680
global_cfg, debug_state, enc_state);
6681
6682
if (!status)
6683
{
6684
fmt_error_printf("compress_photo: compress_strip_task() failed\n");
6685
any_failed_flag.store(true, std::memory_order_relaxed);
6686
}
6687
}
6688
} );
6689
6690
if (any_failed_flag)
6691
break;
6692
6693
} // strip_index
6694
6695
pJob_pool->wait_for_all();
6696
6697
if (any_failed_flag)
6698
{
6699
fmt_error_printf("One or more strips failed during compression\n");
6700
return false;
6701
}
6702
6703
if (global_cfg.m_debug_output)
6704
fmt_printf("Encoding time: {} secs\n", tm.get_elapsed_secs());
6705
6706
if (global_cfg.m_debug_output)
6707
debug_state.print(total_blocks);
6708
6709
if (global_cfg.m_debug_images)
6710
{
6711
save_png(global_cfg.m_debug_image_prefix + "part_vis.png", debug_state.m_part_vis);
6712
save_png(global_cfg.m_debug_image_prefix + "grid_vis.png", debug_state.m_grid_vis);
6713
save_png(global_cfg.m_debug_image_prefix + "mode_vis.png", debug_state.m_mode_vis);
6714
save_png(global_cfg.m_debug_image_prefix + "mode_vis2.png", debug_state.m_mode_vis2);
6715
save_png(global_cfg.m_debug_image_prefix + "enc_vis.png", debug_state.m_enc_vis);
6716
write_exr((global_cfg.m_debug_image_prefix + "stat_vis.exr").c_str(), debug_state.m_stat_vis, 3, 0);
6717
}
6718
6719
for (uint32_t i = 0; i < total_strips; i++)
6720
coded_bits.append(enc_state.strip_bits[i]);
6721
6722
coded_bits.put_bits(0xA742, 16);
6723
6724
coded_bits.flush();
6725
6726
if (global_cfg.m_output_images)
6727
{
6728
write_exr((global_cfg.m_output_image_prefix + "comp.exr").c_str(), enc_state.packed_img, 3, 0);
6729
}
6730
6731
if (global_cfg.m_debug_output)
6732
fmt_printf("\nTotal intermediate output bits/pixel: {3.4}\n", (float)coded_bits.get_total_bits() / (float)(width * height));
6733
6734
vector2D<astc_helpers::astc_block> decoded_blocks1;
6735
vector2D<astc_helpers::astc_block> decoded_blocks2;
6736
6737
if (global_cfg.m_debug_output)
6738
fmt_printf("decode_file\n");
6739
6740
uint32_t unpacked_width = 0, unpacked_height = 0;
6741
bool status = decode_file(coded_bits.get_bytes(), decoded_blocks1, unpacked_width, unpacked_height);
6742
if (!status)
6743
{
6744
fmt_error_printf("decode_file() failed\n");
6745
return false;
6746
}
6747
6748
if (global_cfg.m_debug_output)
6749
fmt_printf("decode_6x6_hdr\n");
6750
6751
status = decode_6x6_hdr(coded_bits.get_bytes().get_ptr(), coded_bits.get_bytes().size_in_bytes_u32(), decoded_blocks2, unpacked_width, unpacked_height);
6752
if (!status)
6753
{
6754
fmt_error_printf("decode_6x6_hdr_file() failed\n");
6755
return false;
6756
}
6757
6758
if ((enc_state.final_astc_blocks.get_width() != decoded_blocks1.get_width()) ||
6759
(enc_state.final_astc_blocks.get_height() != decoded_blocks1.get_height()))
6760
{
6761
fmt_error_printf("Decode size mismatch with decode_file\n");
6762
return false;
6763
}
6764
6765
if ((enc_state.final_astc_blocks.get_width() != decoded_blocks2.get_width()) ||
6766
(enc_state.final_astc_blocks.get_height() != decoded_blocks2.get_height()))
6767
{
6768
fmt_error_printf("Decode size mismatch with decode_6x6_hdr_file\n");
6769
return false;
6770
}
6771
6772
if (memcmp(decoded_blocks1.get_ptr(), enc_state.final_astc_blocks.get_ptr(), decoded_blocks1.size_in_bytes()) != 0)
6773
{
6774
fmt_error_printf("Decoded ASTC blocks verification failed\n");
6775
return false;
6776
}
6777
6778
if (memcmp(decoded_blocks2.get_ptr(), enc_state.final_astc_blocks.get_ptr(), decoded_blocks2.size_in_bytes()) != 0)
6779
{
6780
fmt_error_printf("Decoded ASTC blocks verification failed\n");
6781
return false;
6782
}
6783
6784
if (global_cfg.m_debug_output)
6785
basisu::fmt_printf("Decoded ASTC verification checks succeeded\n");
6786
6787
if (global_cfg.m_output_images)
6788
{
6789
if (write_astc_file((global_cfg.m_output_image_prefix + "decoded.astc").c_str(), decoded_blocks1.get_ptr(), BLOCK_W, BLOCK_H, width, height))
6790
{
6791
basisu::platform_sleep(20);
6792
6793
uint8_vec astc_file_data;
6794
if (read_file_to_vec((global_cfg.m_output_image_prefix + "decoded.astc").c_str(), astc_file_data))
6795
{
6796
if (astc_file_data.size() > 16)
6797
{
6798
astc_file_data.erase(0, 16);
6799
6800
size_t comp_size = 0;
6801
void* pComp_data = tdefl_compress_mem_to_heap(&astc_file_data[0], astc_file_data.size(), &comp_size, TDEFL_MAX_PROBES_MASK);
6802
mz_free(pComp_data);
6803
6804
if (global_cfg.m_debug_output)
6805
{
6806
fmt_printf(".ASTC file size (less header): {}, bits/pixel: {}, Deflate bits/pixel: {}\n",
6807
(uint64_t)astc_file_data.size(),
6808
(float)astc_file_data.size() * 8.0f / (float)(width * height),
6809
(float)comp_size * 8.0f / (float)(width * height));
6810
}
6811
}
6812
}
6813
}
6814
}
6815
6816
// Must decode all the blocks (even padded rows/cols) to match what the transcoder does.
6817
imagef unpacked_astc_img(num_blocks_x * 6, num_blocks_y * 6);
6818
imagef unpacked_astc_google_img(num_blocks_x * 6, num_blocks_y * 6);
6819
6820
for (uint32_t y = 0; y < decoded_blocks1.get_height(); y++)
6821
{
6822
for (uint32_t x = 0; x < decoded_blocks1.get_width(); x++)
6823
{
6824
const auto& phys_blk = decoded_blocks1(x, y);
6825
6826
vec4F pixels[MAX_BLOCK_W * MAX_BLOCK_H];
6827
status = unpack_physical_astc_block(&phys_blk, BLOCK_W, BLOCK_H, pixels);
6828
if (!status)
6829
{
6830
fmt_error_printf("unpack_physical_astc_block() failed\n");
6831
return false;
6832
}
6833
6834
unpacked_astc_img.set_block_clipped(pixels, x * BLOCK_W, y * BLOCK_H, BLOCK_W, BLOCK_H);
6835
6836
vec4F pixels_google[MAX_BLOCK_W * MAX_BLOCK_H];
6837
status = unpack_physical_astc_block_google(&phys_blk, BLOCK_W, BLOCK_H, pixels_google);
6838
if (!status)
6839
{
6840
fmt_error_printf("unpack_physical_astc_block_google() failed\n");
6841
return false;
6842
}
6843
6844
unpacked_astc_google_img.set_block_clipped(pixels_google, x * BLOCK_W, y * BLOCK_H, BLOCK_W, BLOCK_H);
6845
6846
for (uint32_t i = 0; i < 36; i++)
6847
{
6848
if (pixels[i] != pixels_google[i])
6849
{
6850
fmt_error_printf("pixel unpack mismatch\n");
6851
return false;
6852
}
6853
}
6854
}
6855
}
6856
6857
if (global_cfg.m_debug_output)
6858
fmt_printf("\nUnpack succeeded\n");
6859
6860
imagef unpacked_bc6h_img;
6861
6862
{
6863
vector2D<basist::bc6h_block> bc6h_blocks;
6864
6865
fast_bc6h_params enc_params;
6866
6867
bool pack_status = pack_bc6h_image(unpacked_astc_img, bc6h_blocks, &unpacked_bc6h_img, enc_params);
6868
if (!pack_status)
6869
{
6870
fmt_error_printf("pack_bc6h_image() failed!");
6871
return false;
6872
}
6873
6874
unpacked_bc6h_img.crop(width, height);
6875
6876
if (global_cfg.m_output_images)
6877
{
6878
write_exr((global_cfg.m_output_image_prefix + "unpacked_bc6h.exr").c_str(), unpacked_bc6h_img, 3, 0);
6879
}
6880
}
6881
6882
unpacked_astc_img.crop(width, height);
6883
unpacked_astc_google_img.crop(width, height);
6884
6885
if (global_cfg.m_output_images)
6886
{
6887
write_exr((global_cfg.m_output_image_prefix + "unpacked_astc.exr").c_str(), unpacked_astc_img, 3, 0);
6888
write_exr((global_cfg.m_output_image_prefix + "unpacked_google_astc.exr").c_str(), unpacked_astc_google_img, 3, 0);
6889
}
6890
6891
// ASTC metrics
6892
if (global_cfg.m_image_stats)
6893
{
6894
image_metrics im;
6895
6896
if (global_cfg.m_debug_output)
6897
printf("\nASTC log2 float error metrics:\n");
6898
6899
for (uint32_t i = 0; i < 3; i++)
6900
{
6901
im.calc(enc_state.src_img, unpacked_astc_img, i, 1, true, true);
6902
6903
if (global_cfg.m_debug_output)
6904
{
6905
printf("%c: ", "RGBA"[i]);
6906
im.print_hp();
6907
}
6908
}
6909
6910
metrics.m_im_astc_log2.calc(enc_state.src_img, unpacked_astc_img, 0, 3, true, true);
6911
6912
if (global_cfg.m_debug_output)
6913
{
6914
printf("RGB: ");
6915
metrics.m_im_astc_log2.print_hp();
6916
6917
printf("\n");
6918
}
6919
}
6920
6921
if (global_cfg.m_image_stats)
6922
{
6923
image_metrics im;
6924
6925
if (global_cfg.m_debug_output)
6926
printf("ASTC half float space error metrics (a piecewise linear approximation of log2 error):\n");
6927
6928
for (uint32_t i = 0; i < 3; i++)
6929
{
6930
im.calc_half(enc_state.src_img, unpacked_astc_img, i, 1, true);
6931
6932
if (global_cfg.m_debug_output)
6933
{
6934
printf("%c: ", "RGBA"[i]);
6935
im.print_hp();
6936
}
6937
}
6938
6939
metrics.m_im_astc_half.calc_half(enc_state.src_img, unpacked_astc_img, 0, 3, true);
6940
6941
if (global_cfg.m_debug_output)
6942
{
6943
printf("RGB: ");
6944
metrics.m_im_astc_half.print_hp();
6945
}
6946
}
6947
6948
// BC6H metrics
6949
if (global_cfg.m_image_stats)
6950
{
6951
image_metrics im;
6952
6953
if (global_cfg.m_debug_output)
6954
printf("\nBC6H log2 float error metrics:\n");
6955
6956
for (uint32_t i = 0; i < 3; i++)
6957
{
6958
im.calc(enc_state.src_img, unpacked_bc6h_img, i, 1, true, true);
6959
6960
if (global_cfg.m_debug_output)
6961
{
6962
printf("%c: ", "RGBA"[i]);
6963
im.print_hp();
6964
}
6965
}
6966
6967
metrics.m_im_bc6h_log2.calc(enc_state.src_img, unpacked_bc6h_img, 0, 3, true, true);
6968
6969
if (global_cfg.m_debug_output)
6970
{
6971
printf("RGB: ");
6972
metrics.m_im_bc6h_log2.print_hp();
6973
6974
printf("\n");
6975
}
6976
}
6977
6978
if (global_cfg.m_image_stats)
6979
{
6980
image_metrics im;
6981
6982
if (global_cfg.m_debug_output)
6983
printf("BC6H half float space error metrics (a piecewise linear approximation of log2 error):\n");
6984
6985
for (uint32_t i = 0; i < 3; i++)
6986
{
6987
im.calc_half(enc_state.src_img, unpacked_bc6h_img, i, 1, true);
6988
6989
if (global_cfg.m_debug_output)
6990
{
6991
printf("%c: ", "RGBA"[i]);
6992
im.print_hp();
6993
}
6994
}
6995
6996
metrics.m_im_bc6h_half.calc_half(enc_state.src_img, unpacked_bc6h_img, 0, 3, true);
6997
6998
if (global_cfg.m_debug_output)
6999
{
7000
printf("RGB: ");
7001
metrics.m_im_bc6h_half.print_hp();
7002
7003
printf("\n");
7004
}
7005
}
7006
7007
intermediate_tex_data.swap(coded_bits.get_bytes());
7008
7009
astc_tex_data.resize(decoded_blocks1.size_in_bytes());
7010
memcpy(astc_tex_data.data(), decoded_blocks1.get_ptr(), decoded_blocks1.size_in_bytes());
7011
7012
return true;
7013
}
7014
7015
} // namespace astc_6x6_hdr
7016
7017