Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/astcenc/astcenc_compress_symbolic.cpp
9896 views
1
// SPDX-License-Identifier: Apache-2.0
2
// ----------------------------------------------------------------------------
3
// Copyright 2011-2025 Arm Limited
4
//
5
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6
// use this file except in compliance with the License. You may obtain a copy
7
// of the License at:
8
//
9
// http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14
// License for the specific language governing permissions and limitations
15
// under the License.
16
// ----------------------------------------------------------------------------
17
18
#if !defined(ASTCENC_DECOMPRESS_ONLY)
19
20
/**
21
* @brief Functions to compress a symbolic block.
22
*/
23
24
#include "astcenc_internal.h"
25
#include "astcenc_diagnostic_trace.h"
26
27
#include <cassert>
28
29
/**
30
* @brief Merge two planes of endpoints into a single vector.
31
*
32
* @param ep_plane1 The endpoints for plane 1.
33
* @param ep_plane2 The endpoints for plane 2.
34
* @param component_plane2 The color component for plane 2.
35
* @param[out] result The merged output.
36
*/
37
static void merge_endpoints(
38
const endpoints& ep_plane1,
39
const endpoints& ep_plane2,
40
unsigned int component_plane2,
41
endpoints& result
42
) {
43
unsigned int partition_count = ep_plane1.partition_count;
44
assert(partition_count == 1);
45
46
vmask4 sep_mask = vint4::lane_id() == vint4(component_plane2);
47
48
result.partition_count = partition_count;
49
result.endpt0[0] = select(ep_plane1.endpt0[0], ep_plane2.endpt0[0], sep_mask);
50
result.endpt1[0] = select(ep_plane1.endpt1[0], ep_plane2.endpt1[0], sep_mask);
51
}
52
53
/**
54
* @brief Attempt to improve weights given a chosen configuration.
55
*
56
* Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
57
* partition and per plane) and attempt to improve image quality by moving each weight up by one or
58
* down by one quantization step.
59
*
60
* This is a specialized function which only supports operating on undecimated weight grids,
61
* therefore primarily improving the performance of 4x4 and 5x5 blocks where grid decimation
62
* is needed less often.
63
*
64
* @param decode_mode The decode mode (LDR, HDR).
65
* @param bsd The block size information.
66
* @param blk The image block color data to compress.
67
* @param[out] scb The symbolic compressed block output.
68
*/
69
static bool realign_weights_undecimated(
70
astcenc_profile decode_mode,
71
const block_size_descriptor& bsd,
72
const image_block& blk,
73
symbolic_compressed_block& scb
74
) {
75
// Get the partition descriptor
76
unsigned int partition_count = scb.partition_count;
77
const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
78
79
// Get the quantization table
80
const block_mode& bm = bsd.get_block_mode(scb.block_mode);
81
unsigned int weight_quant_level = bm.quant_mode;
82
const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
83
84
unsigned int max_plane = bm.is_dual_plane;
85
int plane2_component = scb.plane2_component;
86
vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
87
88
// Decode the color endpoints
89
bool rgb_hdr;
90
bool alpha_hdr;
91
vint4 endpnt0[BLOCK_MAX_PARTITIONS];
92
vint4 endpnt1[BLOCK_MAX_PARTITIONS];
93
vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
94
vfloat4 offset[BLOCK_MAX_PARTITIONS];
95
96
promise(partition_count > 0);
97
98
for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
99
{
100
unpack_color_endpoints(decode_mode,
101
scb.color_formats[pa_idx],
102
scb.color_values[pa_idx],
103
rgb_hdr, alpha_hdr,
104
endpnt0[pa_idx],
105
endpnt1[pa_idx]);
106
}
107
108
uint8_t* dec_weights_uquant = scb.weights;
109
bool adjustments = false;
110
111
// For each plane and partition ...
112
for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
113
{
114
for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
115
{
116
// Compute the endpoint delta for all components in current plane
117
vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
118
epd = select(epd, vint4::zero(), plane_mask);
119
120
endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
121
offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
122
}
123
124
// For each weight compute previous, current, and next errors
125
promise(bsd.texel_count > 0);
126
for (unsigned int texel = 0; texel < bsd.texel_count; texel++)
127
{
128
int uqw = dec_weights_uquant[texel];
129
130
uint32_t prev_and_next = qat.prev_next_values[uqw];
131
int uqw_down = prev_and_next & 0xFF;
132
int uqw_up = (prev_and_next >> 8) & 0xFF;
133
134
// Interpolate the colors to create the diffs
135
float weight_base = static_cast<float>(uqw);
136
float weight_down = static_cast<float>(uqw_down - uqw);
137
float weight_up = static_cast<float>(uqw_up - uqw);
138
139
unsigned int partition = pi.partition_of_texel[texel];
140
vfloat4 color_offset = offset[partition];
141
vfloat4 color_base = endpnt0f[partition];
142
143
vfloat4 color = color_base + color_offset * weight_base;
144
vfloat4 orig_color = blk.texel(texel);
145
vfloat4 error_weight = blk.channel_weight;
146
147
vfloat4 color_diff = color - orig_color;
148
vfloat4 color_diff_down = color_diff + color_offset * weight_down;
149
vfloat4 color_diff_up = color_diff + color_offset * weight_up;
150
151
float error_base = dot_s(color_diff * color_diff, error_weight);
152
float error_down = dot_s(color_diff_down * color_diff_down, error_weight);
153
float error_up = dot_s(color_diff_up * color_diff_up, error_weight);
154
155
// Check if the prev or next error is better, and if so use it
156
if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
157
{
158
dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);
159
adjustments = true;
160
}
161
else if ((error_down < error_base) && (uqw > 0))
162
{
163
dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);
164
adjustments = true;
165
}
166
}
167
168
// Prepare iteration for plane 2
169
dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
170
plane_mask = ~plane_mask;
171
}
172
173
return adjustments;
174
}
175
176
/**
177
* @brief Attempt to improve weights given a chosen configuration.
178
*
179
* Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
180
* partition and per plane) and attempt to improve image quality by moving each weight up by one or
181
* down by one quantization step.
182
*
183
* @param decode_mode The decode mode (LDR, HDR).
184
* @param bsd The block size information.
185
* @param blk The image block color data to compress.
186
* @param[out] scb The symbolic compressed block output.
187
*/
188
static bool realign_weights_decimated(
189
astcenc_profile decode_mode,
190
const block_size_descriptor& bsd,
191
const image_block& blk,
192
symbolic_compressed_block& scb
193
) {
194
// Get the partition descriptor
195
unsigned int partition_count = scb.partition_count;
196
const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
197
198
// Get the quantization table
199
const block_mode& bm = bsd.get_block_mode(scb.block_mode);
200
unsigned int weight_quant_level = bm.quant_mode;
201
const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
202
203
// Get the decimation table
204
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
205
unsigned int weight_count = di.weight_count;
206
assert(weight_count != bsd.texel_count);
207
208
unsigned int max_plane = bm.is_dual_plane;
209
int plane2_component = scb.plane2_component;
210
vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
211
212
// Decode the color endpoints
213
bool rgb_hdr;
214
bool alpha_hdr;
215
vint4 endpnt0[BLOCK_MAX_PARTITIONS];
216
vint4 endpnt1[BLOCK_MAX_PARTITIONS];
217
vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
218
vfloat4 offset[BLOCK_MAX_PARTITIONS];
219
220
promise(partition_count > 0);
221
promise(weight_count > 0);
222
223
for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
224
{
225
unpack_color_endpoints(decode_mode,
226
scb.color_formats[pa_idx],
227
scb.color_values[pa_idx],
228
rgb_hdr, alpha_hdr,
229
endpnt0[pa_idx],
230
endpnt1[pa_idx]);
231
}
232
233
uint8_t* dec_weights_uquant = scb.weights;
234
bool adjustments = false;
235
236
// For each plane and partition ...
237
for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
238
{
239
for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
240
{
241
// Compute the endpoint delta for all components in current plane
242
vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
243
epd = select(epd, vint4::zero(), plane_mask);
244
245
endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
246
offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
247
}
248
249
// Create an unquantized weight grid for this decimation level
250
ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS];
251
for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
252
{
253
vint unquant_value(dec_weights_uquant + we_idx);
254
vfloat unquant_valuef = int_to_float(unquant_value);
255
storea(unquant_valuef, uq_weightsf + we_idx);
256
}
257
258
// For each weight compute previous, current, and next errors
259
for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++)
260
{
261
int uqw = dec_weights_uquant[we_idx];
262
uint32_t prev_and_next = qat.prev_next_values[uqw];
263
264
float uqw_base = uq_weightsf[we_idx];
265
float uqw_down = static_cast<float>(prev_and_next & 0xFF);
266
float uqw_up = static_cast<float>((prev_and_next >> 8) & 0xFF);
267
268
float uqw_diff_down = uqw_down - uqw_base;
269
float uqw_diff_up = uqw_up - uqw_base;
270
271
vfloat4 error_basev = vfloat4::zero();
272
vfloat4 error_downv = vfloat4::zero();
273
vfloat4 error_upv = vfloat4::zero();
274
275
// Interpolate the colors to create the diffs
276
unsigned int texels_to_evaluate = di.weight_texel_count[we_idx];
277
promise(texels_to_evaluate > 0);
278
for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++)
279
{
280
unsigned int texel = di.weight_texels_tr[te_idx][we_idx];
281
282
float tw_base = di.texel_contrib_for_weight[te_idx][we_idx];
283
284
float weight_base = (uq_weightsf[di.texel_weights_tr[0][texel]] * di.texel_weight_contribs_float_tr[0][texel]
285
+ uq_weightsf[di.texel_weights_tr[1][texel]] * di.texel_weight_contribs_float_tr[1][texel])
286
+ (uq_weightsf[di.texel_weights_tr[2][texel]] * di.texel_weight_contribs_float_tr[2][texel]
287
+ uq_weightsf[di.texel_weights_tr[3][texel]] * di.texel_weight_contribs_float_tr[3][texel]);
288
289
// Ideally this is integer rounded, but IQ gain it isn't worth the overhead
290
// float weight = astc::flt_rd(weight_base + 0.5f);
291
// float weight_down = astc::flt_rd(weight_base + 0.5f + uqw_diff_down * tw_base) - weight;
292
// float weight_up = astc::flt_rd(weight_base + 0.5f + uqw_diff_up * tw_base) - weight;
293
float weight_down = weight_base + uqw_diff_down * tw_base - weight_base;
294
float weight_up = weight_base + uqw_diff_up * tw_base - weight_base;
295
296
unsigned int partition = pi.partition_of_texel[texel];
297
vfloat4 color_offset = offset[partition];
298
vfloat4 color_base = endpnt0f[partition];
299
300
vfloat4 color = color_base + color_offset * weight_base;
301
vfloat4 orig_color = blk.texel(texel);
302
303
vfloat4 color_diff = color - orig_color;
304
vfloat4 color_down_diff = color_diff + color_offset * weight_down;
305
vfloat4 color_up_diff = color_diff + color_offset * weight_up;
306
307
error_basev += color_diff * color_diff;
308
error_downv += color_down_diff * color_down_diff;
309
error_upv += color_up_diff * color_up_diff;
310
}
311
312
vfloat4 error_weight = blk.channel_weight;
313
float error_base = hadd_s(error_basev * error_weight);
314
float error_down = hadd_s(error_downv * error_weight);
315
float error_up = hadd_s(error_upv * error_weight);
316
317
// Check if the prev or next error is better, and if so use it
318
if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
319
{
320
uq_weightsf[we_idx] = uqw_up;
321
dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_up);
322
adjustments = true;
323
}
324
else if ((error_down < error_base) && (uqw > 0))
325
{
326
uq_weightsf[we_idx] = uqw_down;
327
dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_down);
328
adjustments = true;
329
}
330
}
331
332
// Prepare iteration for plane 2
333
dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
334
plane_mask = ~plane_mask;
335
}
336
337
return adjustments;
338
}
339
340
/**
341
* @brief Compress a block using a chosen partitioning and 1 plane of weights.
342
*
343
* @param config The compressor configuration.
344
* @param bsd The block size information.
345
* @param blk The image block color data to compress.
346
* @param only_always True if we only use "always" percentile block modes.
347
* @param tune_errorval_threshold The error value threshold.
348
* @param partition_count The partition count.
349
* @param partition_index The partition index if @c partition_count is 2-4.
350
* @param[out] scb The symbolic compressed block output.
351
* @param[out] tmpbuf The quantized weights for plane 1.
352
*/
353
static float compress_symbolic_block_for_partition_1plane(
354
const astcenc_config& config,
355
const block_size_descriptor& bsd,
356
const image_block& blk,
357
bool only_always,
358
float tune_errorval_threshold,
359
unsigned int partition_count,
360
unsigned int partition_index,
361
symbolic_compressed_block& scb,
362
compression_working_buffers& tmpbuf,
363
int quant_limit
364
) {
365
promise(partition_count > 0);
366
promise(config.tune_candidate_limit > 0);
367
promise(config.tune_refinement_limit > 0);
368
369
int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
370
371
auto compute_difference = &compute_symbolic_block_difference_1plane;
372
if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM))
373
{
374
compute_difference = &compute_symbolic_block_difference_1plane_1partition;
375
}
376
377
const auto& pi = bsd.get_partition_info(partition_count, partition_index);
378
379
// Compute ideal weights and endpoint colors, with no quantization or decimation
380
endpoints_and_weights& ei = tmpbuf.ei1;
381
compute_ideal_colors_and_weights_1plane(blk, pi, ei);
382
383
// Compute ideal weights and endpoint colors for every decimation
384
float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
385
uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
386
387
// For each decimation mode, compute an ideal set of weights with no quantization
388
unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
389
: bsd.decimation_mode_count_selected;
390
promise(max_decimation_modes > 0);
391
for (unsigned int i = 0; i < max_decimation_modes; i++)
392
{
393
const auto& dm = bsd.get_decimation_mode(i);
394
if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
395
{
396
continue;
397
}
398
399
const auto& di = bsd.get_decimation_info(i);
400
401
compute_ideal_weights_for_decimation(
402
ei,
403
di,
404
dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
405
}
406
407
// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
408
// weight pair, compute the smallest weight that will result in a color value greater than 1
409
vfloat4 min_ep(10.0f);
410
for (unsigned int i = 0; i < partition_count; i++)
411
{
412
vfloat4 ep = (vfloat4(1.0f) - ei.ep.endpt0[i]) / (ei.ep.endpt1[i] - ei.ep.endpt0[i]);
413
414
vmask4 use_ep = (ep > vfloat4(0.5f)) & (ep < min_ep);
415
min_ep = select(min_ep, ep, use_ep);
416
}
417
418
float min_wt_cutoff = hmin_s(min_ep);
419
420
// For each mode, use the angular method to compute a shift
421
compute_angular_endpoints_1plane(
422
only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
423
424
float* weight_low_value = tmpbuf.weight_low_value1;
425
float* weight_high_value = tmpbuf.weight_high_value1;
426
int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
427
float* qwt_errors = tmpbuf.qwt_errors;
428
429
// For each mode (which specifies a decimation and a quantization):
430
// * Compute number of bits needed for the quantized weights
431
// * Generate an optimized set of quantized weights
432
// * Compute quantization errors for the mode
433
434
435
static const int8_t free_bits_for_partition_count[4] {
436
115 - 4, 111 - 4 - PARTITION_INDEX_BITS, 108 - 4 - PARTITION_INDEX_BITS, 105 - 4 - PARTITION_INDEX_BITS
437
};
438
439
unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
440
: bsd.block_mode_count_1plane_selected;
441
promise(max_block_modes > 0);
442
for (unsigned int i = 0; i < max_block_modes; i++)
443
{
444
const block_mode& bm = bsd.block_modes[i];
445
446
if (bm.quant_mode > max_weight_quant)
447
{
448
qwt_errors[i] = 1e38f;
449
continue;
450
}
451
452
assert(!bm.is_dual_plane);
453
int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits;
454
if (bitcount <= 0)
455
{
456
qwt_errors[i] = 1e38f;
457
continue;
458
}
459
460
if (weight_high_value[i] > 1.02f * min_wt_cutoff)
461
{
462
weight_high_value[i] = 1.0f;
463
}
464
465
int decimation_mode = bm.decimation_mode;
466
const auto& di = bsd.get_decimation_info(decimation_mode);
467
468
qwt_bitcounts[i] = static_cast<int8_t>(bitcount);
469
470
ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
471
472
// Generate the optimized set of weights for the weight mode
473
compute_quantized_weights_for_decimation(
474
di,
475
weight_low_value[i], weight_high_value[i],
476
dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
477
dec_weights_uquantf,
478
dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
479
bm.get_weight_quant_mode());
480
481
// Compute weight quantization errors for the block mode
482
qwt_errors[i] = compute_error_of_weight_set_1plane(
483
ei,
484
di,
485
dec_weights_uquantf);
486
}
487
488
// Decide the optimal combination of color endpoint encodings and weight encodings
489
uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
490
int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
491
492
quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
493
quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
494
495
unsigned int candidate_count = compute_ideal_endpoint_formats(
496
pi, blk, ei.ep, qwt_bitcounts, qwt_errors,
497
config.tune_candidate_limit, 0, max_block_modes,
498
partition_format_specifiers, block_mode_index,
499
color_quant_level, color_quant_level_mod, tmpbuf);
500
501
// Iterate over the N believed-to-be-best modes to find out which one is actually best
502
float best_errorval_in_mode = ERROR_CALC_DEFAULT;
503
float best_errorval_in_scb = scb.errorval;
504
505
for (unsigned int i = 0; i < candidate_count; i++)
506
{
507
TRACE_NODE(node0, "candidate");
508
509
const int bm_packed_index = block_mode_index[i];
510
assert(bm_packed_index >= 0 && bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_selected));
511
const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
512
513
int decimation_mode = qw_bm.decimation_mode;
514
const auto& di = bsd.get_decimation_info(decimation_mode);
515
promise(di.weight_count > 0);
516
517
trace_add_data("weight_x", di.weight_x);
518
trace_add_data("weight_y", di.weight_y);
519
trace_add_data("weight_z", di.weight_z);
520
trace_add_data("weight_quant", qw_bm.quant_mode);
521
522
// Recompute the ideal color endpoints before storing them
523
vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS];
524
vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS];
525
526
symbolic_compressed_block workscb;
527
endpoints workep = ei.ep;
528
529
uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
530
531
for (unsigned int j = 0; j < di.weight_count; j++)
532
{
533
workscb.weights[j] = u8_weight_src[j];
534
}
535
536
for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
537
{
538
recompute_ideal_colors_1plane(
539
blk, pi, di, workscb.weights,
540
workep, rgbs_colors, rgbo_colors);
541
542
// Quantize the chosen color, tracking if worth trying the mod value
543
bool all_same = color_quant_level[i] != color_quant_level_mod[i];
544
for (unsigned int j = 0; j < partition_count; j++)
545
{
546
workscb.color_formats[j] = pack_color_endpoints(
547
workep.endpt0[j],
548
workep.endpt1[j],
549
rgbs_colors[j],
550
rgbo_colors[j],
551
partition_format_specifiers[i][j],
552
workscb.color_values[j],
553
color_quant_level[i]);
554
555
all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0];
556
}
557
558
// If all the color endpoint modes are the same, we get a few more bits to store colors;
559
// let's see if we can take advantage of this: requantize all the colors and see if the
560
// endpoint modes remain the same.
561
workscb.color_formats_matched = 0;
562
if (partition_count >= 2 && all_same)
563
{
564
uint8_t colorvals[BLOCK_MAX_PARTITIONS][8];
565
uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 };
566
bool all_same_mod = true;
567
for (unsigned int j = 0; j < partition_count; j++)
568
{
569
color_formats_mod[j] = pack_color_endpoints(
570
workep.endpt0[j],
571
workep.endpt1[j],
572
rgbs_colors[j],
573
rgbo_colors[j],
574
partition_format_specifiers[i][j],
575
colorvals[j],
576
color_quant_level_mod[i]);
577
578
// Early out as soon as it's no longer possible to use mod
579
if (color_formats_mod[j] != color_formats_mod[0])
580
{
581
all_same_mod = false;
582
break;
583
}
584
}
585
586
if (all_same_mod)
587
{
588
workscb.color_formats_matched = 1;
589
for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++)
590
{
591
for (unsigned int k = 0; k < 8; k++)
592
{
593
workscb.color_values[j][k] = colorvals[j][k];
594
}
595
596
workscb.color_formats[j] = color_formats_mod[j];
597
}
598
}
599
}
600
601
// Store header fields
602
workscb.partition_count = static_cast<uint8_t>(partition_count);
603
workscb.partition_index = static_cast<uint16_t>(partition_index);
604
workscb.plane2_component = -1;
605
workscb.quant_mode = workscb.color_formats_matched ? color_quant_level_mod[i] : color_quant_level[i];
606
workscb.block_mode = qw_bm.mode_index;
607
workscb.block_type = SYM_BTYPE_NONCONST;
608
609
// Pre-realign test
610
if (l == 0)
611
{
612
float errorval = compute_difference(config, bsd, workscb, blk);
613
if (errorval == -ERROR_CALC_DEFAULT)
614
{
615
errorval = -errorval;
616
workscb.block_type = SYM_BTYPE_ERROR;
617
}
618
619
trace_add_data("error_prerealign", errorval);
620
best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
621
622
// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
623
// iteration can help more so we give it a extra 8% leeway. Use this knowledge to
624
// drive a heuristic to skip blocks that are unlikely to catch up with the best
625
// block we have already.
626
unsigned int iters_remaining = config.tune_refinement_limit - l;
627
float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
628
if (errorval > (threshold * best_errorval_in_scb))
629
{
630
break;
631
}
632
633
if (errorval < best_errorval_in_scb)
634
{
635
best_errorval_in_scb = errorval;
636
workscb.errorval = errorval;
637
scb = workscb;
638
639
if (errorval < tune_errorval_threshold)
640
{
641
// Skip remaining candidates - this is "good enough"
642
i = candidate_count;
643
break;
644
}
645
}
646
}
647
648
bool adjustments;
649
if (di.weight_count != bsd.texel_count)
650
{
651
adjustments = realign_weights_decimated(
652
config.profile, bsd, blk, workscb);
653
}
654
else
655
{
656
adjustments = realign_weights_undecimated(
657
config.profile, bsd, blk, workscb);
658
}
659
660
// Post-realign test
661
float errorval = compute_difference(config, bsd, workscb, blk);
662
if (errorval == -ERROR_CALC_DEFAULT)
663
{
664
errorval = -errorval;
665
workscb.block_type = SYM_BTYPE_ERROR;
666
}
667
668
trace_add_data("error_postrealign", errorval);
669
best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
670
671
// Average refinement improvement is 3.5% per iteration, so skip blocks that are
672
// unlikely to catch up with the best block we have already. Assume a 4.5% per step to
673
// give benefit of the doubt ...
674
unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
675
float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
676
if (errorval > (threshold * best_errorval_in_scb))
677
{
678
break;
679
}
680
681
if (errorval < best_errorval_in_scb)
682
{
683
best_errorval_in_scb = errorval;
684
workscb.errorval = errorval;
685
scb = workscb;
686
687
if (errorval < tune_errorval_threshold)
688
{
689
// Skip remaining candidates - this is "good enough"
690
i = candidate_count;
691
break;
692
}
693
}
694
695
if (!adjustments)
696
{
697
break;
698
}
699
}
700
}
701
702
return best_errorval_in_mode;
703
}
704
705
/**
706
* @brief Compress a block using a chosen partitioning and 2 planes of weights.
707
*
708
* @param config The compressor configuration.
709
* @param bsd The block size information.
710
* @param blk The image block color data to compress.
711
* @param tune_errorval_threshold The error value threshold.
712
* @param plane2_component The component index for the second plane of weights.
713
* @param[out] scb The symbolic compressed block output.
714
* @param[out] tmpbuf The quantized weights for plane 1.
715
*/
716
static float compress_symbolic_block_for_partition_2planes(
717
const astcenc_config& config,
718
const block_size_descriptor& bsd,
719
const image_block& blk,
720
float tune_errorval_threshold,
721
unsigned int plane2_component,
722
symbolic_compressed_block& scb,
723
compression_working_buffers& tmpbuf,
724
int quant_limit
725
) {
726
promise(config.tune_candidate_limit > 0);
727
promise(config.tune_refinement_limit > 0);
728
promise(bsd.decimation_mode_count_selected > 0);
729
730
int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
731
732
// Compute ideal weights and endpoint colors, with no quantization or decimation
733
endpoints_and_weights& ei1 = tmpbuf.ei1;
734
endpoints_and_weights& ei2 = tmpbuf.ei2;
735
736
compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2);
737
738
// Compute ideal weights and endpoint colors for every decimation
739
float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
740
uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
741
742
// For each decimation mode, compute an ideal set of weights with no quantization
743
for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
744
{
745
const auto& dm = bsd.get_decimation_mode(i);
746
if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
747
{
748
continue;
749
}
750
751
const auto& di = bsd.get_decimation_info(i);
752
753
compute_ideal_weights_for_decimation(
754
ei1,
755
di,
756
dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
757
758
compute_ideal_weights_for_decimation(
759
ei2,
760
di,
761
dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET);
762
}
763
764
// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
765
// weight pair, compute the smallest weight that will result in a color value greater than 1
766
vfloat4 min_ep1(10.0f);
767
vfloat4 min_ep2(10.0f);
768
769
vfloat4 ep1 = (vfloat4(1.0f) - ei1.ep.endpt0[0]) / (ei1.ep.endpt1[0] - ei1.ep.endpt0[0]);
770
vmask4 use_ep1 = (ep1 > vfloat4(0.5f)) & (ep1 < min_ep1);
771
min_ep1 = select(min_ep1, ep1, use_ep1);
772
773
vfloat4 ep2 = (vfloat4(1.0f) - ei2.ep.endpt0[0]) / (ei2.ep.endpt1[0] - ei2.ep.endpt0[0]);
774
vmask4 use_ep2 = (ep2 > vfloat4(0.5f)) & (ep2 < min_ep2);
775
min_ep2 = select(min_ep2, ep2, use_ep2);
776
777
vfloat4 err_max(ERROR_CALC_DEFAULT);
778
vmask4 err_mask = vint4::lane_id() == vint4(plane2_component);
779
780
// Set the plane2 component to max error in ep1
781
min_ep1 = select(min_ep1, err_max, err_mask);
782
783
float min_wt_cutoff1 = hmin_s(min_ep1);
784
785
// Set the minwt2 to the plane2 component min in ep2
786
float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));
787
788
compute_angular_endpoints_2planes(
789
bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
790
791
// For each mode (which specifies a decimation and a quantization):
792
// * Compute number of bits needed for the quantized weights
793
// * Generate an optimized set of quantized weights
794
// * Compute quantization errors for the mode
795
796
float* weight_low_value1 = tmpbuf.weight_low_value1;
797
float* weight_high_value1 = tmpbuf.weight_high_value1;
798
float* weight_low_value2 = tmpbuf.weight_low_value2;
799
float* weight_high_value2 = tmpbuf.weight_high_value2;
800
801
int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
802
float* qwt_errors = tmpbuf.qwt_errors;
803
804
unsigned int start_2plane = bsd.block_mode_count_1plane_selected;
805
unsigned int end_2plane = bsd.block_mode_count_1plane_2plane_selected;
806
807
for (unsigned int i = start_2plane; i < end_2plane; i++)
808
{
809
const block_mode& bm = bsd.block_modes[i];
810
assert(bm.is_dual_plane);
811
812
if (bm.quant_mode > max_weight_quant)
813
{
814
qwt_errors[i] = 1e38f;
815
continue;
816
}
817
818
qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits);
819
820
if (weight_high_value1[i] > 1.02f * min_wt_cutoff1)
821
{
822
weight_high_value1[i] = 1.0f;
823
}
824
825
if (weight_high_value2[i] > 1.02f * min_wt_cutoff2)
826
{
827
weight_high_value2[i] = 1.0f;
828
}
829
830
unsigned int decimation_mode = bm.decimation_mode;
831
const auto& di = bsd.get_decimation_info(decimation_mode);
832
833
ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
834
835
// Generate the optimized set of weights for the mode
836
compute_quantized_weights_for_decimation(
837
di,
838
weight_low_value1[i],
839
weight_high_value1[i],
840
dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
841
dec_weights_uquantf,
842
dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
843
bm.get_weight_quant_mode());
844
845
compute_quantized_weights_for_decimation(
846
di,
847
weight_low_value2[i],
848
weight_high_value2[i],
849
dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET,
850
dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET,
851
dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,
852
bm.get_weight_quant_mode());
853
854
// Compute weight quantization errors for the block mode
855
qwt_errors[i] = compute_error_of_weight_set_2planes(
856
ei1,
857
ei2,
858
di,
859
dec_weights_uquantf,
860
dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET);
861
}
862
863
// Decide the optimal combination of color endpoint encodings and weight encodings
864
uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
865
int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
866
867
quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
868
quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
869
870
endpoints epm;
871
merge_endpoints(ei1.ep, ei2.ep, plane2_component, epm);
872
873
const auto& pi = bsd.get_partition_info(1, 0);
874
unsigned int candidate_count = compute_ideal_endpoint_formats(
875
pi, blk, epm, qwt_bitcounts, qwt_errors,
876
config.tune_candidate_limit,
877
bsd.block_mode_count_1plane_selected, bsd.block_mode_count_1plane_2plane_selected,
878
partition_format_specifiers, block_mode_index,
879
color_quant_level, color_quant_level_mod, tmpbuf);
880
881
// Iterate over the N believed-to-be-best modes to find out which one is actually best
882
float best_errorval_in_mode = ERROR_CALC_DEFAULT;
883
float best_errorval_in_scb = scb.errorval;
884
885
for (unsigned int i = 0; i < candidate_count; i++)
886
{
887
TRACE_NODE(node0, "candidate");
888
889
const int bm_packed_index = block_mode_index[i];
890
assert(bm_packed_index >= static_cast<int>(bsd.block_mode_count_1plane_selected) &&
891
bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_2plane_selected));
892
const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
893
894
int decimation_mode = qw_bm.decimation_mode;
895
const auto& di = bsd.get_decimation_info(decimation_mode);
896
promise(di.weight_count > 0);
897
898
trace_add_data("weight_x", di.weight_x);
899
trace_add_data("weight_y", di.weight_y);
900
trace_add_data("weight_z", di.weight_z);
901
trace_add_data("weight_quant", qw_bm.quant_mode);
902
903
vfloat4 rgbs_color;
904
vfloat4 rgbo_color;
905
906
symbolic_compressed_block workscb;
907
endpoints workep = epm;
908
909
uint8_t* u8_weight1_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
910
uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET;
911
912
for (int j = 0; j < di.weight_count; j++)
913
{
914
workscb.weights[j] = u8_weight1_src[j];
915
workscb.weights[j + WEIGHTS_PLANE2_OFFSET] = u8_weight2_src[j];
916
}
917
918
for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
919
{
920
recompute_ideal_colors_2planes(
921
blk, bsd, di,
922
workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,
923
workep, rgbs_color, rgbo_color, plane2_component);
924
925
// Quantize the chosen color
926
workscb.color_formats[0] = pack_color_endpoints(
927
workep.endpt0[0],
928
workep.endpt1[0],
929
rgbs_color, rgbo_color,
930
partition_format_specifiers[i][0],
931
workscb.color_values[0],
932
color_quant_level[i]);
933
934
// Store header fields
935
workscb.partition_count = 1;
936
workscb.partition_index = 0;
937
workscb.quant_mode = color_quant_level[i];
938
workscb.color_formats_matched = 0;
939
workscb.block_mode = qw_bm.mode_index;
940
workscb.plane2_component = static_cast<int8_t>(plane2_component);
941
workscb.block_type = SYM_BTYPE_NONCONST;
942
943
// Pre-realign test
944
if (l == 0)
945
{
946
float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
947
if (errorval == -ERROR_CALC_DEFAULT)
948
{
949
errorval = -errorval;
950
workscb.block_type = SYM_BTYPE_ERROR;
951
}
952
953
trace_add_data("error_prerealign", errorval);
954
best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
955
956
// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
957
// iteration can help more so we give it an extra 8% leeway. Use this knowledge to
958
// drive a heuristic to skip blocks that are unlikely to catch up with the best
959
// block we have already.
960
unsigned int iters_remaining = config.tune_refinement_limit - l;
961
float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
962
if (errorval > (threshold * best_errorval_in_scb))
963
{
964
break;
965
}
966
967
if (errorval < best_errorval_in_scb)
968
{
969
best_errorval_in_scb = errorval;
970
workscb.errorval = errorval;
971
scb = workscb;
972
973
if (errorval < tune_errorval_threshold)
974
{
975
// Skip remaining candidates - this is "good enough"
976
i = candidate_count;
977
break;
978
}
979
}
980
}
981
982
// Perform a final pass over the weights to try to improve them.
983
bool adjustments;
984
if (di.weight_count != bsd.texel_count)
985
{
986
adjustments = realign_weights_decimated(
987
config.profile, bsd, blk, workscb);
988
}
989
else
990
{
991
adjustments = realign_weights_undecimated(
992
config.profile, bsd, blk, workscb);
993
}
994
995
// Post-realign test
996
float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
997
if (errorval == -ERROR_CALC_DEFAULT)
998
{
999
errorval = -errorval;
1000
workscb.block_type = SYM_BTYPE_ERROR;
1001
}
1002
1003
trace_add_data("error_postrealign", errorval);
1004
best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
1005
1006
// Average refinement improvement is 3.5% per iteration, so skip blocks that are
1007
// unlikely to catch up with the best block we have already. Assume a 4.5% per step to
1008
// give benefit of the doubt ...
1009
unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
1010
float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
1011
if (errorval > (threshold * best_errorval_in_scb))
1012
{
1013
break;
1014
}
1015
1016
if (errorval < best_errorval_in_scb)
1017
{
1018
best_errorval_in_scb = errorval;
1019
workscb.errorval = errorval;
1020
scb = workscb;
1021
1022
if (errorval < tune_errorval_threshold)
1023
{
1024
// Skip remaining candidates - this is "good enough"
1025
i = candidate_count;
1026
break;
1027
}
1028
}
1029
1030
if (!adjustments)
1031
{
1032
break;
1033
}
1034
}
1035
}
1036
1037
return best_errorval_in_mode;
1038
}
1039
1040
/**
1041
* @brief Determine the lowest cross-channel correlation factor.
1042
*
1043
* @param texels_per_block The number of texels in a block.
1044
* @param blk The image block color data to compress.
1045
*
1046
* @return Return the lowest correlation factor.
1047
*/
1048
static float prepare_block_statistics(
1049
int texels_per_block,
1050
const image_block& blk
1051
) {
1052
// Compute covariance matrix, as a collection of 10 scalars that form the upper-triangular row
1053
// of the matrix. The matrix is symmetric, so this is all we need for this use case.
1054
float rs = 0.0f;
1055
float gs = 0.0f;
1056
float bs = 0.0f;
1057
float as = 0.0f;
1058
float rr_var = 0.0f;
1059
float gg_var = 0.0f;
1060
float bb_var = 0.0f;
1061
float aa_var = 0.0f;
1062
float rg_cov = 0.0f;
1063
float rb_cov = 0.0f;
1064
float ra_cov = 0.0f;
1065
float gb_cov = 0.0f;
1066
float ga_cov = 0.0f;
1067
float ba_cov = 0.0f;
1068
1069
float weight_sum = 0.0f;
1070
1071
promise(texels_per_block > 0);
1072
for (int i = 0; i < texels_per_block; i++)
1073
{
1074
float weight = hadd_s(blk.channel_weight) / 4.0f;
1075
assert(weight >= 0.0f);
1076
weight_sum += weight;
1077
1078
float r = blk.data_r[i];
1079
float g = blk.data_g[i];
1080
float b = blk.data_b[i];
1081
float a = blk.data_a[i];
1082
1083
float rw = r * weight;
1084
rs += rw;
1085
rr_var += r * rw;
1086
rg_cov += g * rw;
1087
rb_cov += b * rw;
1088
ra_cov += a * rw;
1089
1090
float gw = g * weight;
1091
gs += gw;
1092
gg_var += g * gw;
1093
gb_cov += b * gw;
1094
ga_cov += a * gw;
1095
1096
float bw = b * weight;
1097
bs += bw;
1098
bb_var += b * bw;
1099
ba_cov += a * bw;
1100
1101
float aw = a * weight;
1102
as += aw;
1103
aa_var += a * aw;
1104
}
1105
1106
float rpt = 1.0f / astc::max(weight_sum, 1e-7f);
1107
1108
rr_var -= rs * (rs * rpt);
1109
rg_cov -= gs * (rs * rpt);
1110
rb_cov -= bs * (rs * rpt);
1111
ra_cov -= as * (rs * rpt);
1112
1113
gg_var -= gs * (gs * rpt);
1114
gb_cov -= bs * (gs * rpt);
1115
ga_cov -= as * (gs * rpt);
1116
1117
bb_var -= bs * (bs * rpt);
1118
ba_cov -= as * (bs * rpt);
1119
1120
aa_var -= as * (as * rpt);
1121
1122
// These will give a NaN if a channel is constant - these are fixed up in the next step
1123
rg_cov *= astc::rsqrt(rr_var * gg_var);
1124
rb_cov *= astc::rsqrt(rr_var * bb_var);
1125
ra_cov *= astc::rsqrt(rr_var * aa_var);
1126
gb_cov *= astc::rsqrt(gg_var * bb_var);
1127
ga_cov *= astc::rsqrt(gg_var * aa_var);
1128
ba_cov *= astc::rsqrt(bb_var * aa_var);
1129
1130
if (astc::isnan(rg_cov)) rg_cov = 1.0f;
1131
if (astc::isnan(rb_cov)) rb_cov = 1.0f;
1132
if (astc::isnan(ra_cov)) ra_cov = 1.0f;
1133
if (astc::isnan(gb_cov)) gb_cov = 1.0f;
1134
if (astc::isnan(ga_cov)) ga_cov = 1.0f;
1135
if (astc::isnan(ba_cov)) ba_cov = 1.0f;
1136
1137
float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov));
1138
lowest_correlation = astc::min(lowest_correlation, fabsf(ra_cov));
1139
lowest_correlation = astc::min(lowest_correlation, fabsf(gb_cov));
1140
lowest_correlation = astc::min(lowest_correlation, fabsf(ga_cov));
1141
lowest_correlation = astc::min(lowest_correlation, fabsf(ba_cov));
1142
1143
// Diagnostic trace points
1144
trace_add_data("min_r", blk.data_min.lane<0>());
1145
trace_add_data("max_r", blk.data_max.lane<0>());
1146
trace_add_data("min_g", blk.data_min.lane<1>());
1147
trace_add_data("max_g", blk.data_max.lane<1>());
1148
trace_add_data("min_b", blk.data_min.lane<2>());
1149
trace_add_data("max_b", blk.data_max.lane<2>());
1150
trace_add_data("min_a", blk.data_min.lane<3>());
1151
trace_add_data("max_a", blk.data_max.lane<3>());
1152
trace_add_data("cov_rg", fabsf(rg_cov));
1153
trace_add_data("cov_rb", fabsf(rb_cov));
1154
trace_add_data("cov_ra", fabsf(ra_cov));
1155
trace_add_data("cov_gb", fabsf(gb_cov));
1156
trace_add_data("cov_ga", fabsf(ga_cov));
1157
trace_add_data("cov_ba", fabsf(ba_cov));
1158
1159
return lowest_correlation;
1160
}
1161
1162
/* See header for documentation. */
1163
void compress_block(
1164
const astcenc_contexti& ctx,
1165
const image_block& blk,
1166
uint8_t pcb[16],
1167
compression_working_buffers& tmpbuf)
1168
{
1169
astcenc_profile decode_mode = ctx.config.profile;
1170
symbolic_compressed_block scb;
1171
const block_size_descriptor& bsd = *ctx.bsd;
1172
float lowest_correl;
1173
1174
TRACE_NODE(node0, "block");
1175
trace_add_data("pos_x", blk.xpos);
1176
trace_add_data("pos_y", blk.ypos);
1177
trace_add_data("pos_z", blk.zpos);
1178
1179
// Set stricter block targets for luminance data as we have more bits to play with
1180
bool block_is_l = blk.is_luminance();
1181
float block_is_l_scale = block_is_l ? 1.0f / 1.5f : 1.0f;
1182
1183
// Set slightly stricter block targets for lumalpha data as we have more bits to play with
1184
bool block_is_la = blk.is_luminancealpha();
1185
float block_is_la_scale = block_is_la ? 1.0f / 1.05f : 1.0f;
1186
1187
bool block_skip_two_plane = false;
1188
int max_partitions = ctx.config.tune_partition_count_limit;
1189
1190
unsigned int requested_partition_indices[3] {
1191
ctx.config.tune_2partition_index_limit,
1192
ctx.config.tune_3partition_index_limit,
1193
ctx.config.tune_4partition_index_limit
1194
};
1195
1196
unsigned int requested_partition_trials[3] {
1197
ctx.config.tune_2partitioning_candidate_limit,
1198
ctx.config.tune_3partitioning_candidate_limit,
1199
ctx.config.tune_4partitioning_candidate_limit
1200
};
1201
1202
#if defined(ASTCENC_DIAGNOSTICS)
1203
// Do this early in diagnostic builds so we can dump uniform metrics
1204
// for every block. Do it later in release builds to avoid redundant work!
1205
float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
1206
float error_threshold = ctx.config.tune_db_limit
1207
* error_weight_sum
1208
* block_is_l_scale
1209
* block_is_la_scale;
1210
1211
lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
1212
trace_add_data("lowest_correl", lowest_correl);
1213
trace_add_data("tune_error_threshold", error_threshold);
1214
#endif
1215
1216
// Detected a constant-color block
1217
if (all(blk.data_min == blk.data_max))
1218
{
1219
TRACE_NODE(node1, "pass");
1220
trace_add_data("partition_count", 0);
1221
trace_add_data("plane_count", 1);
1222
1223
scb.partition_count = 0;
1224
1225
// Encode as FP16 if using HDR
1226
if ((decode_mode == ASTCENC_PRF_HDR) ||
1227
(decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A))
1228
{
1229
scb.block_type = SYM_BTYPE_CONST_F16;
1230
vint4 color_f16 = float_to_float16(blk.origin_texel);
1231
store(color_f16, scb.constant_color);
1232
}
1233
// Encode as UNORM16 if NOT using HDR
1234
else
1235
{
1236
scb.block_type = SYM_BTYPE_CONST_U16;
1237
vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
1238
vint4 color_u16 = float_to_int_rtn(color_f32);
1239
store(color_u16, scb.constant_color);
1240
}
1241
1242
trace_add_data("exit", "quality hit");
1243
1244
symbolic_to_physical(bsd, scb, pcb);
1245
return;
1246
}
1247
1248
#if !defined(ASTCENC_DIAGNOSTICS)
1249
float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
1250
float error_threshold = ctx.config.tune_db_limit
1251
* error_weight_sum
1252
* block_is_l_scale
1253
* block_is_la_scale;
1254
#endif
1255
1256
// Set SCB and mode errors to a very high error value
1257
scb.errorval = ERROR_CALC_DEFAULT;
1258
scb.block_type = SYM_BTYPE_ERROR;
1259
1260
float best_errorvals_for_pcount[BLOCK_MAX_PARTITIONS] {
1261
ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT
1262
};
1263
1264
float exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] {
1265
0.0f,
1266
ctx.config.tune_2partition_early_out_limit_factor,
1267
ctx.config.tune_3partition_early_out_limit_factor,
1268
0.0f
1269
};
1270
1271
// Trial using 1 plane of weights and 1 partition.
1272
1273
// Most of the time we test it twice, first with a mode cutoff of 0 and then with the specified
1274
// mode cutoff. This causes an early-out that speeds up encoding of easy blocks. However, this
1275
// optimization is disabled for 4x4 and 5x4 blocks where it nearly always slows down the
1276
// compression and slightly reduces image quality.
1277
1278
float errorval_mult[2] {
1279
1.0f / ctx.config.tune_mse_overshoot,
1280
1.0f
1281
};
1282
1283
const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
1284
1285
// Only enable MODE0 fast path if enabled
1286
// Never enable for 3D blocks as no "always" block modes are available
1287
int start_trial = 1;
1288
if ((ctx.config.tune_search_mode0_enable >= TUNE_MIN_SEARCH_MODE0) && (bsd.zdim == 1))
1289
{
1290
start_trial = 0;
1291
}
1292
1293
int quant_limit = QUANT_32;
1294
for (int i = start_trial; i < 2; i++)
1295
{
1296
TRACE_NODE(node1, "pass");
1297
trace_add_data("partition_count", 1);
1298
trace_add_data("plane_count", 1);
1299
trace_add_data("search_mode", i);
1300
1301
float errorval = compress_symbolic_block_for_partition_1plane(
1302
ctx.config, bsd, blk, i == 0,
1303
error_threshold * errorval_mult[i] * errorval_overshoot,
1304
1, 0, scb, tmpbuf, QUANT_32);
1305
1306
// Record the quant level so we can use the filter later searches
1307
const auto& bm = bsd.get_block_mode(scb.block_mode);
1308
quant_limit = bm.get_weight_quant_mode();
1309
1310
best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval);
1311
if (errorval < (error_threshold * errorval_mult[i]))
1312
{
1313
trace_add_data("exit", "quality hit");
1314
goto END_OF_TESTS;
1315
}
1316
}
1317
1318
#if !defined(ASTCENC_DIAGNOSTICS)
1319
lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
1320
#endif
1321
1322
block_skip_two_plane = lowest_correl > ctx.config.tune_2plane_early_out_limit_correlation;
1323
1324
// Test the four possible 1-partition, 2-planes modes. Do this in reverse, as
1325
// alpha is the most likely to be non-correlated if it is present in the data.
1326
for (int i = BLOCK_MAX_COMPONENTS - 1; i >= 0; i--)
1327
{
1328
TRACE_NODE(node1, "pass");
1329
trace_add_data("partition_count", 1);
1330
trace_add_data("plane_count", 2);
1331
trace_add_data("plane_component", i);
1332
1333
if (block_skip_two_plane)
1334
{
1335
trace_add_data("skip", "tune_2plane_early_out_limit_correlation");
1336
continue;
1337
}
1338
1339
if (blk.grayscale && i != 3)
1340
{
1341
trace_add_data("skip", "grayscale block");
1342
continue;
1343
}
1344
1345
if (blk.is_constant_channel(i))
1346
{
1347
trace_add_data("skip", "constant component");
1348
continue;
1349
}
1350
1351
float errorval = compress_symbolic_block_for_partition_2planes(
1352
ctx.config, bsd, blk, error_threshold * errorval_overshoot,
1353
i, scb, tmpbuf, quant_limit);
1354
1355
// If attempting two planes is much worse than the best one plane result
1356
// then further two plane searches are unlikely to help so move on ...
1357
if (errorval > (best_errorvals_for_pcount[0] * 1.85f))
1358
{
1359
break;
1360
}
1361
1362
if (errorval < error_threshold)
1363
{
1364
trace_add_data("exit", "quality hit");
1365
goto END_OF_TESTS;
1366
}
1367
}
1368
1369
// Find best blocks for 2, 3 and 4 partitions
1370
for (int partition_count = 2; partition_count <= max_partitions; partition_count++)
1371
{
1372
unsigned int partition_indices[TUNE_MAX_PARTITIONING_CANDIDATES];
1373
1374
unsigned int requested_indices = requested_partition_indices[partition_count - 2];
1375
1376
unsigned int requested_trials = requested_partition_trials[partition_count - 2];
1377
requested_trials = astc::min(requested_trials, requested_indices);
1378
1379
unsigned int actual_trials = find_best_partition_candidates(
1380
bsd, blk, partition_count, requested_indices, partition_indices, requested_trials);
1381
1382
float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
1383
1384
for (unsigned int i = 0; i < actual_trials; i++)
1385
{
1386
TRACE_NODE(node1, "pass");
1387
trace_add_data("partition_count", partition_count);
1388
trace_add_data("partition_index", partition_indices[i]);
1389
trace_add_data("plane_count", 1);
1390
trace_add_data("search_mode", i);
1391
1392
float errorval = compress_symbolic_block_for_partition_1plane(
1393
ctx.config, bsd, blk, false,
1394
error_threshold * errorval_overshoot,
1395
partition_count, partition_indices[i],
1396
scb, tmpbuf, quant_limit);
1397
1398
best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);
1399
1400
// If using N partitions doesn't improve much over using N-1 partitions then skip trying
1401
// N+1. Error can dramatically improve if the data is correlated or non-correlated and
1402
// aligns with a partitioning that suits that encoding, so for this inner loop check add
1403
// a large error scale because the "other" trial could be a lot better.
1404
float best_error = best_errorvals_for_pcount[partition_count - 1];
1405
float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 1.85f;
1406
if (best_error > (best_error_in_prev * best_error_scale))
1407
{
1408
trace_add_data("skip", "tune_partition_early_out_limit_factor");
1409
goto END_OF_TESTS;
1410
}
1411
1412
if (errorval < error_threshold)
1413
{
1414
trace_add_data("exit", "quality hit");
1415
goto END_OF_TESTS;
1416
}
1417
}
1418
1419
// If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1
1420
float best_error = best_errorvals_for_pcount[partition_count - 1];
1421
float best_error_scale = exit_thresholds_for_pcount[partition_count - 1];
1422
if (best_error > (best_error_in_prev * best_error_scale))
1423
{
1424
trace_add_data("skip", "tune_partition_early_out_limit_factor");
1425
goto END_OF_TESTS;
1426
}
1427
}
1428
1429
trace_add_data("exit", "quality not hit");
1430
1431
END_OF_TESTS:
1432
// If we still have an error block then convert to something we can encode
1433
// TODO: Do something more sensible here, such as average color block
1434
if (scb.block_type == SYM_BTYPE_ERROR)
1435
{
1436
#if defined(ASTCENC_DIAGNOSTICS)
1437
static bool printed_once = false;
1438
if (!printed_once)
1439
{
1440
printed_once = true;
1441
printf("WARN: At least one block failed to find a valid encoding.\n"
1442
" Try increasing compression quality settings.\n\n");
1443
}
1444
#endif
1445
1446
scb.block_type = SYM_BTYPE_CONST_U16;
1447
vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
1448
vint4 color_u16 = float_to_int_rtn(color_f32);
1449
store(color_u16, scb.constant_color);
1450
}
1451
1452
// Compress to a physical block
1453
symbolic_to_physical(bsd, scb, pcb);
1454
}
1455
1456
#endif
1457
1458