CoCalc -- astcenc_compress

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/astcenc/astcenc_compress_symbolic.cpp
⁹⁸⁹⁶ views
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2025 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
17

18
#if !defined(ASTCENC_DECOMPRESS_ONLY)
19

20
/**
21
 * @brief Functions to compress a symbolic block.
22
 */
23

24
#include "astcenc_internal.h"
25
#include "astcenc_diagnostic_trace.h"
26

27
#include <cassert>
28

29
/**
30
 * @brief Merge two planes of endpoints into a single vector.
31
 *
32
 * @param      ep_plane1          The endpoints for plane 1.
33
 * @param      ep_plane2          The endpoints for plane 2.
34
 * @param      component_plane2   The color component for plane 2.
35
 * @param[out] result             The merged output.
36
 */
37
static void merge_endpoints(
38
	const endpoints& ep_plane1,
39
	const endpoints& ep_plane2,
40
	unsigned int component_plane2,
41
	endpoints& result
42
) {
43
	unsigned int partition_count = ep_plane1.partition_count;
44
	assert(partition_count == 1);
45

46
	vmask4 sep_mask = vint4::lane_id() == vint4(component_plane2);
47

48
	result.partition_count = partition_count;
49
	result.endpt0[0] = select(ep_plane1.endpt0[0], ep_plane2.endpt0[0], sep_mask);
50
	result.endpt1[0] = select(ep_plane1.endpt1[0], ep_plane2.endpt1[0], sep_mask);
51
}
52

53
/**
54
 * @brief Attempt to improve weights given a chosen configuration.
55
 *
56
 * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
57
 * partition and per plane) and attempt to improve image quality by moving each weight up by one or
58
 * down by one quantization step.
59
 *
60
 * This is a specialized function which only supports operating on undecimated weight grids,
61
 * therefore primarily improving the performance of 4x4 and 5x5 blocks where grid decimation
62
 * is needed less often.
63
 *
64
 * @param      decode_mode   The decode mode (LDR, HDR).
65
 * @param      bsd           The block size information.
66
 * @param      blk           The image block color data to compress.
67
 * @param[out] scb           The symbolic compressed block output.
68
 */
69
static bool realign_weights_undecimated(
70
	astcenc_profile decode_mode,
71
	const block_size_descriptor& bsd,
72
	const image_block& blk,
73
	symbolic_compressed_block& scb
74
) {
75
	// Get the partition descriptor
76
	unsigned int partition_count = scb.partition_count;
77
	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
78

79
	// Get the quantization table
80
	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
81
	unsigned int weight_quant_level = bm.quant_mode;
82
	const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
83

84
	unsigned int max_plane = bm.is_dual_plane;
85
	int plane2_component = scb.plane2_component;
86
	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
87

88
	// Decode the color endpoints
89
	bool rgb_hdr;
90
	bool alpha_hdr;
91
	vint4 endpnt0[BLOCK_MAX_PARTITIONS];
92
	vint4 endpnt1[BLOCK_MAX_PARTITIONS];
93
	vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
94
	vfloat4 offset[BLOCK_MAX_PARTITIONS];
95

96
	promise(partition_count > 0);
97

98
	for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
99
	{
100
		unpack_color_endpoints(decode_mode,
101
		                       scb.color_formats[pa_idx],
102
		                       scb.color_values[pa_idx],
103
		                       rgb_hdr, alpha_hdr,
104
		                       endpnt0[pa_idx],
105
		                       endpnt1[pa_idx]);
106
	}
107

108
	uint8_t* dec_weights_uquant = scb.weights;
109
	bool adjustments = false;
110

111
	// For each plane and partition ...
112
	for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
113
	{
114
		for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
115
		{
116
			// Compute the endpoint delta for all components in current plane
117
			vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
118
			epd = select(epd, vint4::zero(), plane_mask);
119

120
			endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
121
			offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
122
		}
123

124
		// For each weight compute previous, current, and next errors
125
		promise(bsd.texel_count > 0);
126
		for (unsigned int texel = 0; texel < bsd.texel_count; texel++)
127
		{
128
			int uqw = dec_weights_uquant[texel];
129

130
			uint32_t prev_and_next = qat.prev_next_values[uqw];
131
			int uqw_down = prev_and_next & 0xFF;
132
			int uqw_up = (prev_and_next >> 8) & 0xFF;
133

134
			// Interpolate the colors to create the diffs
135
			float weight_base = static_cast<float>(uqw);
136
			float weight_down = static_cast<float>(uqw_down - uqw);
137
			float weight_up = static_cast<float>(uqw_up - uqw);
138

139
			unsigned int partition = pi.partition_of_texel[texel];
140
			vfloat4 color_offset = offset[partition];
141
			vfloat4 color_base   = endpnt0f[partition];
142

143
			vfloat4 color = color_base + color_offset * weight_base;
144
			vfloat4 orig_color   = blk.texel(texel);
145
			vfloat4 error_weight = blk.channel_weight;
146

147
			vfloat4 color_diff      = color - orig_color;
148
			vfloat4 color_diff_down = color_diff + color_offset * weight_down;
149
			vfloat4 color_diff_up   = color_diff + color_offset * weight_up;
150

151
			float error_base = dot_s(color_diff      * color_diff,      error_weight);
152
			float error_down = dot_s(color_diff_down * color_diff_down, error_weight);
153
			float error_up   = dot_s(color_diff_up   * color_diff_up,   error_weight);
154

155
			// Check if the prev or next error is better, and if so use it
156
			if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
157
			{
158
				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);
159
				adjustments = true;
160
			}
161
			else if ((error_down < error_base) && (uqw > 0))
162
			{
163
				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);
164
				adjustments = true;
165
			}
166
		}
167

168
		// Prepare iteration for plane 2
169
		dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
170
		plane_mask = ~plane_mask;
171
	}
172

173
	return adjustments;
174
}
175

176
/**
177
 * @brief Attempt to improve weights given a chosen configuration.
178
 *
179
 * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
180
 * partition and per plane) and attempt to improve image quality by moving each weight up by one or
181
 * down by one quantization step.
182
 *
183
 * @param      decode_mode   The decode mode (LDR, HDR).
184
 * @param      bsd           The block size information.
185
 * @param      blk           The image block color data to compress.
186
 * @param[out] scb           The symbolic compressed block output.
187
 */
188
static bool realign_weights_decimated(
189
	astcenc_profile decode_mode,
190
	const block_size_descriptor& bsd,
191
	const image_block& blk,
192
	symbolic_compressed_block& scb
193
) {
194
	// Get the partition descriptor
195
	unsigned int partition_count = scb.partition_count;
196
	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
197

198
	// Get the quantization table
199
	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
200
	unsigned int weight_quant_level = bm.quant_mode;
201
	const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
202

203
	// Get the decimation table
204
	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
205
	unsigned int weight_count = di.weight_count;
206
	assert(weight_count != bsd.texel_count);
207

208
	unsigned int max_plane = bm.is_dual_plane;
209
	int plane2_component = scb.plane2_component;
210
	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
211

212
	// Decode the color endpoints
213
	bool rgb_hdr;
214
	bool alpha_hdr;
215
	vint4 endpnt0[BLOCK_MAX_PARTITIONS];
216
	vint4 endpnt1[BLOCK_MAX_PARTITIONS];
217
	vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
218
	vfloat4 offset[BLOCK_MAX_PARTITIONS];
219

220
	promise(partition_count > 0);
221
	promise(weight_count > 0);
222

223
	for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
224
	{
225
		unpack_color_endpoints(decode_mode,
226
		                       scb.color_formats[pa_idx],
227
		                       scb.color_values[pa_idx],
228
		                       rgb_hdr, alpha_hdr,
229
		                       endpnt0[pa_idx],
230
		                       endpnt1[pa_idx]);
231
	}
232

233
	uint8_t* dec_weights_uquant = scb.weights;
234
	bool adjustments = false;
235

236
	// For each plane and partition ...
237
	for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
238
	{
239
		for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
240
		{
241
			// Compute the endpoint delta for all components in current plane
242
			vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
243
			epd = select(epd, vint4::zero(), plane_mask);
244

245
			endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
246
			offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
247
		}
248

249
		// Create an unquantized weight grid for this decimation level
250
		ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS];
251
		for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
252
		{
253
			vint unquant_value(dec_weights_uquant + we_idx);
254
			vfloat unquant_valuef = int_to_float(unquant_value);
255
			storea(unquant_valuef, uq_weightsf + we_idx);
256
		}
257

258
		// For each weight compute previous, current, and next errors
259
		for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++)
260
		{
261
			int uqw = dec_weights_uquant[we_idx];
262
			uint32_t prev_and_next = qat.prev_next_values[uqw];
263

264
			float uqw_base = uq_weightsf[we_idx];
265
			float uqw_down = static_cast<float>(prev_and_next & 0xFF);
266
			float uqw_up = static_cast<float>((prev_and_next >> 8) & 0xFF);
267

268
			float uqw_diff_down = uqw_down - uqw_base;
269
			float uqw_diff_up = uqw_up - uqw_base;
270

271
			vfloat4 error_basev = vfloat4::zero();
272
			vfloat4 error_downv = vfloat4::zero();
273
			vfloat4 error_upv = vfloat4::zero();
274

275
			// Interpolate the colors to create the diffs
276
			unsigned int texels_to_evaluate = di.weight_texel_count[we_idx];
277
			promise(texels_to_evaluate > 0);
278
			for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++)
279
			{
280
				unsigned int texel = di.weight_texels_tr[te_idx][we_idx];
281

282
				float tw_base = di.texel_contrib_for_weight[te_idx][we_idx];
283

284
				float weight_base = (uq_weightsf[di.texel_weights_tr[0][texel]] * di.texel_weight_contribs_float_tr[0][texel]
285
				                   + uq_weightsf[di.texel_weights_tr[1][texel]] * di.texel_weight_contribs_float_tr[1][texel])
286
					              + (uq_weightsf[di.texel_weights_tr[2][texel]] * di.texel_weight_contribs_float_tr[2][texel]
287
				                   + uq_weightsf[di.texel_weights_tr[3][texel]] * di.texel_weight_contribs_float_tr[3][texel]);
288

289
				// Ideally this is integer rounded, but IQ gain it isn't worth the overhead
290
				// float weight = astc::flt_rd(weight_base + 0.5f);
291
				// float weight_down = astc::flt_rd(weight_base + 0.5f + uqw_diff_down * tw_base) - weight;
292
				// float weight_up = astc::flt_rd(weight_base + 0.5f + uqw_diff_up * tw_base) - weight;
293
				float weight_down = weight_base + uqw_diff_down * tw_base - weight_base;
294
				float weight_up = weight_base + uqw_diff_up * tw_base - weight_base;
295

296
				unsigned int partition = pi.partition_of_texel[texel];
297
				vfloat4 color_offset = offset[partition];
298
				vfloat4 color_base   = endpnt0f[partition];
299

300
				vfloat4 color = color_base + color_offset * weight_base;
301
				vfloat4 orig_color = blk.texel(texel);
302

303
				vfloat4 color_diff      = color - orig_color;
304
				vfloat4 color_down_diff = color_diff + color_offset * weight_down;
305
				vfloat4 color_up_diff   = color_diff + color_offset * weight_up;
306

307
				error_basev += color_diff * color_diff;
308
				error_downv += color_down_diff * color_down_diff;
309
				error_upv   += color_up_diff * color_up_diff;
310
			}
311

312
			vfloat4 error_weight = blk.channel_weight;
313
			float error_base = hadd_s(error_basev * error_weight);
314
			float error_down = hadd_s(error_downv * error_weight);
315
			float error_up   = hadd_s(error_upv   * error_weight);
316

317
			// Check if the prev or next error is better, and if so use it
318
			if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
319
			{
320
				uq_weightsf[we_idx] = uqw_up;
321
				dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_up);
322
				adjustments = true;
323
			}
324
			else if ((error_down < error_base) && (uqw > 0))
325
			{
326
				uq_weightsf[we_idx] = uqw_down;
327
				dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_down);
328
				adjustments = true;
329
			}
330
		}
331

332
		// Prepare iteration for plane 2
333
		dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
334
		plane_mask = ~plane_mask;
335
	}
336

337
	return adjustments;
338
}
339

340
/**
341
 * @brief Compress a block using a chosen partitioning and 1 plane of weights.
342
 *
343
 * @param      config                    The compressor configuration.
344
 * @param      bsd                       The block size information.
345
 * @param      blk                       The image block color data to compress.
346
 * @param      only_always               True if we only use "always" percentile block modes.
347
 * @param      tune_errorval_threshold   The error value threshold.
348
 * @param      partition_count           The partition count.
349
 * @param      partition_index           The partition index if @c partition_count is 2-4.
350
 * @param[out] scb                       The symbolic compressed block output.
351
 * @param[out] tmpbuf                    The quantized weights for plane 1.
352
 */
353
static float compress_symbolic_block_for_partition_1plane(
354
	const astcenc_config& config,
355
	const block_size_descriptor& bsd,
356
	const image_block& blk,
357
	bool only_always,
358
	float tune_errorval_threshold,
359
	unsigned int partition_count,
360
	unsigned int partition_index,
361
	symbolic_compressed_block& scb,
362
	compression_working_buffers& tmpbuf,
363
	int quant_limit
364
) {
365
	promise(partition_count > 0);
366
	promise(config.tune_candidate_limit > 0);
367
	promise(config.tune_refinement_limit > 0);
368

369
	int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
370

371
	auto compute_difference = &compute_symbolic_block_difference_1plane;
372
	if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM))
373
	{
374
		compute_difference = &compute_symbolic_block_difference_1plane_1partition;
375
	}
376

377
	const auto& pi = bsd.get_partition_info(partition_count, partition_index);
378

379
	// Compute ideal weights and endpoint colors, with no quantization or decimation
380
	endpoints_and_weights& ei = tmpbuf.ei1;
381
	compute_ideal_colors_and_weights_1plane(blk, pi, ei);
382

383
	// Compute ideal weights and endpoint colors for every decimation
384
	float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
385
	uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
386

387
	// For each decimation mode, compute an ideal set of weights with no quantization
388
	unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
389
	                                                : bsd.decimation_mode_count_selected;
390
	promise(max_decimation_modes > 0);
391
	for (unsigned int i = 0; i < max_decimation_modes; i++)
392
	{
393
		const auto& dm = bsd.get_decimation_mode(i);
394
		if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
395
		{
396
			continue;
397
		}
398

399
		const auto& di = bsd.get_decimation_info(i);
400

401
		compute_ideal_weights_for_decimation(
402
		    ei,
403
		    di,
404
		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
405
	}
406

407
	// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
408
	// weight pair, compute the smallest weight that will result in a color value greater than 1
409
	vfloat4 min_ep(10.0f);
410
	for (unsigned int i = 0; i < partition_count; i++)
411
	{
412
		vfloat4 ep = (vfloat4(1.0f) - ei.ep.endpt0[i]) / (ei.ep.endpt1[i] - ei.ep.endpt0[i]);
413

414
		vmask4 use_ep = (ep > vfloat4(0.5f)) & (ep < min_ep);
415
		min_ep = select(min_ep, ep, use_ep);
416
	}
417

418
	float min_wt_cutoff = hmin_s(min_ep);
419

420
	// For each mode, use the angular method to compute a shift
421
	compute_angular_endpoints_1plane(
422
	    only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
423

424
	float* weight_low_value = tmpbuf.weight_low_value1;
425
	float* weight_high_value = tmpbuf.weight_high_value1;
426
	int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
427
	float* qwt_errors = tmpbuf.qwt_errors;
428

429
	// For each mode (which specifies a decimation and a quantization):
430
	//     * Compute number of bits needed for the quantized weights
431
	//     * Generate an optimized set of quantized weights
432
	//     * Compute quantization errors for the mode
433

434

435
	static const int8_t free_bits_for_partition_count[4] {
436
		115 - 4, 111 - 4 - PARTITION_INDEX_BITS, 108 - 4 - PARTITION_INDEX_BITS, 105 - 4 - PARTITION_INDEX_BITS
437
	};
438

439
	unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
440
	                                           : bsd.block_mode_count_1plane_selected;
441
	promise(max_block_modes > 0);
442
	for (unsigned int i = 0; i < max_block_modes; i++)
443
	{
444
		const block_mode& bm = bsd.block_modes[i];
445

446
		if (bm.quant_mode > max_weight_quant)
447
		{
448
			qwt_errors[i] = 1e38f;
449
			continue;
450
		}
451

452
		assert(!bm.is_dual_plane);
453
		int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits;
454
		if (bitcount <= 0)
455
		{
456
			qwt_errors[i] = 1e38f;
457
			continue;
458
		}
459

460
		if (weight_high_value[i] > 1.02f * min_wt_cutoff)
461
		{
462
			weight_high_value[i] = 1.0f;
463
		}
464

465
		int decimation_mode = bm.decimation_mode;
466
		const auto& di = bsd.get_decimation_info(decimation_mode);
467

468
		qwt_bitcounts[i] = static_cast<int8_t>(bitcount);
469

470
		ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
471

472
		// Generate the optimized set of weights for the weight mode
473
		compute_quantized_weights_for_decimation(
474
		    di,
475
		    weight_low_value[i], weight_high_value[i],
476
		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
477
		    dec_weights_uquantf,
478
		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
479
		    bm.get_weight_quant_mode());
480

481
		// Compute weight quantization errors for the block mode
482
		qwt_errors[i] = compute_error_of_weight_set_1plane(
483
		    ei,
484
		    di,
485
		    dec_weights_uquantf);
486
	}
487

488
	// Decide the optimal combination of color endpoint encodings and weight encodings
489
	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
490
	int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
491

492
	quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
493
	quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
494

495
	unsigned int candidate_count = compute_ideal_endpoint_formats(
496
	    pi, blk, ei.ep, qwt_bitcounts, qwt_errors,
497
	    config.tune_candidate_limit, 0, max_block_modes,
498
	    partition_format_specifiers, block_mode_index,
499
	    color_quant_level, color_quant_level_mod, tmpbuf);
500

501
	// Iterate over the N believed-to-be-best modes to find out which one is actually best
502
	float best_errorval_in_mode = ERROR_CALC_DEFAULT;
503
	float best_errorval_in_scb = scb.errorval;
504

505
	for (unsigned int i = 0; i < candidate_count; i++)
506
	{
507
		TRACE_NODE(node0, "candidate");
508

509
		const int bm_packed_index = block_mode_index[i];
510
		assert(bm_packed_index >= 0 && bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_selected));
511
		const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
512

513
		int decimation_mode = qw_bm.decimation_mode;
514
		const auto& di = bsd.get_decimation_info(decimation_mode);
515
		promise(di.weight_count > 0);
516

517
		trace_add_data("weight_x", di.weight_x);
518
		trace_add_data("weight_y", di.weight_y);
519
		trace_add_data("weight_z", di.weight_z);
520
		trace_add_data("weight_quant", qw_bm.quant_mode);
521

522
		// Recompute the ideal color endpoints before storing them
523
		vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS];
524
		vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS];
525

526
		symbolic_compressed_block workscb;
527
		endpoints workep = ei.ep;
528

529
		uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
530

531
		for (unsigned int j = 0; j < di.weight_count; j++)
532
		{
533
			workscb.weights[j] = u8_weight_src[j];
534
		}
535

536
		for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
537
		{
538
			recompute_ideal_colors_1plane(
539
			    blk, pi, di, workscb.weights,
540
			    workep, rgbs_colors, rgbo_colors);
541

542
			// Quantize the chosen color, tracking if worth trying the mod value
543
			bool all_same = color_quant_level[i] != color_quant_level_mod[i];
544
			for (unsigned int j = 0; j < partition_count; j++)
545
			{
546
				workscb.color_formats[j] = pack_color_endpoints(
547
				    workep.endpt0[j],
548
				    workep.endpt1[j],
549
				    rgbs_colors[j],
550
				    rgbo_colors[j],
551
				    partition_format_specifiers[i][j],
552
				    workscb.color_values[j],
553
				    color_quant_level[i]);
554

555
				all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0];
556
			}
557

558
			// If all the color endpoint modes are the same, we get a few more bits to store colors;
559
			// let's see if we can take advantage of this: requantize all the colors and see if the
560
			// endpoint modes remain the same.
561
			workscb.color_formats_matched = 0;
562
			if (partition_count >= 2 && all_same)
563
			{
564
				uint8_t colorvals[BLOCK_MAX_PARTITIONS][8];
565
				uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 };
566
				bool all_same_mod = true;
567
				for (unsigned int j = 0; j < partition_count; j++)
568
				{
569
					color_formats_mod[j] = pack_color_endpoints(
570
					    workep.endpt0[j],
571
					    workep.endpt1[j],
572
					    rgbs_colors[j],
573
					    rgbo_colors[j],
574
					    partition_format_specifiers[i][j],
575
					    colorvals[j],
576
					    color_quant_level_mod[i]);
577

578
					// Early out as soon as it's no longer possible to use mod
579
					if (color_formats_mod[j] != color_formats_mod[0])
580
					{
581
						all_same_mod = false;
582
						break;
583
					}
584
				}
585

586
				if (all_same_mod)
587
				{
588
					workscb.color_formats_matched = 1;
589
					for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++)
590
					{
591
						for (unsigned int k = 0; k < 8; k++)
592
						{
593
							workscb.color_values[j][k] = colorvals[j][k];
594
						}
595

596
						workscb.color_formats[j] = color_formats_mod[j];
597
					}
598
				}
599
			}
600

601
			// Store header fields
602
			workscb.partition_count = static_cast<uint8_t>(partition_count);
603
			workscb.partition_index = static_cast<uint16_t>(partition_index);
604
			workscb.plane2_component = -1;
605
			workscb.quant_mode = workscb.color_formats_matched ? color_quant_level_mod[i] : color_quant_level[i];
606
			workscb.block_mode = qw_bm.mode_index;
607
			workscb.block_type = SYM_BTYPE_NONCONST;
608

609
			// Pre-realign test
610
			if (l == 0)
611
			{
612
				float errorval = compute_difference(config, bsd, workscb, blk);
613
				if (errorval == -ERROR_CALC_DEFAULT)
614
				{
615
					errorval = -errorval;
616
					workscb.block_type = SYM_BTYPE_ERROR;
617
				}
618

619
				trace_add_data("error_prerealign", errorval);
620
				best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
621

622
				// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
623
				// iteration can help more so we give it a extra 8% leeway. Use this knowledge to
624
				// drive a heuristic to skip blocks that are unlikely to catch up with the best
625
				// block we have already.
626
				unsigned int iters_remaining = config.tune_refinement_limit - l;
627
				float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
628
				if (errorval > (threshold * best_errorval_in_scb))
629
				{
630
					break;
631
				}
632

633
				if (errorval < best_errorval_in_scb)
634
				{
635
					best_errorval_in_scb = errorval;
636
					workscb.errorval = errorval;
637
					scb = workscb;
638

639
					if (errorval < tune_errorval_threshold)
640
					{
641
						// Skip remaining candidates - this is "good enough"
642
						i = candidate_count;
643
						break;
644
					}
645
				}
646
			}
647

648
			bool adjustments;
649
			if (di.weight_count != bsd.texel_count)
650
			{
651
				adjustments = realign_weights_decimated(
652
					config.profile, bsd, blk, workscb);
653
			}
654
			else
655
			{
656
				adjustments = realign_weights_undecimated(
657
					config.profile, bsd, blk, workscb);
658
			}
659

660
			// Post-realign test
661
			float errorval = compute_difference(config, bsd, workscb, blk);
662
			if (errorval == -ERROR_CALC_DEFAULT)
663
			{
664
				errorval = -errorval;
665
				workscb.block_type = SYM_BTYPE_ERROR;
666
			}
667

668
			trace_add_data("error_postrealign", errorval);
669
			best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
670

671
			// Average refinement improvement is 3.5% per iteration, so skip blocks that are
672
			// unlikely to catch up with the best block we have already. Assume a 4.5% per step to
673
			// give benefit of the doubt ...
674
			unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
675
			float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
676
			if (errorval > (threshold * best_errorval_in_scb))
677
			{
678
				break;
679
			}
680

681
			if (errorval < best_errorval_in_scb)
682
			{
683
				best_errorval_in_scb = errorval;
684
				workscb.errorval = errorval;
685
				scb = workscb;
686

687
				if (errorval < tune_errorval_threshold)
688
				{
689
					// Skip remaining candidates - this is "good enough"
690
					i = candidate_count;
691
					break;
692
				}
693
			}
694

695
			if (!adjustments)
696
			{
697
				break;
698
			}
699
		}
700
	}
701

702
	return best_errorval_in_mode;
703
}
704

705
/**
706
 * @brief Compress a block using a chosen partitioning and 2 planes of weights.
707
 *
708
 * @param      config                    The compressor configuration.
709
 * @param      bsd                       The block size information.
710
 * @param      blk                       The image block color data to compress.
711
 * @param      tune_errorval_threshold   The error value threshold.
712
 * @param      plane2_component          The component index for the second plane of weights.
713
 * @param[out] scb                       The symbolic compressed block output.
714
 * @param[out] tmpbuf                    The quantized weights for plane 1.
715
 */
716
static float compress_symbolic_block_for_partition_2planes(
717
	const astcenc_config& config,
718
	const block_size_descriptor& bsd,
719
	const image_block& blk,
720
	float tune_errorval_threshold,
721
	unsigned int plane2_component,
722
	symbolic_compressed_block& scb,
723
	compression_working_buffers& tmpbuf,
724
	int quant_limit
725
) {
726
	promise(config.tune_candidate_limit > 0);
727
	promise(config.tune_refinement_limit > 0);
728
	promise(bsd.decimation_mode_count_selected > 0);
729

730
	int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
731

732
	// Compute ideal weights and endpoint colors, with no quantization or decimation
733
	endpoints_and_weights& ei1 = tmpbuf.ei1;
734
	endpoints_and_weights& ei2 = tmpbuf.ei2;
735

736
	compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2);
737

738
	// Compute ideal weights and endpoint colors for every decimation
739
	float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
740
	uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
741

742
	// For each decimation mode, compute an ideal set of weights with no quantization
743
	for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
744
	{
745
		const auto& dm = bsd.get_decimation_mode(i);
746
		if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
747
		{
748
			continue;
749
		}
750

751
		const auto& di = bsd.get_decimation_info(i);
752

753
		compute_ideal_weights_for_decimation(
754
		    ei1,
755
		    di,
756
		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
757

758
		compute_ideal_weights_for_decimation(
759
		    ei2,
760
		    di,
761
		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET);
762
	}
763

764
	// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
765
	// weight pair, compute the smallest weight that will result in a color value greater than 1
766
	vfloat4 min_ep1(10.0f);
767
	vfloat4 min_ep2(10.0f);
768

769
	vfloat4 ep1 = (vfloat4(1.0f) - ei1.ep.endpt0[0]) / (ei1.ep.endpt1[0] - ei1.ep.endpt0[0]);
770
	vmask4 use_ep1 = (ep1 > vfloat4(0.5f)) & (ep1 < min_ep1);
771
	min_ep1 = select(min_ep1, ep1, use_ep1);
772

773
	vfloat4 ep2 = (vfloat4(1.0f) - ei2.ep.endpt0[0]) / (ei2.ep.endpt1[0] - ei2.ep.endpt0[0]);
774
	vmask4 use_ep2 = (ep2 > vfloat4(0.5f)) & (ep2 < min_ep2);
775
	min_ep2 = select(min_ep2, ep2, use_ep2);
776

777
	vfloat4 err_max(ERROR_CALC_DEFAULT);
778
	vmask4 err_mask = vint4::lane_id() == vint4(plane2_component);
779

780
	// Set the plane2 component to max error in ep1
781
	min_ep1 = select(min_ep1, err_max, err_mask);
782

783
	float min_wt_cutoff1 = hmin_s(min_ep1);
784

785
	// Set the minwt2 to the plane2 component min in ep2
786
	float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));
787

788
	compute_angular_endpoints_2planes(
789
	    bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
790

791
	// For each mode (which specifies a decimation and a quantization):
792
	//     * Compute number of bits needed for the quantized weights
793
	//     * Generate an optimized set of quantized weights
794
	//     * Compute quantization errors for the mode
795

796
	float* weight_low_value1 = tmpbuf.weight_low_value1;
797
	float* weight_high_value1 = tmpbuf.weight_high_value1;
798
	float* weight_low_value2 = tmpbuf.weight_low_value2;
799
	float* weight_high_value2 = tmpbuf.weight_high_value2;
800

801
	int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
802
	float* qwt_errors = tmpbuf.qwt_errors;
803

804
	unsigned int start_2plane = bsd.block_mode_count_1plane_selected;
805
	unsigned int end_2plane = bsd.block_mode_count_1plane_2plane_selected;
806

807
	for (unsigned int i = start_2plane; i < end_2plane; i++)
808
	{
809
		const block_mode& bm = bsd.block_modes[i];
810
		assert(bm.is_dual_plane);
811

812
		if (bm.quant_mode > max_weight_quant)
813
		{
814
			qwt_errors[i] = 1e38f;
815
			continue;
816
		}
817

818
		qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits);
819

820
		if (weight_high_value1[i] > 1.02f * min_wt_cutoff1)
821
		{
822
			weight_high_value1[i] = 1.0f;
823
		}
824

825
		if (weight_high_value2[i] > 1.02f * min_wt_cutoff2)
826
		{
827
			weight_high_value2[i] = 1.0f;
828
		}
829

830
		unsigned int decimation_mode = bm.decimation_mode;
831
		const auto& di = bsd.get_decimation_info(decimation_mode);
832

833
		ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
834

835
		// Generate the optimized set of weights for the mode
836
		compute_quantized_weights_for_decimation(
837
		    di,
838
		    weight_low_value1[i],
839
		    weight_high_value1[i],
840
		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
841
		    dec_weights_uquantf,
842
		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
843
		    bm.get_weight_quant_mode());
844

845
		compute_quantized_weights_for_decimation(
846
		    di,
847
		    weight_low_value2[i],
848
		    weight_high_value2[i],
849
		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET,
850
		    dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET,
851
		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,
852
		    bm.get_weight_quant_mode());
853

854
		// Compute weight quantization errors for the block mode
855
		qwt_errors[i] = compute_error_of_weight_set_2planes(
856
		    ei1,
857
		    ei2,
858
		    di,
859
		    dec_weights_uquantf,
860
		    dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET);
861
	}
862

863
	// Decide the optimal combination of color endpoint encodings and weight encodings
864
	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
865
	int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
866

867
	quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
868
	quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
869

870
	endpoints epm;
871
	merge_endpoints(ei1.ep, ei2.ep, plane2_component, epm);
872

873
	const auto& pi = bsd.get_partition_info(1, 0);
874
	unsigned int candidate_count = compute_ideal_endpoint_formats(
875
	    pi, blk, epm, qwt_bitcounts, qwt_errors,
876
	    config.tune_candidate_limit,
877
		bsd.block_mode_count_1plane_selected, bsd.block_mode_count_1plane_2plane_selected,
878
	    partition_format_specifiers, block_mode_index,
879
	    color_quant_level, color_quant_level_mod, tmpbuf);
880

881
	// Iterate over the N believed-to-be-best modes to find out which one is actually best
882
	float best_errorval_in_mode = ERROR_CALC_DEFAULT;
883
	float best_errorval_in_scb = scb.errorval;
884

885
	for (unsigned int i = 0; i < candidate_count; i++)
886
	{
887
		TRACE_NODE(node0, "candidate");
888

889
		const int bm_packed_index = block_mode_index[i];
890
		assert(bm_packed_index >= static_cast<int>(bsd.block_mode_count_1plane_selected) &&
891
		       bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_2plane_selected));
892
		const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
893

894
		int decimation_mode = qw_bm.decimation_mode;
895
		const auto& di = bsd.get_decimation_info(decimation_mode);
896
		promise(di.weight_count > 0);
897

898
		trace_add_data("weight_x", di.weight_x);
899
		trace_add_data("weight_y", di.weight_y);
900
		trace_add_data("weight_z", di.weight_z);
901
		trace_add_data("weight_quant", qw_bm.quant_mode);
902

903
		vfloat4 rgbs_color;
904
		vfloat4 rgbo_color;
905

906
		symbolic_compressed_block workscb;
907
		endpoints workep = epm;
908

909
		uint8_t* u8_weight1_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
910
		uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET;
911

912
		for (int j = 0; j < di.weight_count; j++)
913
		{
914
			workscb.weights[j] = u8_weight1_src[j];
915
			workscb.weights[j + WEIGHTS_PLANE2_OFFSET] = u8_weight2_src[j];
916
		}
917

918
		for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
919
		{
920
			recompute_ideal_colors_2planes(
921
			    blk, bsd, di,
922
			    workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,
923
			    workep, rgbs_color, rgbo_color, plane2_component);
924

925
			// Quantize the chosen color
926
			workscb.color_formats[0] = pack_color_endpoints(
927
			                               workep.endpt0[0],
928
			                               workep.endpt1[0],
929
			                               rgbs_color, rgbo_color,
930
			                               partition_format_specifiers[i][0],
931
			                               workscb.color_values[0],
932
			                               color_quant_level[i]);
933

934
			// Store header fields
935
			workscb.partition_count = 1;
936
			workscb.partition_index = 0;
937
			workscb.quant_mode = color_quant_level[i];
938
			workscb.color_formats_matched = 0;
939
			workscb.block_mode = qw_bm.mode_index;
940
			workscb.plane2_component = static_cast<int8_t>(plane2_component);
941
			workscb.block_type = SYM_BTYPE_NONCONST;
942

943
			// Pre-realign test
944
			if (l == 0)
945
			{
946
				float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
947
				if (errorval == -ERROR_CALC_DEFAULT)
948
				{
949
					errorval = -errorval;
950
					workscb.block_type = SYM_BTYPE_ERROR;
951
				}
952

953
				trace_add_data("error_prerealign", errorval);
954
				best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
955

956
				// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
957
				// iteration can help more so we give it a extra 8% leeway. Use this knowledge to
958
				// drive a heuristic to skip blocks that are unlikely to catch up with the best
959
				// block we have already.
960
				unsigned int iters_remaining = config.tune_refinement_limit - l;
961
				float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
962
				if (errorval > (threshold * best_errorval_in_scb))
963
				{
964
					break;
965
				}
966

967
				if (errorval < best_errorval_in_scb)
968
				{
969
					best_errorval_in_scb = errorval;
970
					workscb.errorval = errorval;
971
					scb = workscb;
972

973
					if (errorval < tune_errorval_threshold)
974
					{
975
						// Skip remaining candidates - this is "good enough"
976
						i = candidate_count;
977
						break;
978
					}
979
				}
980
			}
981

982
			// Perform a final pass over the weights to try to improve them.
983
			bool adjustments;
984
			if (di.weight_count != bsd.texel_count)
985
			{
986
				adjustments = realign_weights_decimated(
987
					config.profile, bsd, blk, workscb);
988
			}
989
			else
990
			{
991
				adjustments = realign_weights_undecimated(
992
					config.profile, bsd, blk, workscb);
993
			}
994

995
			// Post-realign test
996
			float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
997
			if (errorval == -ERROR_CALC_DEFAULT)
998
			{
999
				errorval = -errorval;
1000
				workscb.block_type = SYM_BTYPE_ERROR;
1001
			}
1002

1003
			trace_add_data("error_postrealign", errorval);
1004
			best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
1005

1006
			// Average refinement improvement is 3.5% per iteration, so skip blocks that are
1007
			// unlikely to catch up with the best block we have already. Assume a 4.5% per step to
1008
			// give benefit of the doubt ...
1009
			unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
1010
			float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
1011
			if (errorval > (threshold * best_errorval_in_scb))
1012
			{
1013
				break;
1014
			}
1015

1016
			if (errorval < best_errorval_in_scb)
1017
			{
1018
				best_errorval_in_scb = errorval;
1019
				workscb.errorval = errorval;
1020
				scb = workscb;
1021

1022
				if (errorval < tune_errorval_threshold)
1023
				{
1024
					// Skip remaining candidates - this is "good enough"
1025
					i = candidate_count;
1026
					break;
1027
				}
1028
			}
1029

1030
			if (!adjustments)
1031
			{
1032
				break;
1033
			}
1034
		}
1035
	}
1036

1037
	return best_errorval_in_mode;
1038
}
1039

1040
/**
1041
 * @brief Determine the lowest cross-channel correlation factor.
1042
 *
1043
 * @param texels_per_block   The number of texels in a block.
1044
 * @param blk                The image block color data to compress.
1045
 *
1046
 * @return Return the lowest correlation factor.
1047
 */
1048
static float prepare_block_statistics(
1049
	int texels_per_block,
1050
	const image_block& blk
1051
) {
1052
	// Compute covariance matrix, as a collection of 10 scalars that form the upper-triangular row
1053
	// of the matrix. The matrix is symmetric, so this is all we need for this use case.
1054
	float rs = 0.0f;
1055
	float gs = 0.0f;
1056
	float bs = 0.0f;
1057
	float as = 0.0f;
1058
	float rr_var = 0.0f;
1059
	float gg_var = 0.0f;
1060
	float bb_var = 0.0f;
1061
	float aa_var = 0.0f;
1062
	float rg_cov = 0.0f;
1063
	float rb_cov = 0.0f;
1064
	float ra_cov = 0.0f;
1065
	float gb_cov = 0.0f;
1066
	float ga_cov = 0.0f;
1067
	float ba_cov = 0.0f;
1068

1069
	float weight_sum = 0.0f;
1070

1071
	promise(texels_per_block > 0);
1072
	for (int i = 0; i < texels_per_block; i++)
1073
	{
1074
		float weight = hadd_s(blk.channel_weight) / 4.0f;
1075
		assert(weight >= 0.0f);
1076
		weight_sum += weight;
1077

1078
		float r = blk.data_r[i];
1079
		float g = blk.data_g[i];
1080
		float b = blk.data_b[i];
1081
		float a = blk.data_a[i];
1082

1083
		float rw = r * weight;
1084
		rs += rw;
1085
		rr_var += r * rw;
1086
		rg_cov += g * rw;
1087
		rb_cov += b * rw;
1088
		ra_cov += a * rw;
1089

1090
		float gw = g * weight;
1091
		gs += gw;
1092
		gg_var += g * gw;
1093
		gb_cov += b * gw;
1094
		ga_cov += a * gw;
1095

1096
		float bw = b * weight;
1097
		bs += bw;
1098
		bb_var += b * bw;
1099
		ba_cov += a * bw;
1100

1101
		float aw = a * weight;
1102
		as += aw;
1103
		aa_var += a * aw;
1104
	}
1105

1106
	float rpt = 1.0f / astc::max(weight_sum, 1e-7f);
1107

1108
	rr_var -= rs * (rs * rpt);
1109
	rg_cov -= gs * (rs * rpt);
1110
	rb_cov -= bs * (rs * rpt);
1111
	ra_cov -= as * (rs * rpt);
1112

1113
	gg_var -= gs * (gs * rpt);
1114
	gb_cov -= bs * (gs * rpt);
1115
	ga_cov -= as * (gs * rpt);
1116

1117
	bb_var -= bs * (bs * rpt);
1118
	ba_cov -= as * (bs * rpt);
1119

1120
	aa_var -= as * (as * rpt);
1121

1122
	// These will give a NaN if a channel is constant - these are fixed up in the next step
1123
	rg_cov *= astc::rsqrt(rr_var * gg_var);
1124
	rb_cov *= astc::rsqrt(rr_var * bb_var);
1125
	ra_cov *= astc::rsqrt(rr_var * aa_var);
1126
	gb_cov *= astc::rsqrt(gg_var * bb_var);
1127
	ga_cov *= astc::rsqrt(gg_var * aa_var);
1128
	ba_cov *= astc::rsqrt(bb_var * aa_var);
1129

1130
	if (astc::isnan(rg_cov)) rg_cov = 1.0f;
1131
	if (astc::isnan(rb_cov)) rb_cov = 1.0f;
1132
	if (astc::isnan(ra_cov)) ra_cov = 1.0f;
1133
	if (astc::isnan(gb_cov)) gb_cov = 1.0f;
1134
	if (astc::isnan(ga_cov)) ga_cov = 1.0f;
1135
	if (astc::isnan(ba_cov)) ba_cov = 1.0f;
1136

1137
	float lowest_correlation = astc::min(fabsf(rg_cov),      fabsf(rb_cov));
1138
	lowest_correlation       = astc::min(lowest_correlation, fabsf(ra_cov));
1139
	lowest_correlation       = astc::min(lowest_correlation, fabsf(gb_cov));
1140
	lowest_correlation       = astc::min(lowest_correlation, fabsf(ga_cov));
1141
	lowest_correlation       = astc::min(lowest_correlation, fabsf(ba_cov));
1142

1143
	// Diagnostic trace points
1144
	trace_add_data("min_r", blk.data_min.lane<0>());
1145
	trace_add_data("max_r", blk.data_max.lane<0>());
1146
	trace_add_data("min_g", blk.data_min.lane<1>());
1147
	trace_add_data("max_g", blk.data_max.lane<1>());
1148
	trace_add_data("min_b", blk.data_min.lane<2>());
1149
	trace_add_data("max_b", blk.data_max.lane<2>());
1150
	trace_add_data("min_a", blk.data_min.lane<3>());
1151
	trace_add_data("max_a", blk.data_max.lane<3>());
1152
	trace_add_data("cov_rg", fabsf(rg_cov));
1153
	trace_add_data("cov_rb", fabsf(rb_cov));
1154
	trace_add_data("cov_ra", fabsf(ra_cov));
1155
	trace_add_data("cov_gb", fabsf(gb_cov));
1156
	trace_add_data("cov_ga", fabsf(ga_cov));
1157
	trace_add_data("cov_ba", fabsf(ba_cov));
1158

1159
	return lowest_correlation;
1160
}
1161

1162
/* See header for documentation. */
1163
void compress_block(
1164
	const astcenc_contexti& ctx,
1165
	const image_block& blk,
1166
	uint8_t pcb[16],
1167
	compression_working_buffers& tmpbuf)
1168
{
1169
	astcenc_profile decode_mode = ctx.config.profile;
1170
	symbolic_compressed_block scb;
1171
	const block_size_descriptor& bsd = *ctx.bsd;
1172
	float lowest_correl;
1173

1174
	TRACE_NODE(node0, "block");
1175
	trace_add_data("pos_x", blk.xpos);
1176
	trace_add_data("pos_y", blk.ypos);
1177
	trace_add_data("pos_z", blk.zpos);
1178

1179
	// Set stricter block targets for luminance data as we have more bits to play with
1180
	bool block_is_l = blk.is_luminance();
1181
	float block_is_l_scale = block_is_l ? 1.0f / 1.5f : 1.0f;
1182

1183
	// Set slightly stricter block targets for lumalpha data as we have more bits to play with
1184
	bool block_is_la = blk.is_luminancealpha();
1185
	float block_is_la_scale = block_is_la ? 1.0f / 1.05f : 1.0f;
1186

1187
	bool block_skip_two_plane = false;
1188
	int max_partitions = ctx.config.tune_partition_count_limit;
1189

1190
	unsigned int requested_partition_indices[3] {
1191
		ctx.config.tune_2partition_index_limit,
1192
		ctx.config.tune_3partition_index_limit,
1193
		ctx.config.tune_4partition_index_limit
1194
	};
1195

1196
	unsigned int requested_partition_trials[3] {
1197
		ctx.config.tune_2partitioning_candidate_limit,
1198
		ctx.config.tune_3partitioning_candidate_limit,
1199
		ctx.config.tune_4partitioning_candidate_limit
1200
	};
1201

1202
#if defined(ASTCENC_DIAGNOSTICS)
1203
	// Do this early in diagnostic builds so we can dump uniform metrics
1204
	// for every block. Do it later in release builds to avoid redundant work!
1205
	float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
1206
	float error_threshold = ctx.config.tune_db_limit
1207
	                      * error_weight_sum
1208
	                      * block_is_l_scale
1209
	                      * block_is_la_scale;
1210

1211
	lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
1212
	trace_add_data("lowest_correl", lowest_correl);
1213
	trace_add_data("tune_error_threshold", error_threshold);
1214
#endif
1215

1216
	// Detected a constant-color block
1217
	if (all(blk.data_min == blk.data_max))
1218
	{
1219
		TRACE_NODE(node1, "pass");
1220
		trace_add_data("partition_count", 0);
1221
		trace_add_data("plane_count", 1);
1222

1223
		scb.partition_count = 0;
1224

1225
		// Encode as FP16 if using HDR
1226
		if ((decode_mode == ASTCENC_PRF_HDR) ||
1227
		    (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A))
1228
		{
1229
			scb.block_type = SYM_BTYPE_CONST_F16;
1230
			vint4 color_f16 = float_to_float16(blk.origin_texel);
1231
			store(color_f16, scb.constant_color);
1232
		}
1233
		// Encode as UNORM16 if NOT using HDR
1234
		else
1235
		{
1236
			scb.block_type = SYM_BTYPE_CONST_U16;
1237
			vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
1238
			vint4 color_u16 = float_to_int_rtn(color_f32);
1239
			store(color_u16, scb.constant_color);
1240
		}
1241

1242
		trace_add_data("exit", "quality hit");
1243

1244
		symbolic_to_physical(bsd, scb, pcb);
1245
		return;
1246
	}
1247

1248
#if !defined(ASTCENC_DIAGNOSTICS)
1249
	float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
1250
	float error_threshold = ctx.config.tune_db_limit
1251
	                      * error_weight_sum
1252
	                      * block_is_l_scale
1253
	                      * block_is_la_scale;
1254
#endif
1255

1256
	// Set SCB and mode errors to a very high error value
1257
	scb.errorval = ERROR_CALC_DEFAULT;
1258
	scb.block_type = SYM_BTYPE_ERROR;
1259

1260
	float best_errorvals_for_pcount[BLOCK_MAX_PARTITIONS] {
1261
		ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT
1262
	};
1263

1264
	float exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] {
1265
		0.0f,
1266
		ctx.config.tune_2partition_early_out_limit_factor,
1267
		ctx.config.tune_3partition_early_out_limit_factor,
1268
		0.0f
1269
	};
1270

1271
	// Trial using 1 plane of weights and 1 partition.
1272

1273
	// Most of the time we test it twice, first with a mode cutoff of 0 and then with the specified
1274
	// mode cutoff. This causes an early-out that speeds up encoding of easy blocks. However, this
1275
	// optimization is disabled for 4x4 and 5x4 blocks where it nearly always slows down the
1276
	// compression and slightly reduces image quality.
1277

1278
	float errorval_mult[2] {
1279
		1.0f / ctx.config.tune_mse_overshoot,
1280
		1.0f
1281
	};
1282

1283
	const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
1284

1285
	// Only enable MODE0 fast path if enabled
1286
	// Never enable for 3D blocks as no "always" block modes are available
1287
	int start_trial = 1;
1288
 	if ((ctx.config.tune_search_mode0_enable >= TUNE_MIN_SEARCH_MODE0) && (bsd.zdim == 1))
1289
	{
1290
		start_trial = 0;
1291
	}
1292

1293
	int quant_limit = QUANT_32;
1294
	for (int i = start_trial; i < 2; i++)
1295
	{
1296
		TRACE_NODE(node1, "pass");
1297
		trace_add_data("partition_count", 1);
1298
		trace_add_data("plane_count", 1);
1299
		trace_add_data("search_mode", i);
1300

1301
		float errorval = compress_symbolic_block_for_partition_1plane(
1302
		    ctx.config, bsd, blk, i == 0,
1303
		    error_threshold * errorval_mult[i] * errorval_overshoot,
1304
		    1, 0,  scb, tmpbuf, QUANT_32);
1305

1306
		// Record the quant level so we can use the filter later searches
1307
		const auto& bm = bsd.get_block_mode(scb.block_mode);
1308
		quant_limit = bm.get_weight_quant_mode();
1309

1310
		best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval);
1311
		if (errorval < (error_threshold * errorval_mult[i]))
1312
		{
1313
			trace_add_data("exit", "quality hit");
1314
			goto END_OF_TESTS;
1315
		}
1316
	}
1317

1318
#if !defined(ASTCENC_DIAGNOSTICS)
1319
	lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
1320
#endif
1321

1322
	block_skip_two_plane = lowest_correl > ctx.config.tune_2plane_early_out_limit_correlation;
1323

1324
	// Test the four possible 1-partition, 2-planes modes. Do this in reverse, as
1325
	// alpha is the most likely to be non-correlated if it is present in the data.
1326
	for (int i = BLOCK_MAX_COMPONENTS - 1; i >= 0; i--)
1327
	{
1328
		TRACE_NODE(node1, "pass");
1329
		trace_add_data("partition_count", 1);
1330
		trace_add_data("plane_count", 2);
1331
		trace_add_data("plane_component", i);
1332

1333
		if (block_skip_two_plane)
1334
		{
1335
			trace_add_data("skip", "tune_2plane_early_out_limit_correlation");
1336
			continue;
1337
		}
1338

1339
		if (blk.grayscale && i != 3)
1340
		{
1341
			trace_add_data("skip", "grayscale block");
1342
			continue;
1343
		}
1344

1345
		if (blk.is_constant_channel(i))
1346
		{
1347
			trace_add_data("skip", "constant component");
1348
			continue;
1349
		}
1350

1351
		float errorval = compress_symbolic_block_for_partition_2planes(
1352
		    ctx.config, bsd, blk, error_threshold * errorval_overshoot,
1353
		    i, scb, tmpbuf, quant_limit);
1354

1355
		// If attempting two planes is much worse than the best one plane result
1356
		// then further two plane searches are unlikely to help so move on ...
1357
		if (errorval > (best_errorvals_for_pcount[0] * 1.85f))
1358
		{
1359
			break;
1360
		}
1361

1362
		if (errorval < error_threshold)
1363
		{
1364
			trace_add_data("exit", "quality hit");
1365
			goto END_OF_TESTS;
1366
		}
1367
	}
1368

1369
	// Find best blocks for 2, 3 and 4 partitions
1370
	for (int partition_count = 2; partition_count <= max_partitions; partition_count++)
1371
	{
1372
		unsigned int partition_indices[TUNE_MAX_PARTITIONING_CANDIDATES];
1373

1374
		unsigned int requested_indices = requested_partition_indices[partition_count - 2];
1375

1376
		unsigned int requested_trials = requested_partition_trials[partition_count - 2];
1377
		requested_trials = astc::min(requested_trials, requested_indices);
1378

1379
		unsigned int actual_trials = find_best_partition_candidates(
1380
		    bsd, blk, partition_count, requested_indices, partition_indices, requested_trials);
1381

1382
		float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
1383

1384
		for (unsigned int i = 0; i < actual_trials; i++)
1385
		{
1386
			TRACE_NODE(node1, "pass");
1387
			trace_add_data("partition_count", partition_count);
1388
			trace_add_data("partition_index", partition_indices[i]);
1389
			trace_add_data("plane_count", 1);
1390
			trace_add_data("search_mode", i);
1391

1392
			float errorval = compress_symbolic_block_for_partition_1plane(
1393
			    ctx.config, bsd, blk, false,
1394
			    error_threshold * errorval_overshoot,
1395
			    partition_count, partition_indices[i],
1396
			    scb, tmpbuf, quant_limit);
1397

1398
			best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);
1399

1400
			// If using N partitions doesn't improve much over using N-1 partitions then skip trying
1401
			// N+1. Error can dramatically improve if the data is correlated or non-correlated and
1402
			// aligns with a partitioning that suits that encoding, so for this inner loop check add
1403
			// a large error scale because the "other" trial could be a lot better.
1404
			float best_error = best_errorvals_for_pcount[partition_count - 1];
1405
			float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 1.85f;
1406
			if (best_error > (best_error_in_prev * best_error_scale))
1407
			{
1408
				trace_add_data("skip", "tune_partition_early_out_limit_factor");
1409
				goto END_OF_TESTS;
1410
			}
1411

1412
			if (errorval < error_threshold)
1413
			{
1414
				trace_add_data("exit", "quality hit");
1415
				goto END_OF_TESTS;
1416
			}
1417
		}
1418

1419
		// If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1
1420
		float best_error = best_errorvals_for_pcount[partition_count - 1];
1421
		float best_error_scale = exit_thresholds_for_pcount[partition_count - 1];
1422
		if (best_error > (best_error_in_prev * best_error_scale))
1423
		{
1424
			trace_add_data("skip", "tune_partition_early_out_limit_factor");
1425
			goto END_OF_TESTS;
1426
		}
1427
	}
1428

1429
	trace_add_data("exit", "quality not hit");
1430

1431
END_OF_TESTS:
1432
	// If we still have an error block then convert to something we can encode
1433
	// TODO: Do something more sensible here, such as average color block
1434
	if (scb.block_type == SYM_BTYPE_ERROR)
1435
	{
1436
#if defined(ASTCENC_DIAGNOSTICS)
1437
		static bool printed_once = false;
1438
		if (!printed_once)
1439
		{
1440
			printed_once = true;
1441
			printf("WARN: At least one block failed to find a valid encoding.\n"
1442
			       "      Try increasing compression quality settings.\n\n");
1443
		}
1444
#endif
1445

1446
		scb.block_type = SYM_BTYPE_CONST_U16;
1447
		vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
1448
		vint4 color_u16 = float_to_int_rtn(color_f32);
1449
		store(color_u16, scb.constant_color);
1450
	}
1451

1452
	// Compress to a physical block
1453
	symbolic_to_physical(bsd, scb, pcb);
1454
}
1455

1456
#endif
1457

1458
Product

Resources

Company