GitHub Repository: godotengine/godot
Path: thirdparty/astcenc/astcenc_entry.cpp
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2025 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief Functions for the library entrypoint.
 */

#include <array>
#include <cstring>
#include <new>

#include "astcenc.h"
#include "astcenc_internal_entry.h"
#include "astcenc_diagnostic_trace.h"

/**
 * @brief Record of the quality tuning parameter values.
 *
 * See the @c astcenc_config structure for detailed parameter documentation.
 *
 * Note that the mse_overshoot entries are scaling factors relative to the base MSE needed to hit
 * db_limit. A 20% overshoot is harder to hit for a higher base db_limit, so we may actually use
 * lower ratios for the more thorough search presets because the underlying db_limit is so much
 * higher.
 */
struct astcenc_preset_config
{
	float quality;
	unsigned int tune_partition_count_limit;
	unsigned int tune_2partition_index_limit;
	unsigned int tune_3partition_index_limit;
	unsigned int tune_4partition_index_limit;
	unsigned int tune_block_mode_limit;
	unsigned int tune_refinement_limit;
	unsigned int tune_candidate_limit;
	unsigned int tune_2partitioning_candidate_limit;
	unsigned int tune_3partitioning_candidate_limit;
	unsigned int tune_4partitioning_candidate_limit;
	float tune_db_limit_a_base;
	float tune_db_limit_b_base;
	float tune_mse_overshoot;
	float tune_2partition_early_out_limit_factor;
	float tune_3partition_early_out_limit_factor;
	float tune_2plane_early_out_limit_correlation;
	float tune_search_mode0_enable;
};

/**
 * @brief The static presets for high bandwidth encodings (x < 25 texels per block).
 */
static const std::array<astcenc_preset_config, 6> preset_configs_high {{
	{
		ASTCENC_PRE_FASTEST,
		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 0.0f
	}, {
		ASTCENC_PRE_FAST,
		3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f, 0.0f
	}, {
		ASTCENC_PRE_MEDIUM,
		4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f, 0.0f
	}, {
		ASTCENC_PRE_THOROUGH,
		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f, 0.0f
	}, {
		ASTCENC_PRE_VERYTHOROUGH,
		4, 256, 128, 64, 98, 4, 6, 8, 6, 4, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f
	}, {
		ASTCENC_PRE_EXHAUSTIVE,
		4, 512, 512, 512, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f
	}
}};

/**
 * @brief The static presets for medium bandwidth encodings (25 <= x < 64 texels per block).
 */
static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
	{
		ASTCENC_PRE_FASTEST,
		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f
	}, {
		ASTCENC_PRE_FAST,
		3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f
	}, {
		ASTCENC_PRE_MEDIUM,
		3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f, 1.0f
	}, {
		ASTCENC_PRE_THOROUGH,
		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f, 0.0f
	}, {
		ASTCENC_PRE_VERYTHOROUGH,
		4, 256, 128, 64, 98, 4, 6, 8, 6, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f
	}, {
		ASTCENC_PRE_EXHAUSTIVE,
		4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f
	}
}};

/**
 * @brief The static presets for low bandwidth encodings (64 <= x texels per block).
 */
static const std::array<astcenc_preset_config, 6> preset_configs_low {{
	{
		ASTCENC_PRE_FASTEST,
		2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f
	}, {
		ASTCENC_PRE_FAST,
		2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f
	}, {
		ASTCENC_PRE_MEDIUM,
		3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f, 1.0f
	}, {
		ASTCENC_PRE_THOROUGH,
		4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f, 1.0f
	}, {
		ASTCENC_PRE_VERYTHOROUGH,
		4, 256, 128, 64, 98, 4, 6, 8, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 1.0f
	}, {
		ASTCENC_PRE_EXHAUSTIVE,
		4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 1.0f
	}
}};

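// Table selection note (see astcenc_config_init() below): the choice between the three tables
// depends only on the block footprint. For example a 4x4 block (16 texels) uses
// preset_configs_high, a 6x6 block (36 texels) uses preset_configs_mid, and an 8x8 block
// (64 texels) uses preset_configs_low.
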
/**
 * @brief Validate CPU floating point meets assumptions made in the codec.
 *
 * The codec is written with the assumption that a float threaded through the @c if32 union will be
 * stored and reloaded as a 32-bit IEEE-754 float with round-to-nearest rounding. This is always the
 * case in an IEEE-754 compliant system, however not every system or compilation mode is actually
 * IEEE-754 compliant. This normally fails if the code is compiled with fast math enabled.
 *
 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
 */
static astcenc_error validate_cpu_float()
{
	if32 p;
	volatile float xprec_testval = 2.51f;
	p.f = xprec_testval + 12582912.0f;
	float q = p.f - 12582912.0f;

	if (q != 3.0f)
	{
		return ASTCENC_ERR_BAD_CPU_FLOAT;
	}

	return ASTCENC_SUCCESS;
}

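// Note on the constant used above: 12582912.0f is 1.5 * 2^23. Adding a small value to it lands in
// the range [2^23, 2^24), where the spacing between representable floats is exactly 1.0, so an
// IEEE-754 round-to-nearest store rounds 2.51f + 12582912.0f to 12582915.0f and the subtraction
// yields 3.0f. Builds using fast-math or extended precision can keep extra bits and fail the check.
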
/**
 * @brief Validate config profile.
 *
 * @param profile The profile to check.
 *
 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
 */
static astcenc_error validate_profile(
	astcenc_profile profile
) {
	// Values in this enum are from an external user, so not guaranteed to be
	// bounded to the enum values
	switch (static_cast<int>(profile))
	{
	case ASTCENC_PRF_LDR_SRGB:
	case ASTCENC_PRF_LDR:
	case ASTCENC_PRF_HDR_RGB_LDR_A:
	case ASTCENC_PRF_HDR:
		return ASTCENC_SUCCESS;
	default:
		return ASTCENC_ERR_BAD_PROFILE;
	}
}

/**
 * @brief Validate block size.
 *
 * @param block_x The block x dimensions.
 * @param block_y The block y dimensions.
 * @param block_z The block z dimensions.
 *
 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
 */
static astcenc_error validate_block_size(
	unsigned int block_x,
	unsigned int block_y,
	unsigned int block_z
) {
	// Test if this is a legal block size at all
	bool is_legal = (((block_z <= 1) && is_legal_2d_block_size(block_x, block_y)) ||
		((block_z >= 2) && is_legal_3d_block_size(block_x, block_y, block_z)));
	if (!is_legal)
	{
		return ASTCENC_ERR_BAD_BLOCK_SIZE;
	}

	// Test if this build has sufficient capacity for this block size
	bool have_capacity = (block_x * block_y * block_z) <= BLOCK_MAX_TEXELS;
	if (!have_capacity)
	{
		return ASTCENC_ERR_NOT_IMPLEMENTED;
	}

	return ASTCENC_SUCCESS;
}

/**
 * @brief Validate flags.
 *
 * @param profile The profile to check.
 * @param flags   The flags to check.
 *
 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
 */
static astcenc_error validate_flags(
	astcenc_profile profile,
	unsigned int flags
) {
	// Flags field must not contain any unknown flag bits
	unsigned int exMask = ~ASTCENC_ALL_FLAGS;
	if (popcount(flags & exMask) != 0)
	{
		return ASTCENC_ERR_BAD_FLAGS;
	}

	// Flags field must only contain at most a single map type
	exMask = ASTCENC_FLG_MAP_NORMAL
		| ASTCENC_FLG_MAP_RGBM;
	if (popcount(flags & exMask) > 1)
	{
		return ASTCENC_ERR_BAD_FLAGS;
	}

	// Decode_unorm8 must only be used with an LDR profile
	bool is_unorm8 = flags & ASTCENC_FLG_USE_DECODE_UNORM8;
	bool is_hdr = (profile == ASTCENC_PRF_HDR) || (profile == ASTCENC_PRF_HDR_RGB_LDR_A);
	if (is_unorm8 && is_hdr)
	{
		return ASTCENC_ERR_BAD_DECODE_MODE;
	}

	return ASTCENC_SUCCESS;
}

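// Example of the flag validation above (illustrative): passing
// flags = ASTCENC_FLG_MAP_NORMAL | ASTCENC_FLG_MAP_RGBM fails the "single map type" check and
// returns ASTCENC_ERR_BAD_FLAGS, and ASTCENC_FLG_USE_DECODE_UNORM8 combined with ASTCENC_PRF_HDR
// returns ASTCENC_ERR_BAD_DECODE_MODE.
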
#if !defined(ASTCENC_DECOMPRESS_ONLY)

/**
 * @brief Validate single channel compression swizzle.
 *
 * @param swizzle The swizzle to check.
 *
 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
 */
static astcenc_error validate_compression_swz(
	astcenc_swz swizzle
) {
	// Not all enum values are handled; SWZ_Z is invalid for compression
	switch (static_cast<int>(swizzle))
	{
	case ASTCENC_SWZ_R:
	case ASTCENC_SWZ_G:
	case ASTCENC_SWZ_B:
	case ASTCENC_SWZ_A:
	case ASTCENC_SWZ_0:
	case ASTCENC_SWZ_1:
		return ASTCENC_SUCCESS;
	default:
		return ASTCENC_ERR_BAD_SWIZZLE;
	}
}

/**
 * @brief Validate overall compression swizzle.
 *
 * @param swizzle The swizzle to check.
 *
 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
 */
static astcenc_error validate_compression_swizzle(
	const astcenc_swizzle& swizzle
) {
	if (validate_compression_swz(swizzle.r) ||
		validate_compression_swz(swizzle.g) ||
		validate_compression_swz(swizzle.b) ||
		validate_compression_swz(swizzle.a))
	{
		return ASTCENC_ERR_BAD_SWIZZLE;
	}

	return ASTCENC_SUCCESS;
}
#endif

/**
 * @brief Validate single channel decompression swizzle.
 *
 * @param swizzle The swizzle to check.
 *
 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
 */
static astcenc_error validate_decompression_swz(
	astcenc_swz swizzle
) {
	// Values in this enum are from an external user, so not guaranteed to be
	// bounded to the enum values
	switch (static_cast<int>(swizzle))
	{
	case ASTCENC_SWZ_R:
	case ASTCENC_SWZ_G:
	case ASTCENC_SWZ_B:
	case ASTCENC_SWZ_A:
	case ASTCENC_SWZ_0:
	case ASTCENC_SWZ_1:
	case ASTCENC_SWZ_Z:
		return ASTCENC_SUCCESS;
	default:
		return ASTCENC_ERR_BAD_SWIZZLE;
	}
}

/**
 * @brief Validate overall decompression swizzle.
 *
 * @param swizzle The swizzle to check.
 *
 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
 */
static astcenc_error validate_decompression_swizzle(
	const astcenc_swizzle& swizzle
) {
	if (validate_decompression_swz(swizzle.r) ||
		validate_decompression_swz(swizzle.g) ||
		validate_decompression_swz(swizzle.b) ||
		validate_decompression_swz(swizzle.a))
	{
		return ASTCENC_ERR_BAD_SWIZZLE;
	}

	return ASTCENC_SUCCESS;
}

/**
 * Validate that an incoming configuration is in-spec.
 *
 * This function can respond in two ways:
 *
 *   * Numerical inputs that have valid ranges are clamped to those valid ranges. No error is thrown
 *     for out-of-range inputs in this case.
 *   * Numerical and logic inputs that are logically invalid, and which make no sense
 *     algorithmically, will return an error.
 *
 * @param[in,out] config The input compressor configuration.
 *
 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
 */
static astcenc_error validate_config(
	astcenc_config &config
) {
	astcenc_error status;

	status = validate_profile(config.profile);
	if (status != ASTCENC_SUCCESS)
	{
		return status;
	}

	status = validate_flags(config.profile, config.flags);
	if (status != ASTCENC_SUCCESS)
	{
		return status;
	}

	status = validate_block_size(config.block_x, config.block_y, config.block_z);
	if (status != ASTCENC_SUCCESS)
	{
		return status;
	}

#if defined(ASTCENC_DECOMPRESS_ONLY)
	// Decompress-only builds only support decompress-only contexts
	if (!(config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
	{
		return ASTCENC_ERR_BAD_PARAM;
	}
#endif

	config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f);

	config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u);
	config.tune_2partition_index_limit = astc::clamp(config.tune_2partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
	config.tune_3partition_index_limit = astc::clamp(config.tune_3partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
	config.tune_4partition_index_limit = astc::clamp(config.tune_4partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
	config.tune_block_mode_limit = astc::clamp(config.tune_block_mode_limit, 1u, 100u);
	config.tune_refinement_limit = astc::max(config.tune_refinement_limit, 1u);
	config.tune_candidate_limit = astc::clamp(config.tune_candidate_limit, 1u, TUNE_MAX_TRIAL_CANDIDATES);
	config.tune_2partitioning_candidate_limit = astc::clamp(config.tune_2partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
	config.tune_3partitioning_candidate_limit = astc::clamp(config.tune_3partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
	config.tune_4partitioning_candidate_limit = astc::clamp(config.tune_4partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
	config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f);
	config.tune_mse_overshoot = astc::max(config.tune_mse_overshoot, 1.0f);
	config.tune_2partition_early_out_limit_factor = astc::max(config.tune_2partition_early_out_limit_factor, 0.0f);
	config.tune_3partition_early_out_limit_factor = astc::max(config.tune_3partition_early_out_limit_factor, 0.0f);
	config.tune_2plane_early_out_limit_correlation = astc::max(config.tune_2plane_early_out_limit_correlation, 0.0f);

	// Specifying a zero weight color component is not allowed; force to small value
	float max_weight = astc::max(astc::max(config.cw_r_weight, config.cw_g_weight),
		astc::max(config.cw_b_weight, config.cw_a_weight));
	if (max_weight > 0.0f)
	{
		max_weight /= 1000.0f;
		config.cw_r_weight = astc::max(config.cw_r_weight, max_weight);
		config.cw_g_weight = astc::max(config.cw_g_weight, max_weight);
		config.cw_b_weight = astc::max(config.cw_b_weight, max_weight);
		config.cw_a_weight = astc::max(config.cw_a_weight, max_weight);
	}
	// If all color component error weights are zero then return an error
	else
	{
		return ASTCENC_ERR_BAD_PARAM;
	}

	return ASTCENC_SUCCESS;
}

/* See header for documentation. */
astcenc_error astcenc_config_init(
	astcenc_profile profile,
	unsigned int block_x,
	unsigned int block_y,
	unsigned int block_z,
	float quality,
	unsigned int flags,
	astcenc_config* configp
) {
	astcenc_error status;

	status = validate_cpu_float();
	if (status != ASTCENC_SUCCESS)
	{
		return status;
	}

	// Zero init all config fields; although most of them will be overwritten
	astcenc_config& config = *configp;
	std::memset(&config, 0, sizeof(config));

	// Process the block size
	block_z = astc::max(block_z, 1u); // For 2D blocks Z==0 is accepted, but convert to 1
	status = validate_block_size(block_x, block_y, block_z);
	if (status != ASTCENC_SUCCESS)
	{
		return status;
	}

	config.block_x = block_x;
	config.block_y = block_y;
	config.block_z = block_z;

	float texels = static_cast<float>(block_x * block_y * block_z);
	float ltexels = logf(texels) / logf(10.0f);

	// Process the performance quality level or preset; note that this must be done before we
	// process any additional settings, such as color profile and flags, which may replace some of
	// these settings with more use case tuned values
	if (quality < ASTCENC_PRE_FASTEST ||
		quality > ASTCENC_PRE_EXHAUSTIVE)
	{
		return ASTCENC_ERR_BAD_QUALITY;
	}

	static const std::array<astcenc_preset_config, 6>* preset_configs;
	int texels_int = block_x * block_y * block_z;
	if (texels_int < 25)
	{
		preset_configs = &preset_configs_high;
	}
	else if (texels_int < 64)
	{
		preset_configs = &preset_configs_mid;
	}
	else
	{
		preset_configs = &preset_configs_low;
	}

	// Determine which preset to use, or which pair to interpolate
	size_t start;
	size_t end;
	for (end = 0; end < preset_configs->size(); end++)
	{
		if ((*preset_configs)[end].quality >= quality)
		{
			break;
		}
	}

	start = end == 0 ? 0 : end - 1;

	// Start and end node are the same - so just transfer the values.
	if (start == end)
	{
		config.tune_partition_count_limit = (*preset_configs)[start].tune_partition_count_limit;
		config.tune_2partition_index_limit = (*preset_configs)[start].tune_2partition_index_limit;
		config.tune_3partition_index_limit = (*preset_configs)[start].tune_3partition_index_limit;
		config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit;
		config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit;
		config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit;
		config.tune_candidate_limit = (*preset_configs)[start].tune_candidate_limit;
		config.tune_2partitioning_candidate_limit = (*preset_configs)[start].tune_2partitioning_candidate_limit;
		config.tune_3partitioning_candidate_limit = (*preset_configs)[start].tune_3partitioning_candidate_limit;
		config.tune_4partitioning_candidate_limit = (*preset_configs)[start].tune_4partitioning_candidate_limit;
		config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels,
			(*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels);

		config.tune_mse_overshoot = (*preset_configs)[start].tune_mse_overshoot;

		config.tune_2partition_early_out_limit_factor = (*preset_configs)[start].tune_2partition_early_out_limit_factor;
		config.tune_3partition_early_out_limit_factor = (*preset_configs)[start].tune_3partition_early_out_limit_factor;
		config.tune_2plane_early_out_limit_correlation = (*preset_configs)[start].tune_2plane_early_out_limit_correlation;
		config.tune_search_mode0_enable = (*preset_configs)[start].tune_search_mode0_enable;
	}
	// Start and end node are not the same - so interpolate between them
	else
	{
		auto& node_a = (*preset_configs)[start];
		auto& node_b = (*preset_configs)[end];

		float wt_range = node_b.quality - node_a.quality;
		assert(wt_range > 0);

		// Compute interpolation factors
		float wt_node_a = (node_b.quality - quality) / wt_range;
		float wt_node_b = (quality - node_a.quality) / wt_range;

		#define LERP(param) ((node_a.param * wt_node_a) + (node_b.param * wt_node_b))
		#define LERPI(param) astc::flt2int_rtn(\
			(static_cast<float>(node_a.param) * wt_node_a) + \
			(static_cast<float>(node_b.param) * wt_node_b))
		#define LERPUI(param) static_cast<unsigned int>(LERPI(param))

		config.tune_partition_count_limit = LERPI(tune_partition_count_limit);
		config.tune_2partition_index_limit = LERPI(tune_2partition_index_limit);
		config.tune_3partition_index_limit = LERPI(tune_3partition_index_limit);
		config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit);
		config.tune_block_mode_limit = LERPI(tune_block_mode_limit);
		config.tune_refinement_limit = LERPI(tune_refinement_limit);
		config.tune_candidate_limit = LERPUI(tune_candidate_limit);
		config.tune_2partitioning_candidate_limit = LERPUI(tune_2partitioning_candidate_limit);
		config.tune_3partitioning_candidate_limit = LERPUI(tune_3partitioning_candidate_limit);
		config.tune_4partitioning_candidate_limit = LERPUI(tune_4partitioning_candidate_limit);
		config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels,
			LERP(tune_db_limit_b_base) - 19 * ltexels);

		config.tune_mse_overshoot = LERP(tune_mse_overshoot);

		config.tune_2partition_early_out_limit_factor = LERP(tune_2partition_early_out_limit_factor);
		config.tune_3partition_early_out_limit_factor = LERP(tune_3partition_early_out_limit_factor);
		config.tune_2plane_early_out_limit_correlation = LERP(tune_2plane_early_out_limit_correlation);
		config.tune_search_mode0_enable = LERP(tune_search_mode0_enable);
		#undef LERP
		#undef LERPI
		#undef LERPUI
	}

	// Set heuristics to the defaults for each color profile
	config.cw_r_weight = 1.0f;
	config.cw_g_weight = 1.0f;
	config.cw_b_weight = 1.0f;
	config.cw_a_weight = 1.0f;

	config.a_scale_radius = 0;

	config.rgbm_m_scale = 0.0f;

	config.profile = profile;

	// Values in this enum are from an external user, so not guaranteed to be
	// bounded to the enum values
	switch (static_cast<int>(profile))
	{
	case ASTCENC_PRF_LDR:
	case ASTCENC_PRF_LDR_SRGB:
		break;
	case ASTCENC_PRF_HDR_RGB_LDR_A:
	case ASTCENC_PRF_HDR:
		config.tune_db_limit = 999.0f;
		config.tune_search_mode0_enable = 0.0f;
		break;
	default:
		return ASTCENC_ERR_BAD_PROFILE;
	}

	// Flags field must not contain any unknown flag bits
	status = validate_flags(profile, flags);
	if (status != ASTCENC_SUCCESS)
	{
		return status;
	}

	if (flags & ASTCENC_FLG_MAP_NORMAL)
	{
		// Normal map encoding uses L+A blocks, so allow one more partitioning
		// than normal. We need fewer bits for endpoints, so we are more likely
		// to be able to use more partitions than an RGB/RGBA block
		config.tune_partition_count_limit = astc::min(config.tune_partition_count_limit + 1u, 4u);

		config.cw_g_weight = 0.0f;
		config.cw_b_weight = 0.0f;
		config.tune_2partition_early_out_limit_factor *= 1.5f;
		config.tune_3partition_early_out_limit_factor *= 1.5f;
		config.tune_2plane_early_out_limit_correlation = 0.99f;

		// Normals are prone to blocking artifacts on smooth curves
		// so force compressor to try harder here ...
		config.tune_db_limit *= 1.03f;
	}
	else if (flags & ASTCENC_FLG_MAP_RGBM)
	{
		config.rgbm_m_scale = 5.0f;
		config.cw_a_weight = 2.0f * config.rgbm_m_scale;
	}
	else // (This is color data)
	{
		// This is a very basic perceptual metric for RGB color data, which weights error
		// significance by the perceptual luminance contribution of each color channel. For
		// luminance the usual weights to compute luminance from a linear RGB value are as
		// follows:
		//
		//     l = r * 0.3 + g * 0.59 + b * 0.11
		//
		// ... but we scale these up to keep a better balance between color and alpha. Note
		// that if the content is using alpha we'd recommend using the -a option to weight
		// the color contribution by the alpha transparency.
		if (flags & ASTCENC_FLG_USE_PERCEPTUAL)
		{
			config.cw_r_weight = 0.30f * 2.25f;
			config.cw_g_weight = 0.59f * 2.25f;
			config.cw_b_weight = 0.11f * 2.25f;
		}
	}
	config.flags = flags;

	return ASTCENC_SUCCESS;
}

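// Worked sketch of the preset interpolation above: when the requested quality falls strictly
// between two preset rows (for example between the ASTCENC_PRE_MEDIUM and ASTCENC_PRE_THOROUGH
// rows), start and end select those two rows and every tuning parameter is blended as
//     value = node_a.param * wt_node_a + node_b.param * wt_node_b
// where the weights are proportional to how close the requested quality is to each row's quality.
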
/* See header for documentation. */
astcenc_error astcenc_context_alloc(
	const astcenc_config* configp,
	unsigned int thread_count,
	astcenc_context** context
) {
	astcenc_error status;
	const astcenc_config& config = *configp;

	status = validate_cpu_float();
	if (status != ASTCENC_SUCCESS)
	{
		return status;
	}

	if (thread_count == 0)
	{
		return ASTCENC_ERR_BAD_PARAM;
	}

#if defined(ASTCENC_DIAGNOSTICS)
	// Force single threaded compressor use in diagnostic mode.
	if (thread_count != 1)
	{
		return ASTCENC_ERR_BAD_PARAM;
	}
#endif

	astcenc_context* ctxo = new astcenc_context;
	astcenc_contexti* ctx = &ctxo->context;
	ctx->thread_count = thread_count;
	ctx->config = config;
	ctx->working_buffers = nullptr;

	// These are allocated per-compress, as they depend on image size
	ctx->input_alpha_averages = nullptr;

	// Copy the config first and validate the copy (we may modify it)
	status = validate_config(ctx->config);
	if (status != ASTCENC_SUCCESS)
	{
		delete ctxo;
		return status;
	}

	ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN);
	if (!ctx->bsd)
	{
		delete ctxo;
		return ASTCENC_ERR_OUT_OF_MEM;
	}

	bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY);
	init_block_size_descriptor(config.block_x, config.block_y, config.block_z,
		can_omit_modes,
		config.tune_partition_count_limit,
		static_cast<float>(config.tune_block_mode_limit) / 100.0f,
		*ctx->bsd);

#if !defined(ASTCENC_DECOMPRESS_ONLY)
	// Do setup only needed by compression
	if (!(ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
	{
		// Turn a dB limit into a per-texel error for faster use later
		if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB))
		{
			ctx->config.tune_db_limit = astc::pow(0.1f, ctx->config.tune_db_limit * 0.1f) * 65535.0f * 65535.0f;
		}
		else
		{
			ctx->config.tune_db_limit = 0.0f;
		}

		size_t worksize = sizeof(compression_working_buffers) * thread_count;
		ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN);
		static_assert((ASTCENC_VECALIGN == 0) || ((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0),
			"compression_working_buffers size must be multiple of vector alignment");
		if (!ctx->working_buffers)
		{
			aligned_free<block_size_descriptor>(ctx->bsd);
			delete ctxo;
			*context = nullptr;
			return ASTCENC_ERR_OUT_OF_MEM;
		}
	}
#endif

#if defined(ASTCENC_DIAGNOSTICS)
	ctx->trace_log = new TraceLog(ctx->config.trace_file_path);
	if (!ctx->trace_log->m_file)
	{
		return ASTCENC_ERR_DTRACE_FAILURE;
	}

	trace_add_data("block_x", config.block_x);
	trace_add_data("block_y", config.block_y);
	trace_add_data("block_z", config.block_z);
#endif

	*context = ctxo;

#if !defined(ASTCENC_DECOMPRESS_ONLY)
	prepare_angular_tables();
#endif

	return ASTCENC_SUCCESS;
}

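// Typical single-threaded lifecycle for the entry points in this file (illustrative sketch only;
// error handling trimmed, and `image`, `buffer` and `buffer_size` are assumed to be set up per
// the astcenc_image documentation in astcenc.h):
//
//     astcenc_config config;
//     astcenc_config_init(ASTCENC_PRF_LDR, 6, 6, 1, ASTCENC_PRE_MEDIUM, 0, &config);
//
//     astcenc_context* context = nullptr;
//     astcenc_context_alloc(&config, 1, &context);
//
//     astcenc_swizzle swizzle { ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A };
//     astcenc_compress_image(context, &image, &swizzle, buffer, buffer_size, 0);
//
//     astcenc_context_free(context);
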
/* See header for documentation. */
void astcenc_context_free(
	astcenc_context* ctxo
) {
	if (ctxo)
	{
		astcenc_contexti* ctx = &ctxo->context;
		aligned_free<compression_working_buffers>(ctx->working_buffers);
		aligned_free<block_size_descriptor>(ctx->bsd);
#if defined(ASTCENC_DIAGNOSTICS)
		delete ctx->trace_log;
#endif
		delete ctxo;
	}
}

#if !defined(ASTCENC_DECOMPRESS_ONLY)

/**
 * @brief Compress an image, after any preflight has completed.
 *
 * @param[out] ctxo          The compressor context.
 * @param      thread_index  The thread index.
 * @param      image         The input image.
 * @param      swizzle       The input swizzle.
 * @param[out] buffer        The output array for the compressed data.
 */
static void compress_image(
	astcenc_context& ctxo,
	unsigned int thread_index,
	const astcenc_image& image,
	const astcenc_swizzle& swizzle,
	uint8_t* buffer
) {
	astcenc_contexti& ctx = ctxo.context;
	const block_size_descriptor& bsd = *ctx.bsd;
	astcenc_profile decode_mode = ctx.config.profile;

	image_block blk;

	int block_x = bsd.xdim;
	int block_y = bsd.ydim;
	int block_z = bsd.zdim;
	blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);

	int dim_x = image.dim_x;
	int dim_y = image.dim_y;
	int dim_z = image.dim_z;

	int xblocks = (dim_x + block_x - 1) / block_x;
	int yblocks = (dim_y + block_y - 1) / block_y;
	int zblocks = (dim_z + block_z - 1) / block_z;
	int block_count = zblocks * yblocks * xblocks;

	int row_blocks = xblocks;
	int plane_blocks = xblocks * yblocks;

	blk.decode_unorm8 = ctxo.context.config.flags & ASTCENC_FLG_USE_DECODE_UNORM8;

	// Populate the block channel weights
	blk.channel_weight = vfloat4(ctx.config.cw_r_weight,
		ctx.config.cw_g_weight,
		ctx.config.cw_b_weight,
		ctx.config.cw_a_weight);

	// Use preallocated scratch buffer
	auto& temp_buffers = ctx.working_buffers[thread_index];

	// Only the first thread actually runs the initializer
	ctxo.manage_compress.init(block_count, ctx.config.progress_callback);

	// Determine if we can use an optimized load function
	bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) ||
		(swizzle.b != ASTCENC_SWZ_B) || (swizzle.a != ASTCENC_SWZ_A);

	bool needs_hdr = (decode_mode == ASTCENC_PRF_HDR) ||
		(decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A);

	bool use_fast_load = !needs_swz && !needs_hdr &&
		block_z == 1 && image.data_type == ASTCENC_TYPE_U8;

	auto load_func = load_image_block;
	if (use_fast_load)
	{
		load_func = load_image_block_fast_ldr;
	}

	// All threads run this processing loop until there is no work remaining
	while (true)
	{
		unsigned int count;
		unsigned int base = ctxo.manage_compress.get_task_assignment(16, count);
		if (!count)
		{
			break;
		}

		for (unsigned int i = base; i < base + count; i++)
		{
			// Decode i into x, y, z block indices
			int z = i / plane_blocks;
			unsigned int rem = i - (z * plane_blocks);
			int y = rem / row_blocks;
			int x = rem - (y * row_blocks);

			// Test if we can apply some basic alpha-scale RDO
			bool use_full_block = true;
			if (ctx.config.a_scale_radius != 0 && block_z == 1)
			{
				int start_x = x * block_x;
				int end_x = astc::min(dim_x, start_x + block_x);

				int start_y = y * block_y;
				int end_y = astc::min(dim_y, start_y + block_y);

				// SATs accumulate error, so don't test exactly zero. Test for
				// less than 1 alpha in the expanded block footprint that
				// includes the alpha radius.
				int x_footprint = block_x + 2 * (ctx.config.a_scale_radius - 1);

				int y_footprint = block_y + 2 * (ctx.config.a_scale_radius - 1);

				float footprint = static_cast<float>(x_footprint * y_footprint);
				float threshold = 0.9f / (255.0f * footprint);

				// Do we have any alpha values?
				use_full_block = false;
				for (int ay = start_y; ay < end_y; ay++)
				{
					for (int ax = start_x; ax < end_x; ax++)
					{
						float a_avg = ctx.input_alpha_averages[ay * dim_x + ax];
						if (a_avg > threshold)
						{
							use_full_block = true;
							ax = end_x;
							ay = end_y;
						}
					}
				}
			}

			// Fetch the full block for compression
			if (use_full_block)
			{
				load_func(decode_mode, image, blk, bsd, x * block_x, y * block_y, z * block_z, swizzle);

				// Scale RGB error contribution by the maximum alpha in the block
				// This encourages preserving alpha accuracy in regions with high
				// transparency, and can buy up to 0.5 dB PSNR.
				if (ctx.config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT)
				{
					float alpha_scale = blk.data_max.lane<3>() * (1.0f / 65535.0f);
					blk.channel_weight = vfloat4(ctx.config.cw_r_weight * alpha_scale,
						ctx.config.cw_g_weight * alpha_scale,
						ctx.config.cw_b_weight * alpha_scale,
						ctx.config.cw_a_weight);
				}
			}
			// Apply alpha scale RDO - substitute constant color block
			else
			{
				blk.origin_texel = vfloat4::zero();
				blk.data_min = vfloat4::zero();
				blk.data_mean = vfloat4::zero();
				blk.data_max = vfloat4::zero();
				blk.grayscale = true;
			}

			int offset = ((z * yblocks + y) * xblocks + x) * 16;
			uint8_t *bp = buffer + offset;
			compress_block(ctx, blk, bp, temp_buffers);
		}

		ctxo.manage_compress.complete_task_assignment(count);
	}
}

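// Note on the block addressing above (illustrative): blocks are processed in raster order, so for
// an image that is 3 blocks wide and 2 blocks high the linear index i = 4 decodes to z = 0,
// y = 1, x = 1, and its 16-byte payload is written at byte offset ((0 * 2 + 1) * 3 + 1) * 16 = 64.
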
/**
 * @brief Compute regional averages in an image.
 *
 * This function can be called by multiple threads, but only after a single
 * thread calls the setup function @c init_compute_averages().
 *
 * Results are written back into @c img->input_alpha_averages.
 *
 * @param[out] ctx The context.
 * @param      ag  The average and variance arguments created during setup.
 */
static void compute_averages(
	astcenc_context& ctx,
	const avg_args &ag
) {
	pixel_region_args arg = ag.arg;
	arg.work_memory = new vfloat4[ag.work_memory_size];

	int size_x = ag.img_size_x;
	int size_y = ag.img_size_y;
	int size_z = ag.img_size_z;

	int step_xy = ag.blk_size_xy;
	int step_z = ag.blk_size_z;

	int y_tasks = (size_y + step_xy - 1) / step_xy;

	// All threads run this processing loop until there is no work remaining
	while (true)
	{
		unsigned int count;
		unsigned int base = ctx.manage_avg.get_task_assignment(16, count);
		if (!count)
		{
			break;
		}

		for (unsigned int i = base; i < base + count; i++)
		{
			int z = (i / (y_tasks)) * step_z;
			int y = (i - (z * y_tasks)) * step_xy;

			arg.size_z = astc::min(step_z, size_z - z);
			arg.offset_z = z;

			arg.size_y = astc::min(step_xy, size_y - y);
			arg.offset_y = y;

			for (int x = 0; x < size_x; x += step_xy)
			{
				arg.size_x = astc::min(step_xy, size_x - x);
				arg.offset_x = x;
				compute_pixel_region_variance(ctx.context, arg);
			}
		}

		ctx.manage_avg.complete_task_assignment(count);
	}

	delete[] arg.work_memory;
}

#endif

/* See header for documentation. */
astcenc_error astcenc_compress_image(
	astcenc_context* ctxo,
	astcenc_image* imagep,
	const astcenc_swizzle* swizzle,
	uint8_t* data_out,
	size_t data_len,
	unsigned int thread_index
) {
#if defined(ASTCENC_DECOMPRESS_ONLY)
	(void)ctxo;
	(void)imagep;
	(void)swizzle;
	(void)data_out;
	(void)data_len;
	(void)thread_index;
	return ASTCENC_ERR_BAD_CONTEXT;
#else
	astcenc_contexti* ctx = &ctxo->context;
	astcenc_error status;
	astcenc_image& image = *imagep;

	if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
	{
		return ASTCENC_ERR_BAD_CONTEXT;
	}

	status = validate_compression_swizzle(*swizzle);
	if (status != ASTCENC_SUCCESS)
	{
		return status;
	}

	if (thread_index >= ctx->thread_count)
	{
		return ASTCENC_ERR_BAD_PARAM;
	}

	unsigned int block_x = ctx->config.block_x;
	unsigned int block_y = ctx->config.block_y;
	unsigned int block_z = ctx->config.block_z;

	unsigned int xblocks = (image.dim_x + block_x - 1) / block_x;
	unsigned int yblocks = (image.dim_y + block_y - 1) / block_y;
	unsigned int zblocks = (image.dim_z + block_z - 1) / block_z;

	// Check we have enough output space (16 bytes per block)
	size_t size_needed = xblocks * yblocks * zblocks * 16;
	if (data_len < size_needed)
	{
		return ASTCENC_ERR_OUT_OF_MEM;
	}

	// If context thread count is one then implicitly reset
	if (ctx->thread_count == 1)
	{
		astcenc_compress_reset(ctxo);
	}

	if (ctx->config.a_scale_radius != 0)
	{
		// First thread to enter will do setup, other threads will subsequently
		// enter the critical section but simply skip over the initialization
		auto init_avg = [ctx, &image, swizzle]() {
			// Perform memory allocations for the destination buffers
			size_t texel_count = image.dim_x * image.dim_y * image.dim_z;
			ctx->input_alpha_averages = new float[texel_count];

			return init_compute_averages(
				image, ctx->config.a_scale_radius, *swizzle,
				ctx->avg_preprocess_args);
		};

		// Only the first thread actually runs the initializer
		ctxo->manage_avg.init(init_avg);

		// All threads will enter this function and dynamically grab work
		compute_averages(*ctxo, ctx->avg_preprocess_args);
	}

	// Wait for compute_averages to complete before compressing
	ctxo->manage_avg.wait();

	compress_image(*ctxo, thread_index, image, *swizzle, data_out);

	// Wait for compress to complete before freeing memory
	ctxo->manage_compress.wait();

	auto term_compress = [ctx]() {
		delete[] ctx->input_alpha_averages;
		ctx->input_alpha_averages = nullptr;
	};

	// Only the first thread to arrive actually runs the term
	ctxo->manage_compress.term(term_compress);

	return ASTCENC_SUCCESS;
#endif
}

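// Multi-threaded usage note (illustrative sketch): a context created with thread_count == N expects
// every one of the N caller-provided threads to call astcenc_compress_image() with the same
// arguments and a unique thread_index in [0, N); the manage_avg / manage_compress trackers above
// hand out block ranges dynamically, so the call only returns once the whole image is compressed.
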
/* See header for documentation. */
astcenc_error astcenc_compress_reset(
	astcenc_context* ctxo
) {
#if defined(ASTCENC_DECOMPRESS_ONLY)
	(void)ctxo;
	return ASTCENC_ERR_BAD_CONTEXT;
#else
	astcenc_contexti* ctx = &ctxo->context;
	if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
	{
		return ASTCENC_ERR_BAD_CONTEXT;
	}

	ctxo->manage_avg.reset();
	ctxo->manage_compress.reset();
	return ASTCENC_SUCCESS;
#endif
}

/* See header for documentation. */
astcenc_error astcenc_compress_cancel(
	astcenc_context* ctxo
) {
#if defined(ASTCENC_DECOMPRESS_ONLY)
	(void)ctxo;
	return ASTCENC_ERR_BAD_CONTEXT;
#else
	astcenc_contexti* ctx = &ctxo->context;
	if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
	{
		return ASTCENC_ERR_BAD_CONTEXT;
	}

	// Cancel compression before cancelling avg. This avoids the race condition
	// where cancelling them in the other order could see a compression worker
	// starting to process even though some of the avg data is undefined.
	ctxo->manage_compress.cancel();
	ctxo->manage_avg.cancel();
	return ASTCENC_SUCCESS;
#endif
}

/* See header for documentation. */
astcenc_error astcenc_decompress_image(
	astcenc_context* ctxo,
	const uint8_t* data,
	size_t data_len,
	astcenc_image* image_outp,
	const astcenc_swizzle* swizzle,
	unsigned int thread_index
) {
	astcenc_error status;
	astcenc_image& image_out = *image_outp;
	astcenc_contexti* ctx = &ctxo->context;

	// Today this doesn't matter (working set on stack) but might in future ...
	if (thread_index >= ctx->thread_count)
	{
		return ASTCENC_ERR_BAD_PARAM;
	}

	status = validate_decompression_swizzle(*swizzle);
	if (status != ASTCENC_SUCCESS)
	{
		return status;
	}

	unsigned int block_x = ctx->config.block_x;
	unsigned int block_y = ctx->config.block_y;
	unsigned int block_z = ctx->config.block_z;

	unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x;
	unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y;
	unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z;
	unsigned int block_count = zblocks * yblocks * xblocks;

	int row_blocks = xblocks;
	int plane_blocks = xblocks * yblocks;

	// Check we have enough input data (16 bytes per block)
	size_t size_needed = xblocks * yblocks * zblocks * 16;
	if (data_len < size_needed)
	{
		return ASTCENC_ERR_OUT_OF_MEM;
	}

	image_block blk {};
	blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);

	// Decode mode inferred from the output data type
	blk.decode_unorm8 = image_out.data_type == ASTCENC_TYPE_U8;

	// If context thread count is one then implicitly reset
	if (ctx->thread_count == 1)
	{
		astcenc_decompress_reset(ctxo);
	}

	// Only the first thread actually runs the initializer
	ctxo->manage_decompress.init(block_count, nullptr);

	// All threads run this processing loop until there is no work remaining
	while (true)
	{
		unsigned int count;
		unsigned int base = ctxo->manage_decompress.get_task_assignment(128, count);
		if (!count)
		{
			break;
		}

		for (unsigned int i = base; i < base + count; i++)
		{
			// Decode i into x, y, z block indices
			int z = i / plane_blocks;
			unsigned int rem = i - (z * plane_blocks);
			int y = rem / row_blocks;
			int x = rem - (y * row_blocks);

			unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16;
			const uint8_t* bp = data + offset;

			symbolic_compressed_block scb;

			physical_to_symbolic(*ctx->bsd, bp, scb);

			decompress_symbolic_block(ctx->config.profile, *ctx->bsd,
				x * block_x, y * block_y, z * block_z,
				scb, blk);

			store_image_block(image_out, blk, *ctx->bsd,
				x * block_x, y * block_y, z * block_z, *swizzle);
		}

		ctxo->manage_decompress.complete_task_assignment(count);
	}

	return ASTCENC_SUCCESS;
}

/* See header for documentation. */
astcenc_error astcenc_decompress_reset(
	astcenc_context* ctxo
) {
	ctxo->manage_decompress.reset();
	return ASTCENC_SUCCESS;
}

/* See header for documentation. */
astcenc_error astcenc_get_block_info(
	astcenc_context* ctxo,
	const uint8_t data[16],
	astcenc_block_info* info
) {
#if defined(ASTCENC_DECOMPRESS_ONLY)
	(void)ctxo;
	(void)data;
	(void)info;
	return ASTCENC_ERR_BAD_CONTEXT;
#else
	astcenc_contexti* ctx = &ctxo->context;

	// Decode the compressed data into a symbolic form
	symbolic_compressed_block scb;
	physical_to_symbolic(*ctx->bsd, data, scb);

	// Fetch the appropriate partition and decimation tables
	block_size_descriptor& bsd = *ctx->bsd;

	// Start from a clean slate
	memset(info, 0, sizeof(*info));

	// Basic info we can always populate
	info->profile = ctx->config.profile;

	info->block_x = ctx->config.block_x;
	info->block_y = ctx->config.block_y;
	info->block_z = ctx->config.block_z;
	info->texel_count = bsd.texel_count;

	// Check for error blocks first
	info->is_error_block = scb.block_type == SYM_BTYPE_ERROR;
	if (info->is_error_block)
	{
		return ASTCENC_SUCCESS;
	}

	// Check for constant color blocks second
	info->is_constant_block = scb.block_type == SYM_BTYPE_CONST_F16 ||
		scb.block_type == SYM_BTYPE_CONST_U16;
	if (info->is_constant_block)
	{
		return ASTCENC_SUCCESS;
	}

	// Otherwise handle a full block; known to be valid after conditions above have been checked
	int partition_count = scb.partition_count;
	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);

	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);

	info->weight_x = di.weight_x;
	info->weight_y = di.weight_y;
	info->weight_z = di.weight_z;

	info->is_dual_plane_block = bm.is_dual_plane != 0;

	info->partition_count = scb.partition_count;
	info->partition_index = scb.partition_index;
	info->dual_plane_component = scb.plane2_component;

	info->color_level_count = get_quant_level(scb.get_color_quant_mode());
	info->weight_level_count = get_quant_level(bm.get_weight_quant_mode());

	// Unpack color endpoints for each active partition
	for (unsigned int i = 0; i < scb.partition_count; i++)
	{
		bool rgb_hdr;
		bool a_hdr;
		vint4 endpnt[2];

		unpack_color_endpoints(ctx->config.profile,
			scb.color_formats[i],
			scb.color_values[i],
			rgb_hdr, a_hdr,
			endpnt[0], endpnt[1]);

		// Store the color endpoint mode info
		info->color_endpoint_modes[i] = scb.color_formats[i];
		info->is_hdr_block = info->is_hdr_block || rgb_hdr || a_hdr;

		// Store the unpacked and decoded color endpoint
		vmask4 hdr_mask(rgb_hdr, rgb_hdr, rgb_hdr, a_hdr);
		for (int j = 0; j < 2; j++)
		{
			vint4 color_lns = lns_to_sf16(endpnt[j]);
			vint4 color_unorm = unorm16_to_sf16(endpnt[j]);
			vint4 datai = select(color_unorm, color_lns, hdr_mask);
			store(float16_to_float(datai), info->color_endpoints[i][j]);
		}
	}

	// Unpack weights for each texel
	int weight_plane1[BLOCK_MAX_TEXELS];
	int weight_plane2[BLOCK_MAX_TEXELS];

	unpack_weights(bsd, scb, di, bm.is_dual_plane, weight_plane1, weight_plane2);
	for (unsigned int i = 0; i < bsd.texel_count; i++)
	{
		info->weight_values_plane1[i] = static_cast<float>(weight_plane1[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
		if (info->is_dual_plane_block)
		{
			info->weight_values_plane2[i] = static_cast<float>(weight_plane2[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
		}
	}

	// Unpack partition assignments for each texel
	for (unsigned int i = 0; i < bsd.texel_count; i++)
	{
		info->partition_assignment[i] = pi.partition_of_texel[i];
	}

	return ASTCENC_SUCCESS;
#endif
}

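// Example use of astcenc_get_block_info() (illustrative sketch, assuming `context` and `buffer`
// from the lifecycle sketch earlier in this file, with matching block size configuration):
//
//     astcenc_block_info info;
//     if (astcenc_get_block_info(context, buffer, &info) == ASTCENC_SUCCESS && !info.is_error_block)
//     {
//         // info.partition_count, info.weight_x/y/z, etc. describe this block's encoding
//     }
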
/* See header for documentation. */
const char* astcenc_get_error_string(
	astcenc_error status
) {
	// Values in this enum are from an external user, so not guaranteed to be
	// bounded to the enum values
	switch (static_cast<int>(status))
	{
	case ASTCENC_SUCCESS:
		return "ASTCENC_SUCCESS";
	case ASTCENC_ERR_OUT_OF_MEM:
		return "ASTCENC_ERR_OUT_OF_MEM";
	case ASTCENC_ERR_BAD_CPU_FLOAT:
		return "ASTCENC_ERR_BAD_CPU_FLOAT";
	case ASTCENC_ERR_BAD_PARAM:
		return "ASTCENC_ERR_BAD_PARAM";
	case ASTCENC_ERR_BAD_BLOCK_SIZE:
		return "ASTCENC_ERR_BAD_BLOCK_SIZE";
	case ASTCENC_ERR_BAD_PROFILE:
		return "ASTCENC_ERR_BAD_PROFILE";
	case ASTCENC_ERR_BAD_QUALITY:
		return "ASTCENC_ERR_BAD_QUALITY";
	case ASTCENC_ERR_BAD_FLAGS:
		return "ASTCENC_ERR_BAD_FLAGS";
	case ASTCENC_ERR_BAD_SWIZZLE:
		return "ASTCENC_ERR_BAD_SWIZZLE";
	case ASTCENC_ERR_BAD_CONTEXT:
		return "ASTCENC_ERR_BAD_CONTEXT";
	case ASTCENC_ERR_NOT_IMPLEMENTED:
		return "ASTCENC_ERR_NOT_IMPLEMENTED";
	case ASTCENC_ERR_BAD_DECODE_MODE:
		return "ASTCENC_ERR_BAD_DECODE_MODE";
#if defined(ASTCENC_DIAGNOSTICS)
	case ASTCENC_ERR_DTRACE_FAILURE:
		return "ASTCENC_ERR_DTRACE_FAILURE";
#endif
	default:
		return nullptr;
	}
}