CoCalc -- astcenc_weight

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/astcenc/astcenc_weight_align.cpp
⁹⁹⁰⁵ views
1
// SPDX-License-Identifier: Apache-2.0
2
// ----------------------------------------------------------------------------
3
// Copyright 2011-2024 Arm Limited
4
//
5
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6
// use this file except in compliance with the License. You may obtain a copy
7
// of the License at:
8
//
9
//     http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14
// License for the specific language governing permissions and limitations
15
// under the License.
16
// ----------------------------------------------------------------------------
17

18
#if !defined(ASTCENC_DECOMPRESS_ONLY)
19

20
/**
21
 * @brief Functions for angular-sum algorithm for weight alignment.
22
 *
23
 * This algorithm works as follows:
24
 * - we compute a complex number P as (cos s*i, sin s*i) for each weight,
25
 *   where i is the input value and s is a scaling factor based on the spacing between the weights.
26
 * - we then add together complex numbers for all the weights.
27
 * - we then compute the length and angle of the resulting sum.
28
 *
29
 * This should produce the following results:
30
 * - perfect alignment results in a vector whose length is equal to the sum of lengths of all inputs
31
 * - even distribution results in a vector of length 0.
32
 * - all samples identical results in perfect alignment for every scaling.
33
 *
34
 * For each scaling factor within a given set, we compute an alignment factor from 0 to 1. This
35
 * should then result in some scalings standing out as having particularly good alignment factors;
36
 * we can use this to produce a set of candidate scale/shift values for various quantization levels;
37
 * we should then actually try them and see what happens.
38
 */
39

40
#include "astcenc_internal.h"
41
#include "astcenc_vecmathlib.h"
42

43
#include <stdio.h>
44
#include <cassert>
45
#include <cstring>
46
#include <cfloat>
47

48
static constexpr unsigned int ANGULAR_STEPS { 32 };
49

50
static_assert((ANGULAR_STEPS % ASTCENC_SIMD_WIDTH) == 0,
51
              "ANGULAR_STEPS must be multiple of ASTCENC_SIMD_WIDTH");
52

53
static_assert(ANGULAR_STEPS >= 32,
54
              "ANGULAR_STEPS must be at least max(steps_for_quant_level)");
55

56
// Store a reduced sin/cos table for 64 possible weight values; this causes
57
// slight quality loss compared to using sin() and cos() directly. Must be 2^N.
58
static constexpr unsigned int SINCOS_STEPS { 64 };
59

60
static const uint8_t steps_for_quant_level[12] {
61
	2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32
62
};
63

64
ASTCENC_ALIGNAS static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
65
ASTCENC_ALIGNAS static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
66

67
#if defined(ASTCENC_DIAGNOSTICS)
68
	static bool print_once { true };
69
#endif
70

71
/* See header for documentation. */
72
void prepare_angular_tables()
73
{
74
	for (unsigned int i = 0; i < ANGULAR_STEPS; i++)
75
	{
76
		float angle_step = static_cast<float>(i + 1);
77

78
		for (unsigned int j = 0; j < SINCOS_STEPS; j++)
79
		{
80
			sin_table[j][i] = static_cast<float>(sinf((2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * angle_step * static_cast<float>(j)));
81
			cos_table[j][i] = static_cast<float>(cosf((2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * angle_step * static_cast<float>(j)));
82
		}
83
	}
84
}
85

86
/**
87
 * @brief Compute the angular alignment factors and offsets.
88
 *
89
 * @param      weight_count              The number of (decimated) weights.
90
 * @param      dec_weight_ideal_value    The ideal decimated unquantized weight values.
91
 * @param      max_angular_steps         The maximum number of steps to be tested.
92
 * @param[out] offsets                   The output angular offsets array.
93
 */
94
static void compute_angular_offsets(
95
	unsigned int weight_count,
96
	const float* dec_weight_ideal_value,
97
	unsigned int max_angular_steps,
98
	float* offsets
99
) {
100
	promise(weight_count > 0);
101
	promise(max_angular_steps > 0);
102

103
	ASTCENC_ALIGNAS int isamplev[BLOCK_MAX_WEIGHTS];
104

105
	// Precompute isample; arrays are always allocated 64 elements long
106
	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
107
	{
108
		// Ideal weight can be outside [0, 1] range, so clamp to fit table
109
		vfloat ideal_weight = clampzo(loada(dec_weight_ideal_value + i));
110

111
		// Convert a weight to a sincos table index
112
		vfloat sample = ideal_weight * (SINCOS_STEPS - 1.0f);
113
		vint isample = float_to_int_rtn(sample);
114
		storea(isample, isamplev + i);
115
	}
116

117
	// Arrays are multiple of SIMD width (ANGULAR_STEPS), safe to overshoot max
118
	vfloat mult(1.0f / (2.0f * astc::PI));
119

120
	for (unsigned int i = 0; i < max_angular_steps; i += ASTCENC_SIMD_WIDTH)
121
	{
122
		vfloat anglesum_x = vfloat::zero();
123
		vfloat anglesum_y = vfloat::zero();
124

125
		for (unsigned int j = 0; j < weight_count; j++)
126
		{
127
			int isample = isamplev[j];
128
			anglesum_x += loada(cos_table[isample] + i);
129
			anglesum_y += loada(sin_table[isample] + i);
130
		}
131

132
		vfloat angle = atan2(anglesum_y, anglesum_x);
133
		vfloat ofs = angle * mult;
134
		storea(ofs, offsets + i);
135
	}
136
}
137

138
/**
139
 * @brief For a given step size compute the lowest and highest weight.
140
 *
141
 * Compute the lowest and highest weight that results from quantizing using the given stepsize and
142
 * offset, and then compute the resulting error. The cut errors indicate the error that results from
143
 * forcing samples that should have had one weight value one step up or down.
144
 *
145
 * @param      weight_count              The number of (decimated) weights.
146
 * @param      dec_weight_ideal_value    The ideal decimated unquantized weight values.
147
 * @param      max_angular_steps         The maximum number of steps to be tested.
148
 * @param      max_quant_steps           The maximum quantization level to be tested.
149
 * @param      offsets                   The angular offsets array.
150
 * @param[out] lowest_weight             Per angular step, the lowest weight.
151
 * @param[out] weight_span               Per angular step, the span between lowest and highest weight.
152
 * @param[out] error                     Per angular step, the error.
153
 * @param[out] cut_low_weight_error      Per angular step, the low weight cut error.
154
 * @param[out] cut_high_weight_error     Per angular step, the high weight cut error.
155
 */
156
static void compute_lowest_and_highest_weight(
157
	unsigned int weight_count,
158
	const float* dec_weight_ideal_value,
159
	unsigned int max_angular_steps,
160
	unsigned int max_quant_steps,
161
	const float* offsets,
162
	float* lowest_weight,
163
	int* weight_span,
164
	float* error,
165
	float* cut_low_weight_error,
166
	float* cut_high_weight_error
167
) {
168
	promise(weight_count > 0);
169
	promise(max_angular_steps > 0);
170

171
	vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);
172

173
	// Compute minimum/maximum weights in the weight array. Our remapping
174
	// is monotonic, so the min/max rounded weights relate to the min/max
175
	// unrounded weights in a straightforward way.
176
	vfloat min_weight(FLT_MAX);
177
	vfloat max_weight(-FLT_MAX);
178

179
	vint lane_id = vint::lane_id();
180
	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
181
	{
182
		vmask active = lane_id < vint(weight_count);
183
		lane_id += vint(ASTCENC_SIMD_WIDTH);
184

185
		vfloat weights = loada(dec_weight_ideal_value + i);
186
		min_weight = min(min_weight, select(min_weight, weights, active));
187
		max_weight = max(max_weight, select(max_weight, weights, active));
188
	}
189

190
	min_weight = hmin(min_weight);
191
	max_weight = hmax(max_weight);
192

193
	// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
194
	for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
195
	{
196
		vfloat errval = vfloat::zero();
197
		vfloat cut_low_weight_err = vfloat::zero();
198
		vfloat cut_high_weight_err = vfloat::zero();
199
		vfloat offset = loada(offsets + sp);
200

201
		// We know the min and max weight values, so we can figure out
202
		// the corresponding indices before we enter the loop.
203
		vfloat minidx = round(min_weight * rcp_stepsize - offset);
204
		vfloat maxidx = round(max_weight * rcp_stepsize - offset);
205

206
		for (unsigned int j = 0; j < weight_count; j++)
207
		{
208
			vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset;
209
			vfloat svalrte = round(sval);
210
			vfloat diff = sval - svalrte;
211
			errval += diff * diff;
212

213
			// Accumulate errors for minimum index
214
			vmask mask = svalrte == minidx;
215
			vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff;
216
			cut_low_weight_err = select(cut_low_weight_err, accum, mask);
217

218
			// Accumulate errors for maximum index
219
			mask = svalrte == maxidx;
220
			accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff;
221
			cut_high_weight_err = select(cut_high_weight_err, accum, mask);
222
		}
223

224
		// Write out min weight and weight span; clamp span to a usable range
225
		vint span = float_to_int(maxidx - minidx + vfloat(1));
226
		span = min(span, vint(max_quant_steps + 3));
227
		span = max(span, vint(2));
228
		storea(minidx, lowest_weight + sp);
229
		storea(span, weight_span + sp);
230

231
		// The cut_(lowest/highest)_weight_error indicate the error that results from  forcing
232
		// samples that should have had the weight value one step (up/down).
233
		vfloat ssize = 1.0f / rcp_stepsize;
234
		vfloat errscale = ssize * ssize;
235
		storea(errval * errscale, error + sp);
236
		storea(cut_low_weight_err * errscale, cut_low_weight_error + sp);
237
		storea(cut_high_weight_err * errscale, cut_high_weight_error + sp);
238

239
		rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH);
240
	}
241
}
242

243
/**
244
 * @brief The main function for the angular algorithm.
245
 *
246
 * @param      weight_count              The number of (decimated) weights.
247
 * @param      dec_weight_ideal_value    The ideal decimated unquantized weight values.
248
 * @param      max_quant_level           The maximum quantization level to be tested.
249
 * @param[out] low_value                 Per angular step, the lowest weight value.
250
 * @param[out] high_value                Per angular step, the highest weight value.
251
 */
252
static void compute_angular_endpoints_for_quant_levels(
253
	unsigned int weight_count,
254
	const float* dec_weight_ideal_value,
255
	unsigned int max_quant_level,
256
	float low_value[TUNE_MAX_ANGULAR_QUANT + 1],
257
	float high_value[TUNE_MAX_ANGULAR_QUANT + 1]
258
) {
259
	unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
260
	unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];
261

262
	ASTCENC_ALIGNAS float angular_offsets[ANGULAR_STEPS];
263

264
	compute_angular_offsets(weight_count, dec_weight_ideal_value,
265
	                        max_angular_steps, angular_offsets);
266

267
	ASTCENC_ALIGNAS float lowest_weight[ANGULAR_STEPS];
268
	ASTCENC_ALIGNAS int32_t weight_span[ANGULAR_STEPS];
269
	ASTCENC_ALIGNAS float error[ANGULAR_STEPS];
270
	ASTCENC_ALIGNAS float cut_low_weight_error[ANGULAR_STEPS];
271
	ASTCENC_ALIGNAS float cut_high_weight_error[ANGULAR_STEPS];
272

273
	compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value,
274
	                                  max_angular_steps, max_quant_steps,
275
	                                  angular_offsets, lowest_weight, weight_span, error,
276
	                                  cut_low_weight_error, cut_high_weight_error);
277

278
	// For each quantization level, find the best error terms. Use packed vectors so data-dependent
279
	// branches can become selects. This involves some integer to float casts, but the values are
280
	// small enough so they never round the wrong way.
281
	vfloat4 best_results[36];
282

283
	// Initialize the array to some safe defaults
284
	promise(max_quant_steps > 0);
285
	for (unsigned int i = 0; i < (max_quant_steps + 4); i++)
286
	{
287
		// Lane<0> = Best error
288
		// Lane<1> = Best scale; -1 indicates no solution found
289
		// Lane<2> = Cut low weight
290
		best_results[i] = vfloat4(ERROR_CALC_DEFAULT, -1.0f, 0.0f, 0.0f);
291
	}
292

293
	promise(max_angular_steps > 0);
294
	for (unsigned int i = 0; i < max_angular_steps; i++)
295
	{
296
		float i_flt = static_cast<float>(i);
297

298
		int idx_span = weight_span[i];
299

300
		float error_cut_low = error[i] + cut_low_weight_error[i];
301
		float error_cut_high = error[i] + cut_high_weight_error[i];
302
		float error_cut_low_high = error[i] + cut_low_weight_error[i] + cut_high_weight_error[i];
303

304
		// Check best error against record N
305
		vfloat4 best_result = best_results[idx_span];
306
		vfloat4 new_result = vfloat4(error[i], i_flt, 0.0f, 0.0f);
307
		vmask4 mask = vfloat4(best_result.lane<0>()) > vfloat4(error[i]);
308
		best_results[idx_span] = select(best_result, new_result, mask);
309

310
		// Check best error against record N-1 with either cut low or cut high
311
		best_result = best_results[idx_span - 1];
312

313
		new_result = vfloat4(error_cut_low, i_flt, 1.0f, 0.0f);
314
		mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low);
315
		best_result = select(best_result, new_result, mask);
316

317
		new_result = vfloat4(error_cut_high, i_flt, 0.0f, 0.0f);
318
		mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_high);
319
		best_results[idx_span - 1] = select(best_result, new_result, mask);
320

321
		// Check best error against record N-2 with both cut low and high
322
		best_result = best_results[idx_span - 2];
323
		new_result = vfloat4(error_cut_low_high, i_flt, 1.0f, 0.0f);
324
		mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low_high);
325
		best_results[idx_span - 2] = select(best_result, new_result, mask);
326
	}
327

328
	for (unsigned int i = 0; i <= max_quant_level; i++)
329
	{
330
		unsigned int q = steps_for_quant_level[i];
331
		int bsi = static_cast<int>(best_results[q].lane<1>());
332

333
		// Did we find anything?
334
#if defined(ASTCENC_DIAGNOSTICS)
335
		if ((bsi < 0) && print_once)
336
		{
337
			print_once = false;
338
			printf("INFO: Unable to find full encoding within search error limit.\n\n");
339
		}
340
#endif
341

342
		bsi = astc::max(0, bsi);
343

344
		float lwi = lowest_weight[bsi] + best_results[q].lane<2>();
345
		float hwi = lwi + static_cast<float>(q) - 1.0f;
346

347
		float stepsize = 1.0f / (1.0f + static_cast<float>(bsi));
348
		low_value[i]  = (angular_offsets[bsi] + lwi) * stepsize;
349
		high_value[i] = (angular_offsets[bsi] + hwi) * stepsize;
350
	}
351
}
352

353
/* See header for documentation. */
354
void compute_angular_endpoints_1plane(
355
	bool only_always,
356
	const block_size_descriptor& bsd,
357
	const float* dec_weight_ideal_value,
358
	unsigned int max_weight_quant,
359
	compression_working_buffers& tmpbuf
360
) {
361
	float (&low_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
362
	float (&high_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
363

364
	float (&low_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
365
	float (&high_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
366

367
	unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
368
	                                                : bsd.decimation_mode_count_selected;
369
	promise(max_decimation_modes > 0);
370
	for (unsigned int i = 0; i < max_decimation_modes; i++)
371
	{
372
		const decimation_mode& dm = bsd.decimation_modes[i];
373
		if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
374
		{
375
			continue;
376
		}
377

378
		unsigned int weight_count = bsd.get_decimation_info(i).weight_count;
379

380
		unsigned int max_precision = dm.maxprec_1plane;
381
		if (max_precision > TUNE_MAX_ANGULAR_QUANT)
382
		{
383
			max_precision = TUNE_MAX_ANGULAR_QUANT;
384
		}
385

386
		if (max_precision > max_weight_quant)
387
		{
388
			max_precision = max_weight_quant;
389
		}
390

391
		compute_angular_endpoints_for_quant_levels(
392
		    weight_count,
393
		    dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
394
		    max_precision, low_values[i], high_values[i]);
395
	}
396

397
	unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
398
	                                           : bsd.block_mode_count_1plane_selected;
399
	promise(max_block_modes > 0);
400
	for (unsigned int i = 0; i < max_block_modes; i++)
401
	{
402
		const block_mode& bm = bsd.block_modes[i];
403
		assert(!bm.is_dual_plane);
404

405
		unsigned int quant_mode = bm.quant_mode;
406
		unsigned int decim_mode = bm.decimation_mode;
407

408
		if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
409
		{
410
			low_value[i] = low_values[decim_mode][quant_mode];
411
			high_value[i] = high_values[decim_mode][quant_mode];
412
		}
413
		else
414
		{
415
			low_value[i] = 0.0f;
416
			high_value[i] = 1.0f;
417
		}
418
	}
419
}
420

421
/* See header for documentation. */
422
void compute_angular_endpoints_2planes(
423
	const block_size_descriptor& bsd,
424
	const float* dec_weight_ideal_value,
425
	unsigned int max_weight_quant,
426
	compression_working_buffers& tmpbuf
427
) {
428
	float (&low_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
429
	float (&high_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
430
	float (&low_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value2;
431
	float (&high_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value2;
432

433
	float (&low_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
434
	float (&high_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
435
	float (&low_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values2;
436
	float (&high_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values2;
437

438
	promise(bsd.decimation_mode_count_selected > 0);
439
	for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
440
	{
441
		const decimation_mode& dm = bsd.decimation_modes[i];
442
		if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
443
		{
444
			continue;
445
		}
446

447
		unsigned int weight_count = bsd.get_decimation_info(i).weight_count;
448

449
		unsigned int max_precision = dm.maxprec_2planes;
450
		if (max_precision > TUNE_MAX_ANGULAR_QUANT)
451
		{
452
			max_precision = TUNE_MAX_ANGULAR_QUANT;
453
		}
454

455
		if (max_precision > max_weight_quant)
456
		{
457
			max_precision = max_weight_quant;
458
		}
459

460
		compute_angular_endpoints_for_quant_levels(
461
		    weight_count,
462
		    dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
463
		    max_precision, low_values1[i], high_values1[i]);
464

465
		compute_angular_endpoints_for_quant_levels(
466
		    weight_count,
467
		    dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
468
		    max_precision, low_values2[i], high_values2[i]);
469
	}
470

471
	unsigned int start = bsd.block_mode_count_1plane_selected;
472
	unsigned int end = bsd.block_mode_count_1plane_2plane_selected;
473
	for (unsigned int i = start; i < end; i++)
474
	{
475
		const block_mode& bm = bsd.block_modes[i];
476
		unsigned int quant_mode = bm.quant_mode;
477
		unsigned int decim_mode = bm.decimation_mode;
478

479
		if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
480
		{
481
			low_value1[i] = low_values1[decim_mode][quant_mode];
482
			high_value1[i] = high_values1[decim_mode][quant_mode];
483
			low_value2[i] = low_values2[decim_mode][quant_mode];
484
			high_value2[i] = high_values2[decim_mode][quant_mode];
485
		}
486
		else
487
		{
488
			low_value1[i] = 0.0f;
489
			high_value1[i] = 1.0f;
490
			low_value2[i] = 0.0f;
491
			high_value2[i] = 1.0f;
492
		}
493
	}
494
}
495

496
#endif
497

498
Product

Resources

Company