Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/astcenc/astcenc_weight_align.cpp
9905 views
1
// SPDX-License-Identifier: Apache-2.0
2
// ----------------------------------------------------------------------------
3
// Copyright 2011-2024 Arm Limited
4
//
5
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6
// use this file except in compliance with the License. You may obtain a copy
7
// of the License at:
8
//
9
// http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14
// License for the specific language governing permissions and limitations
15
// under the License.
16
// ----------------------------------------------------------------------------
17
18
#if !defined(ASTCENC_DECOMPRESS_ONLY)
19
20
/**
21
* @brief Functions for angular-sum algorithm for weight alignment.
22
*
23
* This algorithm works as follows:
24
* - we compute a complex number P as (cos s*i, sin s*i) for each weight,
25
* where i is the input value and s is a scaling factor based on the spacing between the weights.
26
* - we then add together complex numbers for all the weights.
27
* - we then compute the length and angle of the resulting sum.
28
*
29
* This should produce the following results:
30
* - perfect alignment results in a vector whose length is equal to the sum of lengths of all inputs
31
* - even distribution results in a vector of length 0.
32
* - all samples identical results in perfect alignment for every scaling.
33
*
34
* For each scaling factor within a given set, we compute an alignment factor from 0 to 1. This
35
* should then result in some scalings standing out as having particularly good alignment factors;
36
* we can use this to produce a set of candidate scale/shift values for various quantization levels;
37
* we should then actually try them and see what happens.
38
*/
39
40
#include "astcenc_internal.h"
41
#include "astcenc_vecmathlib.h"
42
43
#include <stdio.h>
44
#include <cassert>
45
#include <cstring>
46
#include <cfloat>
47
48
static constexpr unsigned int ANGULAR_STEPS { 32 };
49
50
static_assert((ANGULAR_STEPS % ASTCENC_SIMD_WIDTH) == 0,
51
"ANGULAR_STEPS must be multiple of ASTCENC_SIMD_WIDTH");
52
53
static_assert(ANGULAR_STEPS >= 32,
54
"ANGULAR_STEPS must be at least max(steps_for_quant_level)");
55
56
// Store a reduced sin/cos table for 64 possible weight values; this causes
57
// slight quality loss compared to using sin() and cos() directly. Must be 2^N.
58
static constexpr unsigned int SINCOS_STEPS { 64 };
59
60
static const uint8_t steps_for_quant_level[12] {
61
2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32
62
};
63
64
ASTCENC_ALIGNAS static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
65
ASTCENC_ALIGNAS static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
66
67
#if defined(ASTCENC_DIAGNOSTICS)
68
static bool print_once { true };
69
#endif
70
71
/* See header for documentation. */
72
void prepare_angular_tables()
73
{
74
for (unsigned int i = 0; i < ANGULAR_STEPS; i++)
75
{
76
float angle_step = static_cast<float>(i + 1);
77
78
for (unsigned int j = 0; j < SINCOS_STEPS; j++)
79
{
80
sin_table[j][i] = static_cast<float>(sinf((2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * angle_step * static_cast<float>(j)));
81
cos_table[j][i] = static_cast<float>(cosf((2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * angle_step * static_cast<float>(j)));
82
}
83
}
84
}
85
86
/**
87
* @brief Compute the angular alignment factors and offsets.
88
*
89
* @param weight_count The number of (decimated) weights.
90
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
91
* @param max_angular_steps The maximum number of steps to be tested.
92
* @param[out] offsets The output angular offsets array.
93
*/
94
static void compute_angular_offsets(
95
unsigned int weight_count,
96
const float* dec_weight_ideal_value,
97
unsigned int max_angular_steps,
98
float* offsets
99
) {
100
promise(weight_count > 0);
101
promise(max_angular_steps > 0);
102
103
ASTCENC_ALIGNAS int isamplev[BLOCK_MAX_WEIGHTS];
104
105
// Precompute isample; arrays are always allocated 64 elements long
106
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
107
{
108
// Ideal weight can be outside [0, 1] range, so clamp to fit table
109
vfloat ideal_weight = clampzo(loada(dec_weight_ideal_value + i));
110
111
// Convert a weight to a sincos table index
112
vfloat sample = ideal_weight * (SINCOS_STEPS - 1.0f);
113
vint isample = float_to_int_rtn(sample);
114
storea(isample, isamplev + i);
115
}
116
117
// Arrays are multiple of SIMD width (ANGULAR_STEPS), safe to overshoot max
118
vfloat mult(1.0f / (2.0f * astc::PI));
119
120
for (unsigned int i = 0; i < max_angular_steps; i += ASTCENC_SIMD_WIDTH)
121
{
122
vfloat anglesum_x = vfloat::zero();
123
vfloat anglesum_y = vfloat::zero();
124
125
for (unsigned int j = 0; j < weight_count; j++)
126
{
127
int isample = isamplev[j];
128
anglesum_x += loada(cos_table[isample] + i);
129
anglesum_y += loada(sin_table[isample] + i);
130
}
131
132
vfloat angle = atan2(anglesum_y, anglesum_x);
133
vfloat ofs = angle * mult;
134
storea(ofs, offsets + i);
135
}
136
}
137
138
/**
139
* @brief For a given step size compute the lowest and highest weight.
140
*
141
* Compute the lowest and highest weight that results from quantizing using the given stepsize and
142
* offset, and then compute the resulting error. The cut errors indicate the error that results from
143
* forcing samples that should have had one weight value one step up or down.
144
*
145
* @param weight_count The number of (decimated) weights.
146
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
147
* @param max_angular_steps The maximum number of steps to be tested.
148
* @param max_quant_steps The maximum quantization level to be tested.
149
* @param offsets The angular offsets array.
150
* @param[out] lowest_weight Per angular step, the lowest weight.
151
* @param[out] weight_span Per angular step, the span between lowest and highest weight.
152
* @param[out] error Per angular step, the error.
153
* @param[out] cut_low_weight_error Per angular step, the low weight cut error.
154
* @param[out] cut_high_weight_error Per angular step, the high weight cut error.
155
*/
156
static void compute_lowest_and_highest_weight(
157
unsigned int weight_count,
158
const float* dec_weight_ideal_value,
159
unsigned int max_angular_steps,
160
unsigned int max_quant_steps,
161
const float* offsets,
162
float* lowest_weight,
163
int* weight_span,
164
float* error,
165
float* cut_low_weight_error,
166
float* cut_high_weight_error
167
) {
168
promise(weight_count > 0);
169
promise(max_angular_steps > 0);
170
171
vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);
172
173
// Compute minimum/maximum weights in the weight array. Our remapping
174
// is monotonic, so the min/max rounded weights relate to the min/max
175
// unrounded weights in a straightforward way.
176
vfloat min_weight(FLT_MAX);
177
vfloat max_weight(-FLT_MAX);
178
179
vint lane_id = vint::lane_id();
180
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
181
{
182
vmask active = lane_id < vint(weight_count);
183
lane_id += vint(ASTCENC_SIMD_WIDTH);
184
185
vfloat weights = loada(dec_weight_ideal_value + i);
186
min_weight = min(min_weight, select(min_weight, weights, active));
187
max_weight = max(max_weight, select(max_weight, weights, active));
188
}
189
190
min_weight = hmin(min_weight);
191
max_weight = hmax(max_weight);
192
193
// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
194
for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
195
{
196
vfloat errval = vfloat::zero();
197
vfloat cut_low_weight_err = vfloat::zero();
198
vfloat cut_high_weight_err = vfloat::zero();
199
vfloat offset = loada(offsets + sp);
200
201
// We know the min and max weight values, so we can figure out
202
// the corresponding indices before we enter the loop.
203
vfloat minidx = round(min_weight * rcp_stepsize - offset);
204
vfloat maxidx = round(max_weight * rcp_stepsize - offset);
205
206
for (unsigned int j = 0; j < weight_count; j++)
207
{
208
vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset;
209
vfloat svalrte = round(sval);
210
vfloat diff = sval - svalrte;
211
errval += diff * diff;
212
213
// Accumulate errors for minimum index
214
vmask mask = svalrte == minidx;
215
vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff;
216
cut_low_weight_err = select(cut_low_weight_err, accum, mask);
217
218
// Accumulate errors for maximum index
219
mask = svalrte == maxidx;
220
accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff;
221
cut_high_weight_err = select(cut_high_weight_err, accum, mask);
222
}
223
224
// Write out min weight and weight span; clamp span to a usable range
225
vint span = float_to_int(maxidx - minidx + vfloat(1));
226
span = min(span, vint(max_quant_steps + 3));
227
span = max(span, vint(2));
228
storea(minidx, lowest_weight + sp);
229
storea(span, weight_span + sp);
230
231
// The cut_(lowest/highest)_weight_error indicate the error that results from forcing
232
// samples that should have had the weight value one step (up/down).
233
vfloat ssize = 1.0f / rcp_stepsize;
234
vfloat errscale = ssize * ssize;
235
storea(errval * errscale, error + sp);
236
storea(cut_low_weight_err * errscale, cut_low_weight_error + sp);
237
storea(cut_high_weight_err * errscale, cut_high_weight_error + sp);
238
239
rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH);
240
}
241
}
242
243
/**
244
* @brief The main function for the angular algorithm.
245
*
246
* @param weight_count The number of (decimated) weights.
247
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
248
* @param max_quant_level The maximum quantization level to be tested.
249
* @param[out] low_value Per angular step, the lowest weight value.
250
* @param[out] high_value Per angular step, the highest weight value.
251
*/
252
static void compute_angular_endpoints_for_quant_levels(
253
unsigned int weight_count,
254
const float* dec_weight_ideal_value,
255
unsigned int max_quant_level,
256
float low_value[TUNE_MAX_ANGULAR_QUANT + 1],
257
float high_value[TUNE_MAX_ANGULAR_QUANT + 1]
258
) {
259
unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
260
unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];
261
262
ASTCENC_ALIGNAS float angular_offsets[ANGULAR_STEPS];
263
264
compute_angular_offsets(weight_count, dec_weight_ideal_value,
265
max_angular_steps, angular_offsets);
266
267
ASTCENC_ALIGNAS float lowest_weight[ANGULAR_STEPS];
268
ASTCENC_ALIGNAS int32_t weight_span[ANGULAR_STEPS];
269
ASTCENC_ALIGNAS float error[ANGULAR_STEPS];
270
ASTCENC_ALIGNAS float cut_low_weight_error[ANGULAR_STEPS];
271
ASTCENC_ALIGNAS float cut_high_weight_error[ANGULAR_STEPS];
272
273
compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value,
274
max_angular_steps, max_quant_steps,
275
angular_offsets, lowest_weight, weight_span, error,
276
cut_low_weight_error, cut_high_weight_error);
277
278
// For each quantization level, find the best error terms. Use packed vectors so data-dependent
279
// branches can become selects. This involves some integer to float casts, but the values are
280
// small enough so they never round the wrong way.
281
vfloat4 best_results[36];
282
283
// Initialize the array to some safe defaults
284
promise(max_quant_steps > 0);
285
for (unsigned int i = 0; i < (max_quant_steps + 4); i++)
286
{
287
// Lane<0> = Best error
288
// Lane<1> = Best scale; -1 indicates no solution found
289
// Lane<2> = Cut low weight
290
best_results[i] = vfloat4(ERROR_CALC_DEFAULT, -1.0f, 0.0f, 0.0f);
291
}
292
293
promise(max_angular_steps > 0);
294
for (unsigned int i = 0; i < max_angular_steps; i++)
295
{
296
float i_flt = static_cast<float>(i);
297
298
int idx_span = weight_span[i];
299
300
float error_cut_low = error[i] + cut_low_weight_error[i];
301
float error_cut_high = error[i] + cut_high_weight_error[i];
302
float error_cut_low_high = error[i] + cut_low_weight_error[i] + cut_high_weight_error[i];
303
304
// Check best error against record N
305
vfloat4 best_result = best_results[idx_span];
306
vfloat4 new_result = vfloat4(error[i], i_flt, 0.0f, 0.0f);
307
vmask4 mask = vfloat4(best_result.lane<0>()) > vfloat4(error[i]);
308
best_results[idx_span] = select(best_result, new_result, mask);
309
310
// Check best error against record N-1 with either cut low or cut high
311
best_result = best_results[idx_span - 1];
312
313
new_result = vfloat4(error_cut_low, i_flt, 1.0f, 0.0f);
314
mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low);
315
best_result = select(best_result, new_result, mask);
316
317
new_result = vfloat4(error_cut_high, i_flt, 0.0f, 0.0f);
318
mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_high);
319
best_results[idx_span - 1] = select(best_result, new_result, mask);
320
321
// Check best error against record N-2 with both cut low and high
322
best_result = best_results[idx_span - 2];
323
new_result = vfloat4(error_cut_low_high, i_flt, 1.0f, 0.0f);
324
mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low_high);
325
best_results[idx_span - 2] = select(best_result, new_result, mask);
326
}
327
328
for (unsigned int i = 0; i <= max_quant_level; i++)
329
{
330
unsigned int q = steps_for_quant_level[i];
331
int bsi = static_cast<int>(best_results[q].lane<1>());
332
333
// Did we find anything?
334
#if defined(ASTCENC_DIAGNOSTICS)
335
if ((bsi < 0) && print_once)
336
{
337
print_once = false;
338
printf("INFO: Unable to find full encoding within search error limit.\n\n");
339
}
340
#endif
341
342
bsi = astc::max(0, bsi);
343
344
float lwi = lowest_weight[bsi] + best_results[q].lane<2>();
345
float hwi = lwi + static_cast<float>(q) - 1.0f;
346
347
float stepsize = 1.0f / (1.0f + static_cast<float>(bsi));
348
low_value[i] = (angular_offsets[bsi] + lwi) * stepsize;
349
high_value[i] = (angular_offsets[bsi] + hwi) * stepsize;
350
}
351
}
352
353
/* See header for documentation. */
354
void compute_angular_endpoints_1plane(
355
bool only_always,
356
const block_size_descriptor& bsd,
357
const float* dec_weight_ideal_value,
358
unsigned int max_weight_quant,
359
compression_working_buffers& tmpbuf
360
) {
361
float (&low_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
362
float (&high_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
363
364
float (&low_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
365
float (&high_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
366
367
unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
368
: bsd.decimation_mode_count_selected;
369
promise(max_decimation_modes > 0);
370
for (unsigned int i = 0; i < max_decimation_modes; i++)
371
{
372
const decimation_mode& dm = bsd.decimation_modes[i];
373
if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
374
{
375
continue;
376
}
377
378
unsigned int weight_count = bsd.get_decimation_info(i).weight_count;
379
380
unsigned int max_precision = dm.maxprec_1plane;
381
if (max_precision > TUNE_MAX_ANGULAR_QUANT)
382
{
383
max_precision = TUNE_MAX_ANGULAR_QUANT;
384
}
385
386
if (max_precision > max_weight_quant)
387
{
388
max_precision = max_weight_quant;
389
}
390
391
compute_angular_endpoints_for_quant_levels(
392
weight_count,
393
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
394
max_precision, low_values[i], high_values[i]);
395
}
396
397
unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
398
: bsd.block_mode_count_1plane_selected;
399
promise(max_block_modes > 0);
400
for (unsigned int i = 0; i < max_block_modes; i++)
401
{
402
const block_mode& bm = bsd.block_modes[i];
403
assert(!bm.is_dual_plane);
404
405
unsigned int quant_mode = bm.quant_mode;
406
unsigned int decim_mode = bm.decimation_mode;
407
408
if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
409
{
410
low_value[i] = low_values[decim_mode][quant_mode];
411
high_value[i] = high_values[decim_mode][quant_mode];
412
}
413
else
414
{
415
low_value[i] = 0.0f;
416
high_value[i] = 1.0f;
417
}
418
}
419
}
420
421
/* See header for documentation. */
422
void compute_angular_endpoints_2planes(
423
const block_size_descriptor& bsd,
424
const float* dec_weight_ideal_value,
425
unsigned int max_weight_quant,
426
compression_working_buffers& tmpbuf
427
) {
428
float (&low_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
429
float (&high_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
430
float (&low_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value2;
431
float (&high_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value2;
432
433
float (&low_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
434
float (&high_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
435
float (&low_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values2;
436
float (&high_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values2;
437
438
promise(bsd.decimation_mode_count_selected > 0);
439
for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
440
{
441
const decimation_mode& dm = bsd.decimation_modes[i];
442
if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
443
{
444
continue;
445
}
446
447
unsigned int weight_count = bsd.get_decimation_info(i).weight_count;
448
449
unsigned int max_precision = dm.maxprec_2planes;
450
if (max_precision > TUNE_MAX_ANGULAR_QUANT)
451
{
452
max_precision = TUNE_MAX_ANGULAR_QUANT;
453
}
454
455
if (max_precision > max_weight_quant)
456
{
457
max_precision = max_weight_quant;
458
}
459
460
compute_angular_endpoints_for_quant_levels(
461
weight_count,
462
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
463
max_precision, low_values1[i], high_values1[i]);
464
465
compute_angular_endpoints_for_quant_levels(
466
weight_count,
467
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
468
max_precision, low_values2[i], high_values2[i]);
469
}
470
471
unsigned int start = bsd.block_mode_count_1plane_selected;
472
unsigned int end = bsd.block_mode_count_1plane_2plane_selected;
473
for (unsigned int i = start; i < end; i++)
474
{
475
const block_mode& bm = bsd.block_modes[i];
476
unsigned int quant_mode = bm.quant_mode;
477
unsigned int decim_mode = bm.decimation_mode;
478
479
if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
480
{
481
low_value1[i] = low_values1[decim_mode][quant_mode];
482
high_value1[i] = high_values1[decim_mode][quant_mode];
483
low_value2[i] = low_values2[decim_mode][quant_mode];
484
high_value2[i] = high_values2[decim_mode][quant_mode];
485
}
486
else
487
{
488
low_value1[i] = 0.0f;
489
high_value1[i] = 1.0f;
490
low_value2[i] = 0.0f;
491
high_value2[i] = 1.0f;
492
}
493
}
494
}
495
496
#endif
497
498