Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/astcenc/astcenc_block_sizes.cpp
9896 views
1
// SPDX-License-Identifier: Apache-2.0
2
// ----------------------------------------------------------------------------
3
// Copyright 2011-2025 Arm Limited
4
//
5
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6
// use this file except in compliance with the License. You may obtain a copy
7
// of the License at:
8
//
9
// http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14
// License for the specific language governing permissions and limitations
15
// under the License.
16
// ----------------------------------------------------------------------------
17
18
/**
19
* @brief Functions to generate block size descriptor and decimation tables.
20
*/
21
22
#include "astcenc_internal.h"
23
24
/**
25
* @brief Decode the properties of an encoded 2D block mode.
26
*
27
* @param block_mode The encoded block mode.
28
* @param[out] x_weights The number of weights in the X dimension.
29
* @param[out] y_weights The number of weights in the Y dimension.
30
* @param[out] is_dual_plane True if this block mode has two weight planes.
31
* @param[out] quant_mode The quantization level for the weights.
32
* @param[out] weight_bits The storage bit count for the weights.
33
*
34
* @return Returns true if a valid mode, false otherwise.
35
*/
36
static bool decode_block_mode_2d(
37
unsigned int block_mode,
38
unsigned int& x_weights,
39
unsigned int& y_weights,
40
bool& is_dual_plane,
41
unsigned int& quant_mode,
42
unsigned int& weight_bits
43
) {
44
unsigned int base_quant_mode = (block_mode >> 4) & 1;
45
unsigned int H = (block_mode >> 9) & 1;
46
unsigned int D = (block_mode >> 10) & 1;
47
unsigned int A = (block_mode >> 5) & 0x3;
48
49
x_weights = 0;
50
y_weights = 0;
51
52
if ((block_mode & 3) != 0)
53
{
54
base_quant_mode |= (block_mode & 3) << 1;
55
unsigned int B = (block_mode >> 7) & 3;
56
switch ((block_mode >> 2) & 3)
57
{
58
case 0:
59
x_weights = B + 4;
60
y_weights = A + 2;
61
break;
62
case 1:
63
x_weights = B + 8;
64
y_weights = A + 2;
65
break;
66
case 2:
67
x_weights = A + 2;
68
y_weights = B + 8;
69
break;
70
case 3:
71
B &= 1;
72
if (block_mode & 0x100)
73
{
74
x_weights = B + 2;
75
y_weights = A + 2;
76
}
77
else
78
{
79
x_weights = A + 2;
80
y_weights = B + 6;
81
}
82
break;
83
}
84
}
85
else
86
{
87
base_quant_mode |= ((block_mode >> 2) & 3) << 1;
88
if (((block_mode >> 2) & 3) == 0)
89
{
90
return false;
91
}
92
93
unsigned int B = (block_mode >> 9) & 3;
94
switch ((block_mode >> 7) & 3)
95
{
96
case 0:
97
x_weights = 12;
98
y_weights = A + 2;
99
break;
100
case 1:
101
x_weights = A + 2;
102
y_weights = 12;
103
break;
104
case 2:
105
x_weights = A + 6;
106
y_weights = B + 6;
107
D = 0;
108
H = 0;
109
break;
110
case 3:
111
switch ((block_mode >> 5) & 3)
112
{
113
case 0:
114
x_weights = 6;
115
y_weights = 10;
116
break;
117
case 1:
118
x_weights = 10;
119
y_weights = 6;
120
break;
121
case 2:
122
case 3:
123
return false;
124
}
125
break;
126
}
127
}
128
129
unsigned int weight_count = x_weights * y_weights * (D + 1);
130
quant_mode = (base_quant_mode - 2) + 6 * H;
131
is_dual_plane = D != 0;
132
133
weight_bits = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(quant_mode));
134
return (weight_count <= BLOCK_MAX_WEIGHTS &&
135
weight_bits >= BLOCK_MIN_WEIGHT_BITS &&
136
weight_bits <= BLOCK_MAX_WEIGHT_BITS);
137
}
138
139
/**
140
* @brief Decode the properties of an encoded 3D block mode.
141
*
142
* @param block_mode The encoded block mode.
143
* @param[out] x_weights The number of weights in the X dimension.
144
* @param[out] y_weights The number of weights in the Y dimension.
145
* @param[out] z_weights The number of weights in the Z dimension.
146
* @param[out] is_dual_plane True if this block mode has two weight planes.
147
* @param[out] quant_mode The quantization level for the weights.
148
* @param[out] weight_bits The storage bit count for the weights.
149
*
150
* @return Returns true if a valid mode, false otherwise.
151
*/
152
static bool decode_block_mode_3d(
153
unsigned int block_mode,
154
unsigned int& x_weights,
155
unsigned int& y_weights,
156
unsigned int& z_weights,
157
bool& is_dual_plane,
158
unsigned int& quant_mode,
159
unsigned int& weight_bits
160
) {
161
unsigned int base_quant_mode = (block_mode >> 4) & 1;
162
unsigned int H = (block_mode >> 9) & 1;
163
unsigned int D = (block_mode >> 10) & 1;
164
unsigned int A = (block_mode >> 5) & 0x3;
165
166
x_weights = 0;
167
y_weights = 0;
168
z_weights = 0;
169
170
if ((block_mode & 3) != 0)
171
{
172
base_quant_mode |= (block_mode & 3) << 1;
173
unsigned int B = (block_mode >> 7) & 3;
174
unsigned int C = (block_mode >> 2) & 0x3;
175
x_weights = A + 2;
176
y_weights = B + 2;
177
z_weights = C + 2;
178
}
179
else
180
{
181
base_quant_mode |= ((block_mode >> 2) & 3) << 1;
182
if (((block_mode >> 2) & 3) == 0)
183
{
184
return false;
185
}
186
187
int B = (block_mode >> 9) & 3;
188
if (((block_mode >> 7) & 3) != 3)
189
{
190
D = 0;
191
H = 0;
192
}
193
switch ((block_mode >> 7) & 3)
194
{
195
case 0:
196
x_weights = 6;
197
y_weights = B + 2;
198
z_weights = A + 2;
199
break;
200
case 1:
201
x_weights = A + 2;
202
y_weights = 6;
203
z_weights = B + 2;
204
break;
205
case 2:
206
x_weights = A + 2;
207
y_weights = B + 2;
208
z_weights = 6;
209
break;
210
case 3:
211
x_weights = 2;
212
y_weights = 2;
213
z_weights = 2;
214
switch ((block_mode >> 5) & 3)
215
{
216
case 0:
217
x_weights = 6;
218
break;
219
case 1:
220
y_weights = 6;
221
break;
222
case 2:
223
z_weights = 6;
224
break;
225
case 3:
226
return false;
227
}
228
break;
229
}
230
}
231
232
unsigned int weight_count = x_weights * y_weights * z_weights * (D + 1);
233
quant_mode = (base_quant_mode - 2) + 6 * H;
234
is_dual_plane = D != 0;
235
236
weight_bits = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(quant_mode));
237
return (weight_count <= BLOCK_MAX_WEIGHTS &&
238
weight_bits >= BLOCK_MIN_WEIGHT_BITS &&
239
weight_bits <= BLOCK_MAX_WEIGHT_BITS);
240
}
241
242
/**
243
* @brief Create a 2D decimation entry for a block-size and weight-decimation pair.
244
*
245
* @param x_texels The number of texels in the X dimension.
246
* @param y_texels The number of texels in the Y dimension.
247
* @param x_weights The number of weights in the X dimension.
248
* @param y_weights The number of weights in the Y dimension.
249
* @param[out] di The decimation info structure to populate.
250
* @param[out] wb The decimation table init scratch working buffers.
251
*/
252
static void init_decimation_info_2d(
253
unsigned int x_texels,
254
unsigned int y_texels,
255
unsigned int x_weights,
256
unsigned int y_weights,
257
decimation_info& di,
258
dt_init_working_buffers& wb
259
) {
260
unsigned int texels_per_block = x_texels * y_texels;
261
unsigned int weights_per_block = x_weights * y_weights;
262
263
uint8_t max_texel_count_of_weight = 0;
264
265
promise(weights_per_block > 0);
266
promise(texels_per_block > 0);
267
promise(x_texels > 0);
268
promise(y_texels > 0);
269
270
for (unsigned int i = 0; i < weights_per_block; i++)
271
{
272
wb.texel_count_of_weight[i] = 0;
273
}
274
275
for (unsigned int i = 0; i < texels_per_block; i++)
276
{
277
wb.weight_count_of_texel[i] = 0;
278
}
279
280
for (unsigned int y = 0; y < y_texels; y++)
281
{
282
for (unsigned int x = 0; x < x_texels; x++)
283
{
284
unsigned int texel = y * x_texels + x;
285
286
unsigned int x_weight = (((1024 + x_texels / 2) / (x_texels - 1)) * x * (x_weights - 1) + 32) >> 6;
287
unsigned int y_weight = (((1024 + y_texels / 2) / (y_texels - 1)) * y * (y_weights - 1) + 32) >> 6;
288
289
unsigned int x_weight_frac = x_weight & 0xF;
290
unsigned int y_weight_frac = y_weight & 0xF;
291
unsigned int x_weight_int = x_weight >> 4;
292
unsigned int y_weight_int = y_weight >> 4;
293
294
unsigned int qweight[4];
295
qweight[0] = x_weight_int + y_weight_int * x_weights;
296
qweight[1] = qweight[0] + 1;
297
qweight[2] = qweight[0] + x_weights;
298
qweight[3] = qweight[2] + 1;
299
300
// Truncated-precision bilinear interpolation
301
unsigned int prod = x_weight_frac * y_weight_frac;
302
303
unsigned int weight[4];
304
weight[3] = (prod + 8) >> 4;
305
weight[1] = x_weight_frac - weight[3];
306
weight[2] = y_weight_frac - weight[3];
307
weight[0] = 16 - x_weight_frac - y_weight_frac + weight[3];
308
309
for (unsigned int i = 0; i < 4; i++)
310
{
311
if (weight[i] != 0)
312
{
313
wb.grid_weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(qweight[i]);
314
wb.weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(weight[i]);
315
wb.weight_count_of_texel[texel]++;
316
wb.texels_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(texel);
317
wb.texel_weights_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(weight[i]);
318
wb.texel_count_of_weight[qweight[i]]++;
319
max_texel_count_of_weight = astc::max(max_texel_count_of_weight, wb.texel_count_of_weight[qweight[i]]);
320
}
321
}
322
}
323
}
324
325
uint8_t max_texel_weight_count = 0;
326
for (unsigned int i = 0; i < texels_per_block; i++)
327
{
328
di.texel_weight_count[i] = wb.weight_count_of_texel[i];
329
max_texel_weight_count = astc::max(max_texel_weight_count, di.texel_weight_count[i]);
330
331
for (unsigned int j = 0; j < wb.weight_count_of_texel[i]; j++)
332
{
333
di.texel_weight_contribs_int_tr[j][i] = wb.weights_of_texel[i][j];
334
di.texel_weight_contribs_float_tr[j][i] = static_cast<float>(wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM);
335
di.texel_weights_tr[j][i] = wb.grid_weights_of_texel[i][j];
336
}
337
338
// Init all 4 entries so we can rely on zeros for vectorization
339
for (unsigned int j = wb.weight_count_of_texel[i]; j < 4; j++)
340
{
341
di.texel_weight_contribs_int_tr[j][i] = 0;
342
di.texel_weight_contribs_float_tr[j][i] = 0.0f;
343
di.texel_weights_tr[j][i] = 0;
344
}
345
}
346
347
di.max_texel_weight_count = max_texel_weight_count;
348
349
for (unsigned int i = 0; i < weights_per_block; i++)
350
{
351
unsigned int texel_count_wt = wb.texel_count_of_weight[i];
352
di.weight_texel_count[i] = static_cast<uint8_t>(texel_count_wt);
353
354
for (unsigned int j = 0; j < texel_count_wt; j++)
355
{
356
uint8_t texel = wb.texels_of_weight[i][j];
357
358
// Create transposed versions of these for better vectorization
359
di.weight_texels_tr[j][i] = texel;
360
di.weights_texel_contribs_tr[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]);
361
362
// Store the per-texel contribution of this weight for each texel it contributes to
363
di.texel_contrib_for_weight[j][i] = 0.0f;
364
for (unsigned int k = 0; k < 4; k++)
365
{
366
uint8_t dttw = di.texel_weights_tr[k][texel];
367
float dttwf = di.texel_weight_contribs_float_tr[k][texel];
368
if (dttw == i && dttwf != 0.0f)
369
{
370
di.texel_contrib_for_weight[j][i] = di.texel_weight_contribs_float_tr[k][texel];
371
break;
372
}
373
}
374
}
375
376
// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
377
// Match last texel in active lane in SIMD group, for better gathers
378
uint8_t last_texel = di.weight_texels_tr[texel_count_wt - 1][i];
379
for (unsigned int j = texel_count_wt; j < max_texel_count_of_weight; j++)
380
{
381
di.weight_texels_tr[j][i] = last_texel;
382
di.weights_texel_contribs_tr[j][i] = 0.0f;
383
}
384
}
385
386
// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
387
size_t texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block);
388
for (size_t i = texels_per_block; i < texels_per_block_simd; i++)
389
{
390
di.texel_weight_count[i] = 0;
391
392
for (size_t j = 0; j < 4; j++)
393
{
394
di.texel_weight_contribs_float_tr[j][i] = 0;
395
di.texel_weights_tr[j][i] = 0;
396
di.texel_weight_contribs_int_tr[j][i] = 0;
397
}
398
}
399
400
// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
401
// Match last texel in active lane in SIMD group, for better gathers
402
unsigned int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1];
403
uint8_t last_texel = di.weight_texels_tr[last_texel_count_wt - 1][weights_per_block - 1];
404
405
size_t weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
406
for (size_t i = weights_per_block; i < weights_per_block_simd; i++)
407
{
408
di.weight_texel_count[i] = 0;
409
410
for (size_t j = 0; j < max_texel_count_of_weight; j++)
411
{
412
di.weight_texels_tr[j][i] = last_texel;
413
di.weights_texel_contribs_tr[j][i] = 0.0f;
414
}
415
}
416
417
di.texel_count = static_cast<uint8_t>(texels_per_block);
418
di.weight_count = static_cast<uint8_t>(weights_per_block);
419
di.weight_x = static_cast<uint8_t>(x_weights);
420
di.weight_y = static_cast<uint8_t>(y_weights);
421
di.weight_z = 1;
422
}
423
424
/**
425
* @brief Create a 3D decimation entry for a block-size and weight-decimation pair.
426
*
427
* @param x_texels The number of texels in the X dimension.
428
* @param y_texels The number of texels in the Y dimension.
429
* @param z_texels The number of texels in the Z dimension.
430
* @param x_weights The number of weights in the X dimension.
431
* @param y_weights The number of weights in the Y dimension.
432
* @param z_weights The number of weights in the Z dimension.
433
* @param[out] di The decimation info structure to populate.
434
@param[out] wb The decimation table init scratch working buffers.
435
*/
436
static void init_decimation_info_3d(
437
unsigned int x_texels,
438
unsigned int y_texels,
439
unsigned int z_texels,
440
unsigned int x_weights,
441
unsigned int y_weights,
442
unsigned int z_weights,
443
decimation_info& di,
444
dt_init_working_buffers& wb
445
) {
446
unsigned int texels_per_block = x_texels * y_texels * z_texels;
447
unsigned int weights_per_block = x_weights * y_weights * z_weights;
448
449
uint8_t max_texel_count_of_weight = 0;
450
451
promise(weights_per_block > 0);
452
promise(texels_per_block > 0);
453
454
for (unsigned int i = 0; i < weights_per_block; i++)
455
{
456
wb.texel_count_of_weight[i] = 0;
457
}
458
459
for (unsigned int i = 0; i < texels_per_block; i++)
460
{
461
wb.weight_count_of_texel[i] = 0;
462
}
463
464
for (unsigned int z = 0; z < z_texels; z++)
465
{
466
for (unsigned int y = 0; y < y_texels; y++)
467
{
468
for (unsigned int x = 0; x < x_texels; x++)
469
{
470
int texel = (z * y_texels + y) * x_texels + x;
471
472
int x_weight = (((1024 + x_texels / 2) / (x_texels - 1)) * x * (x_weights - 1) + 32) >> 6;
473
int y_weight = (((1024 + y_texels / 2) / (y_texels - 1)) * y * (y_weights - 1) + 32) >> 6;
474
int z_weight = (((1024 + z_texels / 2) / (z_texels - 1)) * z * (z_weights - 1) + 32) >> 6;
475
476
int x_weight_frac = x_weight & 0xF;
477
int y_weight_frac = y_weight & 0xF;
478
int z_weight_frac = z_weight & 0xF;
479
int x_weight_int = x_weight >> 4;
480
int y_weight_int = y_weight >> 4;
481
int z_weight_int = z_weight >> 4;
482
int qweight[4];
483
int weight[4];
484
qweight[0] = (z_weight_int * y_weights + y_weight_int) * x_weights + x_weight_int;
485
qweight[3] = ((z_weight_int + 1) * y_weights + (y_weight_int + 1)) * x_weights + (x_weight_int + 1);
486
487
// simplex interpolation
488
int fs = x_weight_frac;
489
int ft = y_weight_frac;
490
int fp = z_weight_frac;
491
492
int cas = ((fs > ft) << 2) + ((ft > fp) << 1) + ((fs > fp));
493
int N = x_weights;
494
int NM = x_weights * y_weights;
495
496
int s1, s2, w0, w1, w2, w3;
497
switch (cas)
498
{
499
case 7:
500
s1 = 1;
501
s2 = N;
502
w0 = 16 - fs;
503
w1 = fs - ft;
504
w2 = ft - fp;
505
w3 = fp;
506
break;
507
case 3:
508
s1 = N;
509
s2 = 1;
510
w0 = 16 - ft;
511
w1 = ft - fs;
512
w2 = fs - fp;
513
w3 = fp;
514
break;
515
case 5:
516
s1 = 1;
517
s2 = NM;
518
w0 = 16 - fs;
519
w1 = fs - fp;
520
w2 = fp - ft;
521
w3 = ft;
522
break;
523
case 4:
524
s1 = NM;
525
s2 = 1;
526
w0 = 16 - fp;
527
w1 = fp - fs;
528
w2 = fs - ft;
529
w3 = ft;
530
break;
531
case 2:
532
s1 = N;
533
s2 = NM;
534
w0 = 16 - ft;
535
w1 = ft - fp;
536
w2 = fp - fs;
537
w3 = fs;
538
break;
539
case 0:
540
s1 = NM;
541
s2 = N;
542
w0 = 16 - fp;
543
w1 = fp - ft;
544
w2 = ft - fs;
545
w3 = fs;
546
break;
547
default:
548
s1 = NM;
549
s2 = N;
550
w0 = 16 - fp;
551
w1 = fp - ft;
552
w2 = ft - fs;
553
w3 = fs;
554
break;
555
}
556
557
qweight[1] = qweight[0] + s1;
558
qweight[2] = qweight[1] + s2;
559
weight[0] = w0;
560
weight[1] = w1;
561
weight[2] = w2;
562
weight[3] = w3;
563
564
for (unsigned int i = 0; i < 4; i++)
565
{
566
if (weight[i] != 0)
567
{
568
wb.grid_weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(qweight[i]);
569
wb.weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(weight[i]);
570
wb.weight_count_of_texel[texel]++;
571
wb.texels_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(texel);
572
wb.texel_weights_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(weight[i]);
573
wb.texel_count_of_weight[qweight[i]]++;
574
max_texel_count_of_weight = astc::max(max_texel_count_of_weight, wb.texel_count_of_weight[qweight[i]]);
575
}
576
}
577
}
578
}
579
}
580
581
uint8_t max_texel_weight_count = 0;
582
for (unsigned int i = 0; i < texels_per_block; i++)
583
{
584
di.texel_weight_count[i] = wb.weight_count_of_texel[i];
585
max_texel_weight_count = astc::max(max_texel_weight_count, di.texel_weight_count[i]);
586
587
// Init all 4 entries so we can rely on zeros for vectorization
588
for (unsigned int j = 0; j < 4; j++)
589
{
590
di.texel_weight_contribs_int_tr[j][i] = 0;
591
di.texel_weight_contribs_float_tr[j][i] = 0.0f;
592
di.texel_weights_tr[j][i] = 0;
593
}
594
595
for (unsigned int j = 0; j < wb.weight_count_of_texel[i]; j++)
596
{
597
di.texel_weight_contribs_int_tr[j][i] = wb.weights_of_texel[i][j];
598
di.texel_weight_contribs_float_tr[j][i] = static_cast<float>(wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM);
599
di.texel_weights_tr[j][i] = wb.grid_weights_of_texel[i][j];
600
}
601
}
602
603
di.max_texel_weight_count = max_texel_weight_count;
604
605
for (unsigned int i = 0; i < weights_per_block; i++)
606
{
607
unsigned int texel_count_wt = wb.texel_count_of_weight[i];
608
di.weight_texel_count[i] = static_cast<uint8_t>(texel_count_wt);
609
610
for (unsigned int j = 0; j < texel_count_wt; j++)
611
{
612
unsigned int texel = wb.texels_of_weight[i][j];
613
614
// Create transposed versions of these for better vectorization
615
di.weight_texels_tr[j][i] = static_cast<uint8_t>(texel);
616
di.weights_texel_contribs_tr[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]);
617
618
// Store the per-texel contribution of this weight for each texel it contributes to
619
di.texel_contrib_for_weight[j][i] = 0.0f;
620
for (unsigned int k = 0; k < 4; k++)
621
{
622
uint8_t dttw = di.texel_weights_tr[k][texel];
623
float dttwf = di.texel_weight_contribs_float_tr[k][texel];
624
if (dttw == i && dttwf != 0.0f)
625
{
626
di.texel_contrib_for_weight[j][i] = di.texel_weight_contribs_float_tr[k][texel];
627
break;
628
}
629
}
630
}
631
632
// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
633
// Match last texel in active lane in SIMD group, for better gathers
634
uint8_t last_texel = di.weight_texels_tr[texel_count_wt - 1][i];
635
for (unsigned int j = texel_count_wt; j < max_texel_count_of_weight; j++)
636
{
637
di.weight_texels_tr[j][i] = last_texel;
638
di.weights_texel_contribs_tr[j][i] = 0.0f;
639
}
640
}
641
642
// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
643
size_t texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block);
644
for (size_t i = texels_per_block; i < texels_per_block_simd; i++)
645
{
646
di.texel_weight_count[i] = 0;
647
648
for (size_t j = 0; j < 4; j++)
649
{
650
di.texel_weight_contribs_float_tr[j][i] = 0;
651
di.texel_weights_tr[j][i] = 0;
652
di.texel_weight_contribs_int_tr[j][i] = 0;
653
}
654
}
655
656
// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
657
// Match last texel in active lane in SIMD group, for better gathers
658
int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1];
659
uint8_t last_texel = di.weight_texels_tr[last_texel_count_wt - 1][weights_per_block - 1];
660
661
size_t weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
662
for (size_t i = weights_per_block; i < weights_per_block_simd; i++)
663
{
664
di.weight_texel_count[i] = 0;
665
666
for (size_t j = 0; j < max_texel_count_of_weight; j++)
667
{
668
di.weight_texels_tr[j][i] = last_texel;
669
di.weights_texel_contribs_tr[j][i] = 0.0f;
670
}
671
}
672
673
di.texel_count = static_cast<uint8_t>(texels_per_block);
674
di.weight_count = static_cast<uint8_t>(weights_per_block);
675
di.weight_x = static_cast<uint8_t>(x_weights);
676
di.weight_y = static_cast<uint8_t>(y_weights);
677
di.weight_z = static_cast<uint8_t>(z_weights);
678
}
679
680
/**
681
* @brief Assign the texels to use for kmeans clustering.
682
*
683
* The max limit is @c BLOCK_MAX_KMEANS_TEXELS; above this a random selection is used.
684
* The @c bsd.texel_count is an input and must be populated beforehand.
685
*
686
* @param[in,out] bsd The block size descriptor to populate.
687
*/
688
static void assign_kmeans_texels(
689
block_size_descriptor& bsd
690
) {
691
// Use all texels for kmeans on a small block
692
if (bsd.texel_count <= BLOCK_MAX_KMEANS_TEXELS)
693
{
694
for (uint8_t i = 0; i < bsd.texel_count; i++)
695
{
696
bsd.kmeans_texels[i] = i;
697
}
698
699
return;
700
}
701
702
// Select a random subset of BLOCK_MAX_KMEANS_TEXELS for kmeans on a large block
703
uint64_t rng_state[2];
704
astc::rand_init(rng_state);
705
706
// Initialize array used for tracking used indices
707
bool seen[BLOCK_MAX_TEXELS];
708
for (uint8_t i = 0; i < bsd.texel_count; i++)
709
{
710
seen[i] = false;
711
}
712
713
// Assign 64 random indices, retrying if we see repeats
714
unsigned int arr_elements_set = 0;
715
while (arr_elements_set < BLOCK_MAX_KMEANS_TEXELS)
716
{
717
uint8_t texel = static_cast<uint8_t>(astc::rand(rng_state));
718
texel = texel % bsd.texel_count;
719
if (!seen[texel])
720
{
721
bsd.kmeans_texels[arr_elements_set++] = texel;
722
seen[texel] = true;
723
}
724
}
725
}
726
727
/**
728
* @brief Allocate a single 2D decimation table entry.
729
*
730
* @param x_texels The number of texels in the X dimension.
731
* @param y_texels The number of texels in the Y dimension.
732
* @param x_weights The number of weights in the X dimension.
733
* @param y_weights The number of weights in the Y dimension.
734
* @param bsd The block size descriptor we are populating.
735
* @param wb The decimation table init scratch working buffers.
736
* @param index The packed array index to populate.
737
*/
738
static void construct_dt_entry_2d(
739
unsigned int x_texels,
740
unsigned int y_texels,
741
unsigned int x_weights,
742
unsigned int y_weights,
743
block_size_descriptor& bsd,
744
dt_init_working_buffers& wb,
745
unsigned int index
746
) {
747
unsigned int weight_count = x_weights * y_weights;
748
assert(weight_count <= BLOCK_MAX_WEIGHTS);
749
750
bool try_2planes = (2 * weight_count) <= BLOCK_MAX_WEIGHTS;
751
752
decimation_info& di = bsd.decimation_tables[index];
753
init_decimation_info_2d(x_texels, y_texels, x_weights, y_weights, di, wb);
754
755
int maxprec_1plane = -1;
756
int maxprec_2planes = -1;
757
for (int i = 0; i < 12; i++)
758
{
759
unsigned int bits_1plane = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(i));
760
if (bits_1plane >= BLOCK_MIN_WEIGHT_BITS && bits_1plane <= BLOCK_MAX_WEIGHT_BITS)
761
{
762
maxprec_1plane = i;
763
}
764
765
if (try_2planes)
766
{
767
unsigned int bits_2planes = get_ise_sequence_bitcount(2 * weight_count, static_cast<quant_method>(i));
768
if (bits_2planes >= BLOCK_MIN_WEIGHT_BITS && bits_2planes <= BLOCK_MAX_WEIGHT_BITS)
769
{
770
maxprec_2planes = i;
771
}
772
}
773
}
774
775
// At least one of the two should be valid ...
776
assert(maxprec_1plane >= 0 || maxprec_2planes >= 0);
777
bsd.decimation_modes[index].maxprec_1plane = static_cast<int8_t>(maxprec_1plane);
778
bsd.decimation_modes[index].maxprec_2planes = static_cast<int8_t>(maxprec_2planes);
779
bsd.decimation_modes[index].refprec_1plane = 0;
780
bsd.decimation_modes[index].refprec_2planes = 0;
781
}
782
783
/**
784
* @brief Allocate block modes and decimation tables for a single 2D block size.
785
*
786
* @param x_texels The number of texels in the X dimension.
787
* @param y_texels The number of texels in the Y dimension.
788
* @param can_omit_modes Can we discard modes that astcenc won't use, even if legal?
789
* @param mode_cutoff Percentile cutoff in range [0,1]. Low values more likely to be used.
790
* @param[out] bsd The block size descriptor to populate.
791
*/
792
static void construct_block_size_descriptor_2d(
793
unsigned int x_texels,
794
unsigned int y_texels,
795
bool can_omit_modes,
796
float mode_cutoff,
797
block_size_descriptor& bsd
798
) {
799
// Store a remap table for storing packed decimation modes.
800
// Indexing uses [Y * 16 + X] and max size for each axis is 12.
801
static const unsigned int MAX_DMI = 12 * 16 + 12;
802
int decimation_mode_index[MAX_DMI];
803
804
dt_init_working_buffers* wb = new dt_init_working_buffers;
805
806
bsd.xdim = static_cast<uint8_t>(x_texels);
807
bsd.ydim = static_cast<uint8_t>(y_texels);
808
bsd.zdim = 1;
809
bsd.texel_count = static_cast<uint8_t>(x_texels * y_texels);
810
811
for (unsigned int i = 0; i < MAX_DMI; i++)
812
{
813
decimation_mode_index[i] = -1;
814
}
815
816
// Gather all the decimation grids that can be used with the current block
817
#if !defined(ASTCENC_DECOMPRESS_ONLY)
818
const float *percentiles = get_2d_percentile_table(x_texels, y_texels);
819
float always_cutoff = 0.0f;
820
#else
821
// Unused in decompress-only builds
822
(void)can_omit_modes;
823
(void)mode_cutoff;
824
#endif
825
826
// Construct the list of block formats referencing the decimation tables
827
unsigned int packed_bm_idx = 0;
828
unsigned int packed_dm_idx = 0;
829
830
// Trackers
831
unsigned int bm_counts[4] { 0 };
832
unsigned int dm_counts[4] { 0 };
833
834
// Clear the list to a known-bad value
835
for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++)
836
{
837
bsd.block_mode_packed_index[i] = BLOCK_BAD_BLOCK_MODE;
838
}
839
840
// Iterate four times to build a usefully ordered list:
841
// - Pass 0 - keep selected single plane "always" block modes
842
// - Pass 1 - keep selected single plane "non-always" block modes
843
// - Pass 2 - keep select dual plane block modes
844
// - Pass 3 - keep everything else that's legal
845
unsigned int limit = can_omit_modes ? 3 : 4;
846
for (unsigned int j = 0; j < limit; j ++)
847
{
848
for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++)
849
{
850
// Skip modes we've already included in a previous pass
851
if (bsd.block_mode_packed_index[i] != BLOCK_BAD_BLOCK_MODE)
852
{
853
continue;
854
}
855
856
// Decode parameters
857
unsigned int x_weights;
858
unsigned int y_weights;
859
bool is_dual_plane;
860
unsigned int quant_mode;
861
unsigned int weight_bits;
862
bool valid = decode_block_mode_2d(i, x_weights, y_weights, is_dual_plane, quant_mode, weight_bits);
863
864
// Always skip invalid encodings for the current block size
865
if (!valid || (x_weights > x_texels) || (y_weights > y_texels))
866
{
867
continue;
868
}
869
870
// Selectively skip dual plane encodings
871
if (((j <= 1) && is_dual_plane) || (j == 2 && !is_dual_plane))
872
{
873
continue;
874
}
875
876
// Always skip encodings we can't physically encode based on
877
// generic encoding bit availability
878
if (is_dual_plane)
879
{
880
// This is the only check we need as only support 1 partition
881
if ((109 - weight_bits) <= 0)
882
{
883
continue;
884
}
885
}
886
else
887
{
888
// This is conservative - fewer bits may be available for > 1 partition
889
if ((111 - weight_bits) <= 0)
890
{
891
continue;
892
}
893
}
894
895
// Selectively skip encodings based on percentile
896
bool percentile_hit = false;
897
#if !defined(ASTCENC_DECOMPRESS_ONLY)
898
if (j == 0)
899
{
900
percentile_hit = percentiles[i] <= always_cutoff;
901
}
902
else
903
{
904
percentile_hit = percentiles[i] <= mode_cutoff;
905
}
906
#endif
907
908
if (j != 3 && !percentile_hit)
909
{
910
continue;
911
}
912
913
// Allocate and initialize the decimation table entry if we've not used it yet
914
int decimation_mode = decimation_mode_index[y_weights * 16 + x_weights];
915
if (decimation_mode < 0)
916
{
917
construct_dt_entry_2d(x_texels, y_texels, x_weights, y_weights, bsd, *wb, packed_dm_idx);
918
decimation_mode_index[y_weights * 16 + x_weights] = packed_dm_idx;
919
decimation_mode = packed_dm_idx;
920
921
dm_counts[j]++;
922
packed_dm_idx++;
923
}
924
925
auto& bm = bsd.block_modes[packed_bm_idx];
926
927
bm.decimation_mode = static_cast<uint8_t>(decimation_mode);
928
bm.quant_mode = static_cast<uint8_t>(quant_mode);
929
bm.is_dual_plane = static_cast<uint8_t>(is_dual_plane);
930
bm.weight_bits = static_cast<uint8_t>(weight_bits);
931
bm.mode_index = static_cast<uint16_t>(i);
932
933
auto& dm = bsd.decimation_modes[decimation_mode];
934
935
if (is_dual_plane)
936
{
937
dm.set_ref_2plane(bm.get_weight_quant_mode());
938
}
939
else
940
{
941
dm.set_ref_1plane(bm.get_weight_quant_mode());
942
}
943
944
bsd.block_mode_packed_index[i] = static_cast<uint16_t>(packed_bm_idx);
945
946
packed_bm_idx++;
947
bm_counts[j]++;
948
}
949
}
950
951
bsd.block_mode_count_1plane_always = bm_counts[0];
952
bsd.block_mode_count_1plane_selected = bm_counts[0] + bm_counts[1];
953
bsd.block_mode_count_1plane_2plane_selected = bm_counts[0] + bm_counts[1] + bm_counts[2];
954
bsd.block_mode_count_all = bm_counts[0] + bm_counts[1] + bm_counts[2] + bm_counts[3];
955
956
bsd.decimation_mode_count_always = dm_counts[0];
957
bsd.decimation_mode_count_selected = dm_counts[0] + dm_counts[1] + dm_counts[2];
958
bsd.decimation_mode_count_all = dm_counts[0] + dm_counts[1] + dm_counts[2] + dm_counts[3];
959
960
#if !defined(ASTCENC_DECOMPRESS_ONLY)
961
assert(bsd.block_mode_count_1plane_always > 0);
962
assert(bsd.decimation_mode_count_always > 0);
963
964
delete[] percentiles;
965
#endif
966
967
// Ensure the end of the array contains valid data (should never get read)
968
for (unsigned int i = bsd.decimation_mode_count_all; i < WEIGHTS_MAX_DECIMATION_MODES; i++)
969
{
970
bsd.decimation_modes[i].maxprec_1plane = -1;
971
bsd.decimation_modes[i].maxprec_2planes = -1;
972
bsd.decimation_modes[i].refprec_1plane = 0;
973
bsd.decimation_modes[i].refprec_2planes = 0;
974
}
975
976
// Determine the texels to use for kmeans clustering.
977
assign_kmeans_texels(bsd);
978
979
delete wb;
980
}
981
982
/**
983
* @brief Allocate block modes and decimation tables for a single 3D block size.
984
*
985
* TODO: This function doesn't include all of the heuristics that we use for 2D block sizes such as
986
* the percentile mode cutoffs. If 3D becomes more widely used we should look at this.
987
*
988
* @param x_texels The number of texels in the X dimension.
989
* @param y_texels The number of texels in the Y dimension.
990
* @param z_texels The number of texels in the Z dimension.
991
* @param[out] bsd The block size descriptor to populate.
992
*/
993
static void construct_block_size_descriptor_3d(
994
unsigned int x_texels,
995
unsigned int y_texels,
996
unsigned int z_texels,
997
block_size_descriptor& bsd
998
) {
999
// Store a remap table for storing packed decimation modes.
1000
// Indexing uses [Z * 64 + Y * 8 + X] and max size for each axis is 6.
1001
static constexpr unsigned int MAX_DMI = 6 * 64 + 6 * 8 + 6;
1002
int decimation_mode_index[MAX_DMI];
1003
unsigned int decimation_mode_count = 0;
1004
1005
dt_init_working_buffers* wb = new dt_init_working_buffers;
1006
1007
bsd.xdim = static_cast<uint8_t>(x_texels);
1008
bsd.ydim = static_cast<uint8_t>(y_texels);
1009
bsd.zdim = static_cast<uint8_t>(z_texels);
1010
bsd.texel_count = static_cast<uint8_t>(x_texels * y_texels * z_texels);
1011
1012
for (unsigned int i = 0; i < MAX_DMI; i++)
1013
{
1014
decimation_mode_index[i] = -1;
1015
}
1016
1017
// gather all the infill-modes that can be used with the current block size
1018
for (unsigned int x_weights = 2; x_weights <= x_texels; x_weights++)
1019
{
1020
for (unsigned int y_weights = 2; y_weights <= y_texels; y_weights++)
1021
{
1022
for (unsigned int z_weights = 2; z_weights <= z_texels; z_weights++)
1023
{
1024
unsigned int weight_count = x_weights * y_weights * z_weights;
1025
if (weight_count > BLOCK_MAX_WEIGHTS)
1026
{
1027
continue;
1028
}
1029
1030
decimation_info& di = bsd.decimation_tables[decimation_mode_count];
1031
decimation_mode_index[z_weights * 64 + y_weights * 8 + x_weights] = decimation_mode_count;
1032
init_decimation_info_3d(x_texels, y_texels, z_texels, x_weights, y_weights, z_weights, di, *wb);
1033
1034
int maxprec_1plane = -1;
1035
int maxprec_2planes = -1;
1036
for (unsigned int i = 0; i < 12; i++)
1037
{
1038
unsigned int bits_1plane = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(i));
1039
if (bits_1plane >= BLOCK_MIN_WEIGHT_BITS && bits_1plane <= BLOCK_MAX_WEIGHT_BITS)
1040
{
1041
maxprec_1plane = i;
1042
}
1043
1044
unsigned int bits_2planes = get_ise_sequence_bitcount(2 * weight_count, static_cast<quant_method>(i));
1045
if (bits_2planes >= BLOCK_MIN_WEIGHT_BITS && bits_2planes <= BLOCK_MAX_WEIGHT_BITS)
1046
{
1047
maxprec_2planes = i;
1048
}
1049
}
1050
1051
if ((2 * weight_count) > BLOCK_MAX_WEIGHTS)
1052
{
1053
maxprec_2planes = -1;
1054
}
1055
1056
bsd.decimation_modes[decimation_mode_count].maxprec_1plane = static_cast<int8_t>(maxprec_1plane);
1057
bsd.decimation_modes[decimation_mode_count].maxprec_2planes = static_cast<int8_t>(maxprec_2planes);
1058
bsd.decimation_modes[decimation_mode_count].refprec_1plane = maxprec_1plane == -1 ? 0 : 0xFFFF;
1059
bsd.decimation_modes[decimation_mode_count].refprec_2planes = maxprec_2planes == -1 ? 0 : 0xFFFF;
1060
decimation_mode_count++;
1061
}
1062
}
1063
}
1064
1065
// Ensure the end of the array contains valid data (should never get read)
1066
for (unsigned int i = decimation_mode_count; i < WEIGHTS_MAX_DECIMATION_MODES; i++)
1067
{
1068
bsd.decimation_modes[i].maxprec_1plane = -1;
1069
bsd.decimation_modes[i].maxprec_2planes = -1;
1070
bsd.decimation_modes[i].refprec_1plane = 0;
1071
bsd.decimation_modes[i].refprec_2planes = 0;
1072
}
1073
1074
bsd.decimation_mode_count_always = 0; // Skipped for 3D modes
1075
bsd.decimation_mode_count_selected = decimation_mode_count;
1076
bsd.decimation_mode_count_all = decimation_mode_count;
1077
1078
// Construct the list of block formats referencing the decimation tables
1079
1080
// Clear the list to a known-bad value
1081
for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++)
1082
{
1083
bsd.block_mode_packed_index[i] = BLOCK_BAD_BLOCK_MODE;
1084
}
1085
1086
unsigned int packed_idx = 0;
1087
unsigned int bm_counts[2] { 0 };
1088
1089
// Iterate two times to build a usefully ordered list:
1090
// - Pass 0 - keep valid single plane block modes
1091
// - Pass 1 - keep valid dual plane block modes
1092
for (unsigned int j = 0; j < 2; j++)
1093
{
1094
for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++)
1095
{
1096
// Skip modes we've already included in a previous pass
1097
if (bsd.block_mode_packed_index[i] != BLOCK_BAD_BLOCK_MODE)
1098
{
1099
continue;
1100
}
1101
1102
unsigned int x_weights;
1103
unsigned int y_weights;
1104
unsigned int z_weights;
1105
bool is_dual_plane;
1106
unsigned int quant_mode;
1107
unsigned int weight_bits;
1108
1109
bool valid = decode_block_mode_3d(i, x_weights, y_weights, z_weights, is_dual_plane, quant_mode, weight_bits);
1110
// Skip invalid encodings
1111
if (!valid || x_weights > x_texels || y_weights > y_texels || z_weights > z_texels)
1112
{
1113
continue;
1114
}
1115
1116
// Skip encodings in the wrong iteration
1117
if ((j == 0 && is_dual_plane) || (j == 1 && !is_dual_plane))
1118
{
1119
continue;
1120
}
1121
1122
// Always skip encodings we can't physically encode based on bit availability
1123
if (is_dual_plane)
1124
{
1125
// This is the only check we need as only support 1 partition
1126
if ((109 - weight_bits) <= 0)
1127
{
1128
continue;
1129
}
1130
}
1131
else
1132
{
1133
// This is conservative - fewer bits may be available for > 1 partition
1134
if ((111 - weight_bits) <= 0)
1135
{
1136
continue;
1137
}
1138
}
1139
1140
int decimation_mode = decimation_mode_index[z_weights * 64 + y_weights * 8 + x_weights];
1141
bsd.block_modes[packed_idx].decimation_mode = static_cast<uint8_t>(decimation_mode);
1142
bsd.block_modes[packed_idx].quant_mode = static_cast<uint8_t>(quant_mode);
1143
bsd.block_modes[packed_idx].weight_bits = static_cast<uint8_t>(weight_bits);
1144
bsd.block_modes[packed_idx].is_dual_plane = static_cast<uint8_t>(is_dual_plane);
1145
bsd.block_modes[packed_idx].mode_index = static_cast<uint16_t>(i);
1146
1147
bsd.block_mode_packed_index[i] = static_cast<uint16_t>(packed_idx);
1148
bm_counts[j]++;
1149
packed_idx++;
1150
}
1151
}
1152
1153
bsd.block_mode_count_1plane_always = 0; // Skipped for 3D modes
1154
bsd.block_mode_count_1plane_selected = bm_counts[0];
1155
bsd.block_mode_count_1plane_2plane_selected = bm_counts[0] + bm_counts[1];
1156
bsd.block_mode_count_all = bm_counts[0] + bm_counts[1];
1157
1158
// Determine the texels to use for kmeans clustering.
1159
assign_kmeans_texels(bsd);
1160
1161
delete wb;
1162
}
1163
1164
/* See header for documentation. */
1165
void init_block_size_descriptor(
1166
unsigned int x_texels,
1167
unsigned int y_texels,
1168
unsigned int z_texels,
1169
bool can_omit_modes,
1170
unsigned int partition_count_cutoff,
1171
float mode_cutoff,
1172
block_size_descriptor& bsd
1173
) {
1174
if (z_texels > 1)
1175
{
1176
construct_block_size_descriptor_3d(x_texels, y_texels, z_texels, bsd);
1177
}
1178
else
1179
{
1180
construct_block_size_descriptor_2d(x_texels, y_texels, can_omit_modes, mode_cutoff, bsd);
1181
}
1182
1183
init_partition_tables(bsd, can_omit_modes, partition_count_cutoff);
1184
}
1185
1186