// Source: GitHub repository godotengine/godot
// Path: blob/master/thirdparty/astcenc/astcenc_averages_and_directions.cpp
1
// SPDX-License-Identifier: Apache-2.0
2
// ----------------------------------------------------------------------------
3
// Copyright 2011-2025 Arm Limited
4
//
5
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6
// use this file except in compliance with the License. You may obtain a copy
7
// of the License at:
8
//
9
// http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14
// License for the specific language governing permissions and limitations
15
// under the License.
16
// ----------------------------------------------------------------------------
17
18
/**
19
* @brief Functions for finding dominant direction of a set of colors.
20
*/
21
#if !defined(ASTCENC_DECOMPRESS_ONLY)
22
23
#include "astcenc_internal.h"
24
25
#include <cassert>
26
27
/**
28
* @brief Compute the average RGB color of each partition.
29
*
30
* The algorithm here uses a vectorized sequential scan and per-partition
31
* color accumulators, using select() to mask texel lanes in other partitions.
32
*
33
* We only accumulate sums for N-1 partitions during the scan; the value for
34
* the last partition can be computed given that we know the block-wide average
35
* already.
36
*
37
* Because of this we could reduce the loop iteration count so it "just" spans
38
* the max texel index needed for the N-1 partitions, which could need fewer
39
* iterations than the full block texel count. However, this makes the loop
40
* count erratic and causes more branch mispredictions so is a net loss.
41
*
42
* @param pi The partitioning to use.
43
* @param blk The block data to process.
44
* @param[out] averages The output averages. Unused partition indices will
45
* not be initialized, and lane<3> will be zero.
46
*/
47
static void compute_partition_averages_rgb(
48
const partition_info& pi,
49
const image_block& blk,
50
vfloat4 averages[BLOCK_MAX_PARTITIONS]
51
) {
52
unsigned int partition_count = pi.partition_count;
53
size_t texel_count = blk.texel_count;
54
promise(texel_count > 0);
55
56
// For 1 partition just use the precomputed mean
57
if (partition_count == 1)
58
{
59
averages[0] = blk.data_mean.swz<0, 1, 2>();
60
}
61
// For 2 partitions scan results for partition 0, compute partition 1
62
else if (partition_count == 2)
63
{
64
vfloatacc pp_avg_rgb[3] {};
65
66
vint lane_id = vint::lane_id();
67
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
68
{
69
vint texel_partition(pi.partition_of_texel + i);
70
71
vmask lane_mask = lane_id < vint_from_size(texel_count);
72
lane_id += vint(ASTCENC_SIMD_WIDTH);
73
74
vmask p0_mask = lane_mask & (texel_partition == vint(0));
75
76
vfloat data_r = loada(blk.data_r + i);
77
haccumulate(pp_avg_rgb[0], data_r, p0_mask);
78
79
vfloat data_g = loada(blk.data_g + i);
80
haccumulate(pp_avg_rgb[1], data_g, p0_mask);
81
82
vfloat data_b = loada(blk.data_b + i);
83
haccumulate(pp_avg_rgb[2], data_b, p0_mask);
84
}
85
86
vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
87
88
vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0]),
89
hadd_s(pp_avg_rgb[1]),
90
hadd_s(pp_avg_rgb[2]));
91
92
vfloat4 p1_total = block_total - p0_total;
93
94
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
95
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
96
}
97
// For 3 partitions scan results for partition 0/1, compute partition 2
98
else if (partition_count == 3)
99
{
100
vfloatacc pp_avg_rgb[2][3] {};
101
102
vint lane_id = vint::lane_id();
103
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
104
{
105
vint texel_partition(pi.partition_of_texel + i);
106
107
vmask lane_mask = lane_id < vint_from_size(texel_count);
108
lane_id += vint(ASTCENC_SIMD_WIDTH);
109
110
vmask p0_mask = lane_mask & (texel_partition == vint(0));
111
vmask p1_mask = lane_mask & (texel_partition == vint(1));
112
113
vfloat data_r = loada(blk.data_r + i);
114
haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
115
haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
116
117
vfloat data_g = loada(blk.data_g + i);
118
haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
119
haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
120
121
vfloat data_b = loada(blk.data_b + i);
122
haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
123
haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
124
}
125
126
vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
127
128
vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
129
hadd_s(pp_avg_rgb[0][1]),
130
hadd_s(pp_avg_rgb[0][2]));
131
132
vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
133
hadd_s(pp_avg_rgb[1][1]),
134
hadd_s(pp_avg_rgb[1][2]));
135
136
vfloat4 p2_total = block_total - p0_total - p1_total;
137
138
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
139
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
140
averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
141
}
142
else
143
{
144
// For 4 partitions scan results for partition 0/1/2, compute partition 3
145
vfloatacc pp_avg_rgb[3][3] {};
146
147
vint lane_id = vint::lane_id();
148
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
149
{
150
vint texel_partition(pi.partition_of_texel + i);
151
152
vmask lane_mask = lane_id < vint_from_size(texel_count);
153
lane_id += vint(ASTCENC_SIMD_WIDTH);
154
155
vmask p0_mask = lane_mask & (texel_partition == vint(0));
156
vmask p1_mask = lane_mask & (texel_partition == vint(1));
157
vmask p2_mask = lane_mask & (texel_partition == vint(2));
158
159
vfloat data_r = loada(blk.data_r + i);
160
haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
161
haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
162
haccumulate(pp_avg_rgb[2][0], data_r, p2_mask);
163
164
vfloat data_g = loada(blk.data_g + i);
165
haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
166
haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
167
haccumulate(pp_avg_rgb[2][1], data_g, p2_mask);
168
169
vfloat data_b = loada(blk.data_b + i);
170
haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
171
haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
172
haccumulate(pp_avg_rgb[2][2], data_b, p2_mask);
173
}
174
175
vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
176
177
vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
178
hadd_s(pp_avg_rgb[0][1]),
179
hadd_s(pp_avg_rgb[0][2]));
180
181
vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
182
hadd_s(pp_avg_rgb[1][1]),
183
hadd_s(pp_avg_rgb[1][2]));
184
185
vfloat4 p2_total = vfloat3(hadd_s(pp_avg_rgb[2][0]),
186
hadd_s(pp_avg_rgb[2][1]),
187
hadd_s(pp_avg_rgb[2][2]));
188
189
vfloat4 p3_total = block_total - p0_total - p1_total- p2_total;
190
191
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
192
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
193
averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
194
averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
195
}
196
}
197
198
/**
199
* @brief Compute the average RGBA color of each partition.
200
*
201
* The algorithm here uses a vectorized sequential scan and per-partition
202
* color accumulators, using select() to mask texel lanes in other partitions.
203
*
204
* We only accumulate sums for N-1 partitions during the scan; the value for
205
* the last partition can be computed given that we know the block-wide average
206
* already.
207
*
208
* Because of this we could reduce the loop iteration count so it "just" spans
209
* the max texel index needed for the N-1 partitions, which could need fewer
210
* iterations than the full block texel count. However, this makes the loop
211
* count erratic and causes more branch mispredictions so is a net loss.
212
*
213
* @param pi The partitioning to use.
214
* @param blk The block data to process.
215
* @param[out] averages The output averages. Unused partition indices will
216
* not be initialized.
217
*/
218
static void compute_partition_averages_rgba(
219
const partition_info& pi,
220
const image_block& blk,
221
vfloat4 averages[BLOCK_MAX_PARTITIONS]
222
) {
223
unsigned int partition_count = pi.partition_count;
224
size_t texel_count = blk.texel_count;
225
promise(texel_count > 0);
226
227
// For 1 partition just use the precomputed mean
228
if (partition_count == 1)
229
{
230
averages[0] = blk.data_mean;
231
}
232
// For 2 partitions scan results for partition 0, compute partition 1
233
else if (partition_count == 2)
234
{
235
vfloat4 pp_avg_rgba[4] {};
236
237
vint lane_id = vint::lane_id();
238
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
239
{
240
vint texel_partition(pi.partition_of_texel + i);
241
242
vmask lane_mask = lane_id < vint_from_size(texel_count);
243
lane_id += vint(ASTCENC_SIMD_WIDTH);
244
245
vmask p0_mask = lane_mask & (texel_partition == vint(0));
246
247
vfloat data_r = loada(blk.data_r + i);
248
haccumulate(pp_avg_rgba[0], data_r, p0_mask);
249
250
vfloat data_g = loada(blk.data_g + i);
251
haccumulate(pp_avg_rgba[1], data_g, p0_mask);
252
253
vfloat data_b = loada(blk.data_b + i);
254
haccumulate(pp_avg_rgba[2], data_b, p0_mask);
255
256
vfloat data_a = loada(blk.data_a + i);
257
haccumulate(pp_avg_rgba[3], data_a, p0_mask);
258
}
259
260
vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
261
262
vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0]),
263
hadd_s(pp_avg_rgba[1]),
264
hadd_s(pp_avg_rgba[2]),
265
hadd_s(pp_avg_rgba[3]));
266
267
vfloat4 p1_total = block_total - p0_total;
268
269
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
270
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
271
}
272
// For 3 partitions scan results for partition 0/1, compute partition 2
273
else if (partition_count == 3)
274
{
275
vfloat4 pp_avg_rgba[2][4] {};
276
277
vint lane_id = vint::lane_id();
278
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
279
{
280
vint texel_partition(pi.partition_of_texel + i);
281
282
vmask lane_mask = lane_id < vint_from_size(texel_count);
283
lane_id += vint(ASTCENC_SIMD_WIDTH);
284
285
vmask p0_mask = lane_mask & (texel_partition == vint(0));
286
vmask p1_mask = lane_mask & (texel_partition == vint(1));
287
288
vfloat data_r = loada(blk.data_r + i);
289
haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
290
haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
291
292
vfloat data_g = loada(blk.data_g + i);
293
haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
294
haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
295
296
vfloat data_b = loada(blk.data_b + i);
297
haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
298
haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
299
300
vfloat data_a = loada(blk.data_a + i);
301
haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
302
haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
303
}
304
305
vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
306
307
vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
308
hadd_s(pp_avg_rgba[0][1]),
309
hadd_s(pp_avg_rgba[0][2]),
310
hadd_s(pp_avg_rgba[0][3]));
311
312
vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
313
hadd_s(pp_avg_rgba[1][1]),
314
hadd_s(pp_avg_rgba[1][2]),
315
hadd_s(pp_avg_rgba[1][3]));
316
317
vfloat4 p2_total = block_total - p0_total - p1_total;
318
319
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
320
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
321
averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
322
}
323
else
324
{
325
// For 4 partitions scan results for partition 0/1/2, compute partition 3
326
vfloat4 pp_avg_rgba[3][4] {};
327
328
vint lane_id = vint::lane_id();
329
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
330
{
331
vint texel_partition(pi.partition_of_texel + i);
332
333
vmask lane_mask = lane_id < vint_from_size(texel_count);
334
lane_id += vint(ASTCENC_SIMD_WIDTH);
335
336
vmask p0_mask = lane_mask & (texel_partition == vint(0));
337
vmask p1_mask = lane_mask & (texel_partition == vint(1));
338
vmask p2_mask = lane_mask & (texel_partition == vint(2));
339
340
vfloat data_r = loada(blk.data_r + i);
341
haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
342
haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
343
haccumulate(pp_avg_rgba[2][0], data_r, p2_mask);
344
345
vfloat data_g = loada(blk.data_g + i);
346
haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
347
haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
348
haccumulate(pp_avg_rgba[2][1], data_g, p2_mask);
349
350
vfloat data_b = loada(blk.data_b + i);
351
haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
352
haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
353
haccumulate(pp_avg_rgba[2][2], data_b, p2_mask);
354
355
vfloat data_a = loada(blk.data_a + i);
356
haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
357
haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
358
haccumulate(pp_avg_rgba[2][3], data_a, p2_mask);
359
}
360
361
vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
362
363
vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
364
hadd_s(pp_avg_rgba[0][1]),
365
hadd_s(pp_avg_rgba[0][2]),
366
hadd_s(pp_avg_rgba[0][3]));
367
368
vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
369
hadd_s(pp_avg_rgba[1][1]),
370
hadd_s(pp_avg_rgba[1][2]),
371
hadd_s(pp_avg_rgba[1][3]));
372
373
vfloat4 p2_total = vfloat4(hadd_s(pp_avg_rgba[2][0]),
374
hadd_s(pp_avg_rgba[2][1]),
375
hadd_s(pp_avg_rgba[2][2]),
376
hadd_s(pp_avg_rgba[2][3]));
377
378
vfloat4 p3_total = block_total - p0_total - p1_total- p2_total;
379
380
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
381
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
382
averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
383
averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
384
}
385
}
386
387
/* See header for documentation. */
388
void compute_avgs_and_dirs_4_comp(
389
const partition_info& pi,
390
const image_block& blk,
391
partition_metrics pm[BLOCK_MAX_PARTITIONS]
392
) {
393
size_t partition_count = pi.partition_count;
394
promise(partition_count > 0);
395
396
// Pre-compute partition_averages
397
vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
398
compute_partition_averages_rgba(pi, blk, partition_averages);
399
400
for (size_t partition = 0; partition < partition_count; partition++)
401
{
402
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
403
size_t texel_count = pi.partition_texel_count[partition];
404
promise(texel_count > 0);
405
406
vfloat4 average = partition_averages[partition];
407
pm[partition].avg = average;
408
409
vfloat4 sum_xp = vfloat4::zero();
410
vfloat4 sum_yp = vfloat4::zero();
411
vfloat4 sum_zp = vfloat4::zero();
412
vfloat4 sum_wp = vfloat4::zero();
413
414
for (size_t i = 0; i < texel_count; i++)
415
{
416
unsigned int iwt = texel_indexes[i];
417
vfloat4 texel_datum = blk.texel(iwt);
418
texel_datum = texel_datum - average;
419
420
vfloat4 zero = vfloat4::zero();
421
422
vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
423
sum_xp += select(zero, texel_datum, tdm0);
424
425
vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
426
sum_yp += select(zero, texel_datum, tdm1);
427
428
vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
429
sum_zp += select(zero, texel_datum, tdm2);
430
431
vmask4 tdm3 = texel_datum.swz<3,3,3,3>() > zero;
432
sum_wp += select(zero, texel_datum, tdm3);
433
}
434
435
vfloat4 prod_xp = dot(sum_xp, sum_xp);
436
vfloat4 prod_yp = dot(sum_yp, sum_yp);
437
vfloat4 prod_zp = dot(sum_zp, sum_zp);
438
vfloat4 prod_wp = dot(sum_wp, sum_wp);
439
440
vfloat4 best_vector = sum_xp;
441
vfloat4 best_sum = prod_xp;
442
443
vmask4 mask = prod_yp > best_sum;
444
best_vector = select(best_vector, sum_yp, mask);
445
best_sum = select(best_sum, prod_yp, mask);
446
447
mask = prod_zp > best_sum;
448
best_vector = select(best_vector, sum_zp, mask);
449
best_sum = select(best_sum, prod_zp, mask);
450
451
mask = prod_wp > best_sum;
452
best_vector = select(best_vector, sum_wp, mask);
453
454
pm[partition].dir = best_vector;
455
}
456
}
457
458
/* See header for documentation. */
459
void compute_avgs_and_dirs_3_comp(
460
const partition_info& pi,
461
const image_block& blk,
462
unsigned int omitted_component,
463
partition_metrics pm[BLOCK_MAX_PARTITIONS]
464
) {
465
// Pre-compute partition_averages
466
vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
467
compute_partition_averages_rgba(pi, blk, partition_averages);
468
469
const float* data_vr = blk.data_r;
470
const float* data_vg = blk.data_g;
471
const float* data_vb = blk.data_b;
472
473
// TODO: Data-driven permute would be useful to avoid this ...
474
if (omitted_component == 0)
475
{
476
partition_averages[0] = partition_averages[0].swz<1, 2, 3>();
477
partition_averages[1] = partition_averages[1].swz<1, 2, 3>();
478
partition_averages[2] = partition_averages[2].swz<1, 2, 3>();
479
partition_averages[3] = partition_averages[3].swz<1, 2, 3>();
480
481
data_vr = blk.data_g;
482
data_vg = blk.data_b;
483
data_vb = blk.data_a;
484
}
485
else if (omitted_component == 1)
486
{
487
partition_averages[0] = partition_averages[0].swz<0, 2, 3>();
488
partition_averages[1] = partition_averages[1].swz<0, 2, 3>();
489
partition_averages[2] = partition_averages[2].swz<0, 2, 3>();
490
partition_averages[3] = partition_averages[3].swz<0, 2, 3>();
491
492
data_vg = blk.data_b;
493
data_vb = blk.data_a;
494
}
495
else if (omitted_component == 2)
496
{
497
partition_averages[0] = partition_averages[0].swz<0, 1, 3>();
498
partition_averages[1] = partition_averages[1].swz<0, 1, 3>();
499
partition_averages[2] = partition_averages[2].swz<0, 1, 3>();
500
partition_averages[3] = partition_averages[3].swz<0, 1, 3>();
501
502
data_vb = blk.data_a;
503
}
504
else
505
{
506
partition_averages[0] = partition_averages[0].swz<0, 1, 2>();
507
partition_averages[1] = partition_averages[1].swz<0, 1, 2>();
508
partition_averages[2] = partition_averages[2].swz<0, 1, 2>();
509
partition_averages[3] = partition_averages[3].swz<0, 1, 2>();
510
}
511
512
size_t partition_count = pi.partition_count;
513
promise(partition_count > 0);
514
515
for (size_t partition = 0; partition < partition_count; partition++)
516
{
517
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
518
size_t texel_count = pi.partition_texel_count[partition];
519
promise(texel_count > 0);
520
521
vfloat4 average = partition_averages[partition];
522
pm[partition].avg = average;
523
524
vfloat4 sum_xp = vfloat4::zero();
525
vfloat4 sum_yp = vfloat4::zero();
526
vfloat4 sum_zp = vfloat4::zero();
527
528
for (size_t i = 0; i < texel_count; i++)
529
{
530
unsigned int iwt = texel_indexes[i];
531
532
vfloat4 texel_datum = vfloat3(data_vr[iwt],
533
data_vg[iwt],
534
data_vb[iwt]);
535
texel_datum = texel_datum - average;
536
537
vfloat4 zero = vfloat4::zero();
538
539
vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
540
sum_xp += select(zero, texel_datum, tdm0);
541
542
vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
543
sum_yp += select(zero, texel_datum, tdm1);
544
545
vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
546
sum_zp += select(zero, texel_datum, tdm2);
547
}
548
549
vfloat4 prod_xp = dot(sum_xp, sum_xp);
550
vfloat4 prod_yp = dot(sum_yp, sum_yp);
551
vfloat4 prod_zp = dot(sum_zp, sum_zp);
552
553
vfloat4 best_vector = sum_xp;
554
vfloat4 best_sum = prod_xp;
555
556
vmask4 mask = prod_yp > best_sum;
557
best_vector = select(best_vector, sum_yp, mask);
558
best_sum = select(best_sum, prod_yp, mask);
559
560
mask = prod_zp > best_sum;
561
best_vector = select(best_vector, sum_zp, mask);
562
563
pm[partition].dir = best_vector;
564
}
565
}
566
567
/* See header for documentation. */
568
void compute_avgs_and_dirs_3_comp_rgb(
569
const partition_info& pi,
570
const image_block& blk,
571
partition_metrics pm[BLOCK_MAX_PARTITIONS]
572
) {
573
size_t partition_count = pi.partition_count;
574
promise(partition_count > 0);
575
576
// Pre-compute partition_averages
577
vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
578
compute_partition_averages_rgb(pi, blk, partition_averages);
579
580
for (size_t partition = 0; partition < partition_count; partition++)
581
{
582
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
583
size_t texel_count = pi.partition_texel_count[partition];
584
promise(texel_count > 0);
585
586
vfloat4 average = partition_averages[partition];
587
pm[partition].avg = average;
588
589
vfloat4 sum_xp = vfloat4::zero();
590
vfloat4 sum_yp = vfloat4::zero();
591
vfloat4 sum_zp = vfloat4::zero();
592
593
for (size_t i = 0; i < texel_count; i++)
594
{
595
unsigned int iwt = texel_indexes[i];
596
597
vfloat4 texel_datum = blk.texel3(iwt);
598
texel_datum = texel_datum - average;
599
600
vfloat4 zero = vfloat4::zero();
601
602
vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
603
sum_xp += select(zero, texel_datum, tdm0);
604
605
vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
606
sum_yp += select(zero, texel_datum, tdm1);
607
608
vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
609
sum_zp += select(zero, texel_datum, tdm2);
610
}
611
612
vfloat4 prod_xp = dot(sum_xp, sum_xp);
613
vfloat4 prod_yp = dot(sum_yp, sum_yp);
614
vfloat4 prod_zp = dot(sum_zp, sum_zp);
615
616
vfloat4 best_vector = sum_xp;
617
vfloat4 best_sum = prod_xp;
618
619
vmask4 mask = prod_yp > best_sum;
620
best_vector = select(best_vector, sum_yp, mask);
621
best_sum = select(best_sum, prod_yp, mask);
622
623
mask = prod_zp > best_sum;
624
best_vector = select(best_vector, sum_zp, mask);
625
626
pm[partition].dir = best_vector;
627
}
628
}
629
630
/* See header for documentation. */
631
void compute_avgs_and_dirs_2_comp(
632
const partition_info& pt,
633
const image_block& blk,
634
unsigned int component1,
635
unsigned int component2,
636
partition_metrics pm[BLOCK_MAX_PARTITIONS]
637
) {
638
vfloat4 average;
639
640
const float* data_vr = nullptr;
641
const float* data_vg = nullptr;
642
643
if (component1 == 0 && component2 == 1)
644
{
645
average = blk.data_mean.swz<0, 1>();
646
647
data_vr = blk.data_r;
648
data_vg = blk.data_g;
649
}
650
else if (component1 == 0 && component2 == 2)
651
{
652
average = blk.data_mean.swz<0, 2>();
653
654
data_vr = blk.data_r;
655
data_vg = blk.data_b;
656
}
657
else // (component1 == 1 && component2 == 2)
658
{
659
assert(component1 == 1 && component2 == 2);
660
661
average = blk.data_mean.swz<1, 2>();
662
663
data_vr = blk.data_g;
664
data_vg = blk.data_b;
665
}
666
667
size_t partition_count = pt.partition_count;
668
promise(partition_count > 0);
669
670
for (size_t partition = 0; partition < partition_count; partition++)
671
{
672
const uint8_t *texel_indexes = pt.texels_of_partition[partition];
673
size_t texel_count = pt.partition_texel_count[partition];
674
promise(texel_count > 0);
675
676
// Only compute a partition mean if more than one partition
677
if (partition_count > 1)
678
{
679
average = vfloat4::zero();
680
for (size_t i = 0; i < texel_count; i++)
681
{
682
unsigned int iwt = texel_indexes[i];
683
average += vfloat2(data_vr[iwt], data_vg[iwt]);
684
}
685
686
average = average / static_cast<float>(texel_count);
687
}
688
689
pm[partition].avg = average;
690
691
vfloat4 sum_xp = vfloat4::zero();
692
vfloat4 sum_yp = vfloat4::zero();
693
694
for (size_t i = 0; i < texel_count; i++)
695
{
696
unsigned int iwt = texel_indexes[i];
697
vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]);
698
texel_datum = texel_datum - average;
699
700
vfloat4 zero = vfloat4::zero();
701
702
vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
703
sum_xp += select(zero, texel_datum, tdm0);
704
705
vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
706
sum_yp += select(zero, texel_datum, tdm1);
707
}
708
709
vfloat4 prod_xp = dot(sum_xp, sum_xp);
710
vfloat4 prod_yp = dot(sum_yp, sum_yp);
711
712
vfloat4 best_vector = sum_xp;
713
vfloat4 best_sum = prod_xp;
714
715
vmask4 mask = prod_yp > best_sum;
716
best_vector = select(best_vector, sum_yp, mask);
717
718
pm[partition].dir = best_vector;
719
}
720
}
721
722
/* See header for documentation. */
723
void compute_error_squared_rgba(
724
const partition_info& pi,
725
const image_block& blk,
726
const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
727
const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
728
float line_lengths[BLOCK_MAX_PARTITIONS],
729
float& uncor_error,
730
float& samec_error
731
) {
732
size_t partition_count = pi.partition_count;
733
promise(partition_count > 0);
734
735
vfloatacc uncor_errorsumv = vfloatacc::zero();
736
vfloatacc samec_errorsumv = vfloatacc::zero();
737
738
for (size_t partition = 0; partition < partition_count; partition++)
739
{
740
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
741
742
processed_line4 l_uncor = uncor_plines[partition];
743
processed_line4 l_samec = samec_plines[partition];
744
745
size_t texel_count = pi.partition_texel_count[partition];
746
promise(texel_count > 0);
747
748
// Vectorize some useful scalar inputs
749
vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
750
vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
751
vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
752
vfloat l_uncor_bs3(l_uncor.bs.lane<3>());
753
754
vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
755
vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
756
vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
757
vfloat l_uncor_amod3(l_uncor.amod.lane<3>());
758
759
vfloat l_samec_bs0(l_samec.bs.lane<0>());
760
vfloat l_samec_bs1(l_samec.bs.lane<1>());
761
vfloat l_samec_bs2(l_samec.bs.lane<2>());
762
vfloat l_samec_bs3(l_samec.bs.lane<3>());
763
764
assert(all(l_samec.amod == vfloat4(0.0f)));
765
766
vfloat uncor_loparamv(1e10f);
767
vfloat uncor_hiparamv(-1e10f);
768
769
vfloat ew_r(blk.channel_weight.lane<0>());
770
vfloat ew_g(blk.channel_weight.lane<1>());
771
vfloat ew_b(blk.channel_weight.lane<2>());
772
vfloat ew_a(blk.channel_weight.lane<3>());
773
774
// This implementation over-shoots, but this is safe as we initialize the texel_indexes
775
// array to extend the last value. This means min/max are not impacted, but we need to mask
776
// out the dummy values when we compute the line weighting.
777
vint lane_ids = vint::lane_id();
778
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
779
{
780
vmask mask = lane_ids < vint_from_size(texel_count);
781
const uint8_t* texel_idxs = texel_indexes + i;
782
783
vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
784
vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
785
vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);
786
vfloat data_a = gatherf_byte_inds<vfloat>(blk.data_a, texel_idxs);
787
788
vfloat uncor_param = (data_r * l_uncor_bs0)
789
+ (data_g * l_uncor_bs1)
790
+ (data_b * l_uncor_bs2)
791
+ (data_a * l_uncor_bs3);
792
793
uncor_loparamv = min(uncor_param, uncor_loparamv);
794
uncor_hiparamv = max(uncor_param, uncor_hiparamv);
795
796
vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
797
+ (uncor_param * l_uncor_bs0);
798
vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
799
+ (uncor_param * l_uncor_bs1);
800
vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
801
+ (uncor_param * l_uncor_bs2);
802
vfloat uncor_dist3 = (l_uncor_amod3 - data_a)
803
+ (uncor_param * l_uncor_bs3);
804
805
vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
806
+ (ew_g * uncor_dist1 * uncor_dist1)
807
+ (ew_b * uncor_dist2 * uncor_dist2)
808
+ (ew_a * uncor_dist3 * uncor_dist3);
809
810
haccumulate(uncor_errorsumv, uncor_err, mask);
811
812
// Process samechroma data
813
vfloat samec_param = (data_r * l_samec_bs0)
814
+ (data_g * l_samec_bs1)
815
+ (data_b * l_samec_bs2)
816
+ (data_a * l_samec_bs3);
817
818
vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
819
vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
820
vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
821
vfloat samec_dist3 = samec_param * l_samec_bs3 - data_a;
822
823
vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
824
+ (ew_g * samec_dist1 * samec_dist1)
825
+ (ew_b * samec_dist2 * samec_dist2)
826
+ (ew_a * samec_dist3 * samec_dist3);
827
828
haccumulate(samec_errorsumv, samec_err, mask);
829
830
lane_ids += vint(ASTCENC_SIMD_WIDTH);
831
}
832
833
// Turn very small numbers and NaNs into a small number
834
float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
835
line_lengths[partition] = astc::max(uncor_linelen, 1e-7f);
836
}
837
838
uncor_error = hadd_s(uncor_errorsumv);
839
samec_error = hadd_s(samec_errorsumv);
840
}
841
842
/* See header for documentation. */
843
void compute_error_squared_rgb(
844
const partition_info& pi,
845
const image_block& blk,
846
partition_lines3 plines[BLOCK_MAX_PARTITIONS],
847
float& uncor_error,
848
float& samec_error
849
) {
850
size_t partition_count = pi.partition_count;
851
promise(partition_count > 0);
852
853
vfloatacc uncor_errorsumv = vfloatacc::zero();
854
vfloatacc samec_errorsumv = vfloatacc::zero();
855
856
for (size_t partition = 0; partition < partition_count; partition++)
857
{
858
partition_lines3& pl = plines[partition];
859
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
860
size_t texel_count = pi.partition_texel_count[partition];
861
promise(texel_count > 0);
862
863
processed_line3 l_uncor = pl.uncor_pline;
864
processed_line3 l_samec = pl.samec_pline;
865
866
// Vectorize some useful scalar inputs
867
vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
868
vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
869
vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
870
871
vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
872
vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
873
vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
874
875
vfloat l_samec_bs0(l_samec.bs.lane<0>());
876
vfloat l_samec_bs1(l_samec.bs.lane<1>());
877
vfloat l_samec_bs2(l_samec.bs.lane<2>());
878
879
assert(all(l_samec.amod == vfloat4(0.0f)));
880
881
vfloat uncor_loparamv(1e10f);
882
vfloat uncor_hiparamv(-1e10f);
883
884
vfloat ew_r(blk.channel_weight.lane<0>());
885
vfloat ew_g(blk.channel_weight.lane<1>());
886
vfloat ew_b(blk.channel_weight.lane<2>());
887
888
// This implementation over-shoots, but this is safe as we initialize the weights array
889
// to extend the last value. This means min/max are not impacted, but we need to mask
890
// out the dummy values when we compute the line weighting.
891
vint lane_ids = vint::lane_id();
892
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
893
{
894
vmask mask = lane_ids < vint_from_size(texel_count);
895
const uint8_t* texel_idxs = texel_indexes + i;
896
897
vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
898
vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
899
vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);
900
901
vfloat uncor_param = (data_r * l_uncor_bs0)
902
+ (data_g * l_uncor_bs1)
903
+ (data_b * l_uncor_bs2);
904
905
uncor_loparamv = min(uncor_param, uncor_loparamv);
906
uncor_hiparamv = max(uncor_param, uncor_hiparamv);
907
908
vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
909
+ (uncor_param * l_uncor_bs0);
910
vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
911
+ (uncor_param * l_uncor_bs1);
912
vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
913
+ (uncor_param * l_uncor_bs2);
914
915
vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
916
+ (ew_g * uncor_dist1 * uncor_dist1)
917
+ (ew_b * uncor_dist2 * uncor_dist2);
918
919
haccumulate(uncor_errorsumv, uncor_err, mask);
920
921
// Process samechroma data
922
vfloat samec_param = (data_r * l_samec_bs0)
923
+ (data_g * l_samec_bs1)
924
+ (data_b * l_samec_bs2);
925
926
vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
927
vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
928
vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
929
930
vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
931
+ (ew_g * samec_dist1 * samec_dist1)
932
+ (ew_b * samec_dist2 * samec_dist2);
933
934
haccumulate(samec_errorsumv, samec_err, mask);
935
936
lane_ids += vint(ASTCENC_SIMD_WIDTH);
937
}
938
939
// Turn very small numbers and NaNs into a small number
940
float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
941
pl.line_length = astc::max(uncor_linelen, 1e-7f);
942
}
943
944
uncor_error = hadd_s(uncor_errorsumv);
945
samec_error = hadd_s(samec_errorsumv);
946
}
947
948
#endif
949
950