Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/astcenc/astcenc_ideal_endpoints_and_weights.cpp
9896 views
1
// SPDX-License-Identifier: Apache-2.0
2
// ----------------------------------------------------------------------------
3
// Copyright 2011-2024 Arm Limited
4
//
5
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6
// use this file except in compliance with the License. You may obtain a copy
7
// of the License at:
8
//
9
// http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14
// License for the specific language governing permissions and limitations
15
// under the License.
16
// ----------------------------------------------------------------------------
17
18
#if !defined(ASTCENC_DECOMPRESS_ONLY)
19
20
/**
21
* @brief Functions for computing color endpoints and texel weights.
22
*/
23
24
#include <cassert>
25
26
#include "astcenc_internal.h"
27
#include "astcenc_vecmathlib.h"
28
29
/**
30
* @brief Compute the infilled weight for N texel indices in a decimated grid.
31
*
32
* @param di The weight grid decimation to use.
33
* @param weights The decimated weight values to use.
34
* @param index The first texel index to interpolate.
35
*
36
* @return The interpolated weight for the given set of SIMD_WIDTH texels.
37
*/
38
static vfloat bilinear_infill_vla(
39
const decimation_info& di,
40
const float* weights,
41
unsigned int index
42
) {
43
// Load the bilinear filter texel weight indexes in the decimated grid
44
const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index;
45
const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index;
46
const uint8_t* weight_idx2 = di.texel_weights_tr[2] + index;
47
const uint8_t* weight_idx3 = di.texel_weights_tr[3] + index;
48
49
// Load the bilinear filter weights from the decimated grid
50
vfloat weight_val0 = gatherf_byte_inds<vfloat>(weights, weight_idx0);
51
vfloat weight_val1 = gatherf_byte_inds<vfloat>(weights, weight_idx1);
52
vfloat weight_val2 = gatherf_byte_inds<vfloat>(weights, weight_idx2);
53
vfloat weight_val3 = gatherf_byte_inds<vfloat>(weights, weight_idx3);
54
55
// Load the weight contribution factors for each decimated weight
56
vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
57
vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[1] + index);
58
vfloat tex_weight_float2 = loada(di.texel_weight_contribs_float_tr[2] + index);
59
vfloat tex_weight_float3 = loada(di.texel_weight_contribs_float_tr[3] + index);
60
61
// Compute the bilinear interpolation to generate the per-texel weight
62
return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1) +
63
(weight_val2 * tex_weight_float2 + weight_val3 * tex_weight_float3);
64
}
65
66
/**
67
* @brief Compute the infilled weight for N texel indices in a decimated grid.
68
*
69
* This is specialized version which computes only two weights per texel for
70
* encodings that are only decimated in a single axis.
71
*
72
* @param di The weight grid decimation to use.
73
* @param weights The decimated weight values to use.
74
* @param index The first texel index to interpolate.
75
*
76
* @return The interpolated weight for the given set of SIMD_WIDTH texels.
77
*/
78
static vfloat bilinear_infill_vla_2(
79
const decimation_info& di,
80
const float* weights,
81
unsigned int index
82
) {
83
// Load the bilinear filter texel weight indexes in the decimated grid
84
const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index;
85
const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index;
86
87
// Load the bilinear filter weights from the decimated grid
88
vfloat weight_val0 = gatherf_byte_inds<vfloat>(weights, weight_idx0);
89
vfloat weight_val1 = gatherf_byte_inds<vfloat>(weights, weight_idx1);
90
91
// Load the weight contribution factors for each decimated weight
92
vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
93
vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[1] + index);
94
95
// Compute the bilinear interpolation to generate the per-texel weight
96
return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1);
97
}
98
99
/**
100
* @brief Compute the ideal endpoints and weights for 1 color component.
101
*
102
* @param blk The image block color data to compress.
103
* @param pi The partition info for the current trial.
104
* @param[out] ei The computed ideal endpoints and weights.
105
* @param component The color component to compute.
106
*/
107
static void compute_ideal_colors_and_weights_1_comp(
108
const image_block& blk,
109
const partition_info& pi,
110
endpoints_and_weights& ei,
111
unsigned int component
112
) {
113
unsigned int partition_count = pi.partition_count;
114
ei.ep.partition_count = partition_count;
115
promise(partition_count > 0);
116
117
unsigned int texel_count = blk.texel_count;
118
promise(texel_count > 0);
119
120
float error_weight;
121
const float* data_vr = nullptr;
122
123
assert(component < BLOCK_MAX_COMPONENTS);
124
switch (component)
125
{
126
case 0:
127
error_weight = blk.channel_weight.lane<0>();
128
data_vr = blk.data_r;
129
break;
130
case 1:
131
error_weight = blk.channel_weight.lane<1>();
132
data_vr = blk.data_g;
133
break;
134
case 2:
135
error_weight = blk.channel_weight.lane<2>();
136
data_vr = blk.data_b;
137
break;
138
default:
139
assert(component == 3);
140
error_weight = blk.channel_weight.lane<3>();
141
data_vr = blk.data_a;
142
break;
143
}
144
145
vmask4 sep_mask = vint4::lane_id() == vint4(component);
146
bool is_constant_wes { true };
147
float partition0_len_sq { 0.0f };
148
149
for (unsigned int i = 0; i < partition_count; i++)
150
{
151
float lowvalue { 1e10f };
152
float highvalue { -1e10f };
153
154
unsigned int partition_texel_count = pi.partition_texel_count[i];
155
for (unsigned int j = 0; j < partition_texel_count; j++)
156
{
157
unsigned int tix = pi.texels_of_partition[i][j];
158
float value = data_vr[tix];
159
lowvalue = astc::min(value, lowvalue);
160
highvalue = astc::max(value, highvalue);
161
}
162
163
if (highvalue <= lowvalue)
164
{
165
lowvalue = 0.0f;
166
highvalue = 1e-7f;
167
}
168
169
float length = highvalue - lowvalue;
170
float length_squared = length * length;
171
float scale = 1.0f / length;
172
173
if (i == 0)
174
{
175
partition0_len_sq = length_squared;
176
}
177
else
178
{
179
is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
180
}
181
182
for (unsigned int j = 0; j < partition_texel_count; j++)
183
{
184
unsigned int tix = pi.texels_of_partition[i][j];
185
float value = (data_vr[tix] - lowvalue) * scale;
186
value = astc::clamp1f(value);
187
188
ei.weights[tix] = value;
189
ei.weight_error_scale[tix] = length_squared * error_weight;
190
assert(!astc::isnan(ei.weight_error_scale[tix]));
191
}
192
193
ei.ep.endpt0[i] = select(blk.data_min, vfloat4(lowvalue), sep_mask);
194
ei.ep.endpt1[i] = select(blk.data_max, vfloat4(highvalue), sep_mask);
195
}
196
197
// Zero initialize any SIMD over-fetch
198
size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
199
for (size_t i = texel_count; i < texel_count_simd; i++)
200
{
201
ei.weights[i] = 0.0f;
202
ei.weight_error_scale[i] = 0.0f;
203
}
204
205
ei.is_constant_weight_error_scale = is_constant_wes;
206
}
207
208
/**
209
* @brief Compute the ideal endpoints and weights for 2 color components.
210
*
211
* @param blk The image block color data to compress.
212
* @param pi The partition info for the current trial.
213
* @param[out] ei The computed ideal endpoints and weights.
214
* @param component1 The first color component to compute.
215
* @param component2 The second color component to compute.
216
*/
217
static void compute_ideal_colors_and_weights_2_comp(
218
const image_block& blk,
219
const partition_info& pi,
220
endpoints_and_weights& ei,
221
int component1,
222
int component2
223
) {
224
unsigned int partition_count = pi.partition_count;
225
ei.ep.partition_count = partition_count;
226
promise(partition_count > 0);
227
228
unsigned int texel_count = blk.texel_count;
229
promise(texel_count > 0);
230
231
partition_metrics pms[BLOCK_MAX_PARTITIONS];
232
233
float error_weight;
234
const float* data_vr = nullptr;
235
const float* data_vg = nullptr;
236
237
if (component1 == 0 && component2 == 1)
238
{
239
error_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f;
240
241
data_vr = blk.data_r;
242
data_vg = blk.data_g;
243
}
244
else if (component1 == 0 && component2 == 2)
245
{
246
error_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f;
247
248
data_vr = blk.data_r;
249
data_vg = blk.data_b;
250
}
251
else // (component1 == 1 && component2 == 2)
252
{
253
assert(component1 == 1 && component2 == 2);
254
255
error_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f;
256
257
data_vr = blk.data_g;
258
data_vg = blk.data_b;
259
}
260
261
compute_avgs_and_dirs_2_comp(pi, blk, component1, component2, pms);
262
263
bool is_constant_wes { true };
264
float partition0_len_sq { 0.0f };
265
266
vmask4 comp1_mask = vint4::lane_id() == vint4(component1);
267
vmask4 comp2_mask = vint4::lane_id() == vint4(component2);
268
269
for (unsigned int i = 0; i < partition_count; i++)
270
{
271
vfloat4 dir = pms[i].dir;
272
if (hadd_s(dir) < 0.0f)
273
{
274
dir = vfloat4::zero() - dir;
275
}
276
277
line2 line { pms[i].avg, normalize_safe(dir, unit2()) };
278
float lowparam { 1e10f };
279
float highparam { -1e10f };
280
281
unsigned int partition_texel_count = pi.partition_texel_count[i];
282
for (unsigned int j = 0; j < partition_texel_count; j++)
283
{
284
unsigned int tix = pi.texels_of_partition[i][j];
285
vfloat4 point = vfloat2(data_vr[tix], data_vg[tix]);
286
float param = dot_s(point - line.a, line.b);
287
ei.weights[tix] = param;
288
289
lowparam = astc::min(param, lowparam);
290
highparam = astc::max(param, highparam);
291
}
292
293
// It is possible for a uniform-color partition to produce length=0;
294
// this causes NaN issues so set to small value to avoid this problem
295
if (highparam <= lowparam)
296
{
297
lowparam = 0.0f;
298
highparam = 1e-7f;
299
}
300
301
float length = highparam - lowparam;
302
float length_squared = length * length;
303
float scale = 1.0f / length;
304
305
if (i == 0)
306
{
307
partition0_len_sq = length_squared;
308
}
309
else
310
{
311
is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
312
}
313
314
for (unsigned int j = 0; j < partition_texel_count; j++)
315
{
316
unsigned int tix = pi.texels_of_partition[i][j];
317
float idx = (ei.weights[tix] - lowparam) * scale;
318
idx = astc::clamp1f(idx);
319
320
ei.weights[tix] = idx;
321
ei.weight_error_scale[tix] = length_squared * error_weight;
322
assert(!astc::isnan(ei.weight_error_scale[tix]));
323
}
324
325
vfloat4 lowvalue = line.a + line.b * lowparam;
326
vfloat4 highvalue = line.a + line.b * highparam;
327
328
vfloat4 ep0 = select(blk.data_min, vfloat4(lowvalue.lane<0>()), comp1_mask);
329
vfloat4 ep1 = select(blk.data_max, vfloat4(highvalue.lane<0>()), comp1_mask);
330
331
ei.ep.endpt0[i] = select(ep0, vfloat4(lowvalue.lane<1>()), comp2_mask);
332
ei.ep.endpt1[i] = select(ep1, vfloat4(highvalue.lane<1>()), comp2_mask);
333
}
334
335
// Zero initialize any SIMD over-fetch
336
size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
337
for (size_t i = texel_count; i < texel_count_simd; i++)
338
{
339
ei.weights[i] = 0.0f;
340
ei.weight_error_scale[i] = 0.0f;
341
}
342
343
ei.is_constant_weight_error_scale = is_constant_wes;
344
}
345
346
/**
347
* @brief Compute the ideal endpoints and weights for 3 color components.
348
*
349
* @param blk The image block color data to compress.
350
* @param pi The partition info for the current trial.
351
* @param[out] ei The computed ideal endpoints and weights.
352
* @param omitted_component The color component excluded from the calculation.
353
*/
354
static void compute_ideal_colors_and_weights_3_comp(
355
const image_block& blk,
356
const partition_info& pi,
357
endpoints_and_weights& ei,
358
unsigned int omitted_component
359
) {
360
unsigned int partition_count = pi.partition_count;
361
ei.ep.partition_count = partition_count;
362
promise(partition_count > 0);
363
364
unsigned int texel_count = blk.texel_count;
365
promise(texel_count > 0);
366
367
partition_metrics pms[BLOCK_MAX_PARTITIONS];
368
369
float error_weight;
370
const float* data_vr = nullptr;
371
const float* data_vg = nullptr;
372
const float* data_vb = nullptr;
373
if (omitted_component == 0)
374
{
375
error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>());
376
data_vr = blk.data_g;
377
data_vg = blk.data_b;
378
data_vb = blk.data_a;
379
}
380
else if (omitted_component == 1)
381
{
382
error_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>());
383
data_vr = blk.data_r;
384
data_vg = blk.data_b;
385
data_vb = blk.data_a;
386
}
387
else if (omitted_component == 2)
388
{
389
error_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>());
390
data_vr = blk.data_r;
391
data_vg = blk.data_g;
392
data_vb = blk.data_a;
393
}
394
else
395
{
396
assert(omitted_component == 3);
397
398
error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>());
399
data_vr = blk.data_r;
400
data_vg = blk.data_g;
401
data_vb = blk.data_b;
402
}
403
404
error_weight = error_weight * (1.0f / 3.0f);
405
406
if (omitted_component == 3)
407
{
408
compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
409
}
410
else
411
{
412
compute_avgs_and_dirs_3_comp(pi, blk, omitted_component, pms);
413
}
414
415
bool is_constant_wes { true };
416
float partition0_len_sq { 0.0f };
417
418
for (unsigned int i = 0; i < partition_count; i++)
419
{
420
vfloat4 dir = pms[i].dir;
421
if (hadd_rgb_s(dir) < 0.0f)
422
{
423
dir = vfloat4::zero() - dir;
424
}
425
426
line3 line { pms[i].avg, normalize_safe(dir, unit3()) };
427
float lowparam { 1e10f };
428
float highparam { -1e10f };
429
430
unsigned int partition_texel_count = pi.partition_texel_count[i];
431
for (unsigned int j = 0; j < partition_texel_count; j++)
432
{
433
unsigned int tix = pi.texels_of_partition[i][j];
434
vfloat4 point = vfloat3(data_vr[tix], data_vg[tix], data_vb[tix]);
435
float param = dot3_s(point - line.a, line.b);
436
ei.weights[tix] = param;
437
438
lowparam = astc::min(param, lowparam);
439
highparam = astc::max(param, highparam);
440
}
441
442
// It is possible for a uniform-color partition to produce length=0;
443
// this causes NaN issues so set to small value to avoid this problem
444
if (highparam <= lowparam)
445
{
446
lowparam = 0.0f;
447
highparam = 1e-7f;
448
}
449
450
float length = highparam - lowparam;
451
float length_squared = length * length;
452
float scale = 1.0f / length;
453
454
if (i == 0)
455
{
456
partition0_len_sq = length_squared;
457
}
458
else
459
{
460
is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
461
}
462
463
for (unsigned int j = 0; j < partition_texel_count; j++)
464
{
465
unsigned int tix = pi.texels_of_partition[i][j];
466
float idx = (ei.weights[tix] - lowparam) * scale;
467
idx = astc::clamp1f(idx);
468
469
ei.weights[tix] = idx;
470
ei.weight_error_scale[tix] = length_squared * error_weight;
471
assert(!astc::isnan(ei.weight_error_scale[tix]));
472
}
473
474
vfloat4 ep0 = line.a + line.b * lowparam;
475
vfloat4 ep1 = line.a + line.b * highparam;
476
477
vfloat4 bmin = blk.data_min;
478
vfloat4 bmax = blk.data_max;
479
480
assert(omitted_component < BLOCK_MAX_COMPONENTS);
481
switch (omitted_component)
482
{
483
case 0:
484
ei.ep.endpt0[i] = vfloat4(bmin.lane<0>(), ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>());
485
ei.ep.endpt1[i] = vfloat4(bmax.lane<0>(), ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>());
486
break;
487
case 1:
488
ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), bmin.lane<1>(), ep0.lane<1>(), ep0.lane<2>());
489
ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), bmax.lane<1>(), ep1.lane<1>(), ep1.lane<2>());
490
break;
491
case 2:
492
ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), bmin.lane<2>(), ep0.lane<2>());
493
ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), bmax.lane<2>(), ep1.lane<2>());
494
break;
495
default:
496
ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), bmin.lane<3>());
497
ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>(), bmax.lane<3>());
498
break;
499
}
500
}
501
502
// Zero initialize any SIMD over-fetch
503
size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
504
for (size_t i = texel_count; i < texel_count_simd; i++)
505
{
506
ei.weights[i] = 0.0f;
507
ei.weight_error_scale[i] = 0.0f;
508
}
509
510
ei.is_constant_weight_error_scale = is_constant_wes;
511
}
512
513
/**
514
* @brief Compute the ideal endpoints and weights for 4 color components.
515
*
516
* @param blk The image block color data to compress.
517
* @param pi The partition info for the current trial.
518
* @param[out] ei The computed ideal endpoints and weights.
519
*/
520
static void compute_ideal_colors_and_weights_4_comp(
521
const image_block& blk,
522
const partition_info& pi,
523
endpoints_and_weights& ei
524
) {
525
const float error_weight = hadd_s(blk.channel_weight) / 4.0f;
526
527
unsigned int partition_count = pi.partition_count;
528
529
unsigned int texel_count = blk.texel_count;
530
promise(texel_count > 0);
531
promise(partition_count > 0);
532
533
partition_metrics pms[BLOCK_MAX_PARTITIONS];
534
535
compute_avgs_and_dirs_4_comp(pi, blk, pms);
536
537
bool is_constant_wes { true };
538
float partition0_len_sq { 0.0f };
539
540
for (unsigned int i = 0; i < partition_count; i++)
541
{
542
vfloat4 dir = pms[i].dir;
543
if (hadd_rgb_s(dir) < 0.0f)
544
{
545
dir = vfloat4::zero() - dir;
546
}
547
548
line4 line { pms[i].avg, normalize_safe(dir, unit4()) };
549
float lowparam { 1e10f };
550
float highparam { -1e10f };
551
552
unsigned int partition_texel_count = pi.partition_texel_count[i];
553
for (unsigned int j = 0; j < partition_texel_count; j++)
554
{
555
unsigned int tix = pi.texels_of_partition[i][j];
556
vfloat4 point = blk.texel(tix);
557
float param = dot_s(point - line.a, line.b);
558
ei.weights[tix] = param;
559
560
lowparam = astc::min(param, lowparam);
561
highparam = astc::max(param, highparam);
562
}
563
564
// It is possible for a uniform-color partition to produce length=0;
565
// this causes NaN issues so set to small value to avoid this problem
566
if (highparam <= lowparam)
567
{
568
lowparam = 0.0f;
569
highparam = 1e-7f;
570
}
571
572
float length = highparam - lowparam;
573
float length_squared = length * length;
574
float scale = 1.0f / length;
575
576
if (i == 0)
577
{
578
partition0_len_sq = length_squared;
579
}
580
else
581
{
582
is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
583
}
584
585
ei.ep.endpt0[i] = line.a + line.b * lowparam;
586
ei.ep.endpt1[i] = line.a + line.b * highparam;
587
588
for (unsigned int j = 0; j < partition_texel_count; j++)
589
{
590
unsigned int tix = pi.texels_of_partition[i][j];
591
float idx = (ei.weights[tix] - lowparam) * scale;
592
idx = astc::clamp1f(idx);
593
594
ei.weights[tix] = idx;
595
ei.weight_error_scale[tix] = length_squared * error_weight;
596
assert(!astc::isnan(ei.weight_error_scale[tix]));
597
}
598
}
599
600
// Zero initialize any SIMD over-fetch
601
size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
602
for (size_t i = texel_count; i < texel_count_simd; i++)
603
{
604
ei.weights[i] = 0.0f;
605
ei.weight_error_scale[i] = 0.0f;
606
}
607
608
ei.is_constant_weight_error_scale = is_constant_wes;
609
}
610
611
/* See header for documentation. */
612
void compute_ideal_colors_and_weights_1plane(
613
const image_block& blk,
614
const partition_info& pi,
615
endpoints_and_weights& ei
616
) {
617
bool uses_alpha = !blk.is_constant_channel(3);
618
619
if (uses_alpha)
620
{
621
compute_ideal_colors_and_weights_4_comp(blk, pi, ei);
622
}
623
else
624
{
625
compute_ideal_colors_and_weights_3_comp(blk, pi, ei, 3);
626
}
627
}
628
629
/* See header for documentation. */
630
void compute_ideal_colors_and_weights_2planes(
631
const block_size_descriptor& bsd,
632
const image_block& blk,
633
unsigned int plane2_component,
634
endpoints_and_weights& ei1,
635
endpoints_and_weights& ei2
636
) {
637
const auto& pi = bsd.get_partition_info(1, 0);
638
bool uses_alpha = !blk.is_constant_channel(3);
639
640
assert(plane2_component < BLOCK_MAX_COMPONENTS);
641
switch (plane2_component)
642
{
643
case 0: // Separate weights for red
644
if (uses_alpha)
645
{
646
compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 0);
647
}
648
else
649
{
650
compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 1, 2);
651
}
652
compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 0);
653
break;
654
655
case 1: // Separate weights for green
656
if (uses_alpha)
657
{
658
compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 1);
659
}
660
else
661
{
662
compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 2);
663
}
664
compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 1);
665
break;
666
667
case 2: // Separate weights for blue
668
if (uses_alpha)
669
{
670
compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 2);
671
}
672
else
673
{
674
compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 1);
675
}
676
compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 2);
677
break;
678
679
default: // Separate weights for alpha
680
assert(uses_alpha);
681
compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 3);
682
compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 3);
683
break;
684
}
685
}
686
687
/* See header for documentation. */
688
float compute_error_of_weight_set_1plane(
689
const endpoints_and_weights& eai,
690
const decimation_info& di,
691
const float* dec_weight_quant_uvalue
692
) {
693
vfloatacc error_summav = vfloatacc::zero();
694
unsigned int texel_count = di.texel_count;
695
promise(texel_count > 0);
696
697
// Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
698
if (di.max_texel_weight_count > 2)
699
{
700
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
701
{
702
// Compute the bilinear interpolation of the decimated weight grid
703
vfloat current_values = bilinear_infill_vla(di, dec_weight_quant_uvalue, i);
704
705
// Compute the error between the computed value and the ideal weight
706
vfloat actual_values = loada(eai.weights + i);
707
vfloat diff = current_values - actual_values;
708
vfloat significance = loada(eai.weight_error_scale + i);
709
vfloat error = diff * diff * significance;
710
711
haccumulate(error_summav, error);
712
}
713
}
714
else if (di.max_texel_weight_count > 1)
715
{
716
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
717
{
718
// Compute the bilinear interpolation of the decimated weight grid
719
vfloat current_values = bilinear_infill_vla_2(di, dec_weight_quant_uvalue, i);
720
721
// Compute the error between the computed value and the ideal weight
722
vfloat actual_values = loada(eai.weights + i);
723
vfloat diff = current_values - actual_values;
724
vfloat significance = loada(eai.weight_error_scale + i);
725
vfloat error = diff * diff * significance;
726
727
haccumulate(error_summav, error);
728
}
729
}
730
else
731
{
732
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
733
{
734
// Load the weight set directly, without interpolation
735
vfloat current_values = loada(dec_weight_quant_uvalue + i);
736
737
// Compute the error between the computed value and the ideal weight
738
vfloat actual_values = loada(eai.weights + i);
739
vfloat diff = current_values - actual_values;
740
vfloat significance = loada(eai.weight_error_scale + i);
741
vfloat error = diff * diff * significance;
742
743
haccumulate(error_summav, error);
744
}
745
}
746
747
// Resolve the final scalar accumulator sum
748
return hadd_s(error_summav);
749
}
750
751
/* See header for documentation. */
752
float compute_error_of_weight_set_2planes(
753
const endpoints_and_weights& eai1,
754
const endpoints_and_weights& eai2,
755
const decimation_info& di,
756
const float* dec_weight_quant_uvalue_plane1,
757
const float* dec_weight_quant_uvalue_plane2
758
) {
759
vfloatacc error_summav = vfloatacc::zero();
760
unsigned int texel_count = di.texel_count;
761
promise(texel_count > 0);
762
763
// Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
764
if (di.max_texel_weight_count > 2)
765
{
766
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
767
{
768
// Plane 1
769
// Compute the bilinear interpolation of the decimated weight grid
770
vfloat current_values1 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane1, i);
771
772
// Compute the error between the computed value and the ideal weight
773
vfloat actual_values1 = loada(eai1.weights + i);
774
vfloat diff = current_values1 - actual_values1;
775
vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
776
777
// Plane 2
778
// Compute the bilinear interpolation of the decimated weight grid
779
vfloat current_values2 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane2, i);
780
781
// Compute the error between the computed value and the ideal weight
782
vfloat actual_values2 = loada(eai2.weights + i);
783
diff = current_values2 - actual_values2;
784
vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
785
786
haccumulate(error_summav, error1 + error2);
787
}
788
}
789
else if (di.max_texel_weight_count > 1)
790
{
791
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
792
{
793
// Plane 1
794
// Compute the bilinear interpolation of the decimated weight grid
795
vfloat current_values1 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane1, i);
796
797
// Compute the error between the computed value and the ideal weight
798
vfloat actual_values1 = loada(eai1.weights + i);
799
vfloat diff = current_values1 - actual_values1;
800
vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
801
802
// Plane 2
803
// Compute the bilinear interpolation of the decimated weight grid
804
vfloat current_values2 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane2, i);
805
806
// Compute the error between the computed value and the ideal weight
807
vfloat actual_values2 = loada(eai2.weights + i);
808
diff = current_values2 - actual_values2;
809
vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
810
811
haccumulate(error_summav, error1 + error2);
812
}
813
}
814
else
815
{
816
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
817
{
818
// Plane 1
819
// Load the weight set directly, without interpolation
820
vfloat current_values1 = loada(dec_weight_quant_uvalue_plane1 + i);
821
822
// Compute the error between the computed value and the ideal weight
823
vfloat actual_values1 = loada(eai1.weights + i);
824
vfloat diff = current_values1 - actual_values1;
825
vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
826
827
// Plane 2
828
// Load the weight set directly, without interpolation
829
vfloat current_values2 = loada(dec_weight_quant_uvalue_plane2 + i);
830
831
// Compute the error between the computed value and the ideal weight
832
vfloat actual_values2 = loada(eai2.weights + i);
833
diff = current_values2 - actual_values2;
834
vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
835
836
haccumulate(error_summav, error1 + error2);
837
}
838
}
839
840
// Resolve the final scalar accumulator sum
841
return hadd_s(error_summav);
842
}
843
844
/* See header for documentation. */
845
void compute_ideal_weights_for_decimation(
846
const endpoints_and_weights& ei,
847
const decimation_info& di,
848
float* dec_weight_ideal_value
849
) {
850
unsigned int texel_count = di.texel_count;
851
unsigned int weight_count = di.weight_count;
852
bool is_direct = texel_count == weight_count;
853
promise(texel_count > 0);
854
promise(weight_count > 0);
855
856
// If we have a 1:1 mapping just shortcut the computation. Transfer enough to also copy the
857
// zero-initialized SIMD over-fetch region
858
if (is_direct)
859
{
860
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
861
{
862
vfloat weight(ei.weights + i);
863
storea(weight, dec_weight_ideal_value + i);
864
}
865
866
return;
867
}
868
869
// Otherwise compute an estimate and perform single refinement iteration
870
871
// Compute an initial average for each decimated weight
872
bool constant_wes = ei.is_constant_weight_error_scale;
873
vfloat weight_error_scale(ei.weight_error_scale[0]);
874
875
// This overshoots - this is OK as we initialize the array tails in the
876
// decimation table structures to safe values ...
877
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
878
{
879
// Start with a small value to avoid div-by-zero later
880
vfloat weight_weight(1e-10f);
881
vfloat initial_weight = vfloat::zero();
882
883
// Accumulate error weighting of all the texels using this weight
884
vint weight_texel_count(di.weight_texel_count + i);
885
unsigned int max_texel_count = hmax_s(weight_texel_count);
886
promise(max_texel_count > 0);
887
888
for (unsigned int j = 0; j < max_texel_count; j++)
889
{
890
const uint8_t* texel = di.weight_texels_tr[j] + i;
891
vfloat weight = loada(di.weights_texel_contribs_tr[j] + i);
892
893
if (!constant_wes)
894
{
895
weight_error_scale = gatherf_byte_inds<vfloat>(ei.weight_error_scale, texel);
896
}
897
898
vfloat contrib_weight = weight * weight_error_scale;
899
900
weight_weight += contrib_weight;
901
initial_weight += gatherf_byte_inds<vfloat>(ei.weights, texel) * contrib_weight;
902
}
903
904
storea(initial_weight / weight_weight, dec_weight_ideal_value + i);
905
}
906
907
// Populate the interpolated weight grid based on the initial average
908
// Process SIMD-width texel coordinates at at time while we can. Safe to
909
// over-process full SIMD vectors - the tail is zeroed.
910
ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS];
911
if (di.max_texel_weight_count <= 2)
912
{
913
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
914
{
915
vfloat weight = bilinear_infill_vla_2(di, dec_weight_ideal_value, i);
916
storea(weight, infilled_weights + i);
917
}
918
}
919
else
920
{
921
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
922
{
923
vfloat weight = bilinear_infill_vla(di, dec_weight_ideal_value, i);
924
storea(weight, infilled_weights + i);
925
}
926
}
927
928
// Perform a single iteration of refinement
929
// Empirically determined step size; larger values don't help but smaller drops image quality
930
constexpr float stepsize = 0.25f;
931
constexpr float chd_scale = -WEIGHTS_TEXEL_SUM;
932
933
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
934
{
935
vfloat weight_val = loada(dec_weight_ideal_value + i);
936
937
// Accumulate error weighting of all the texels using this weight
938
// Start with a small value to avoid div-by-zero later
939
vfloat error_change0(1e-10f);
940
vfloat error_change1(0.0f);
941
942
// Accumulate error weighting of all the texels using this weight
943
vint weight_texel_count(di.weight_texel_count + i);
944
unsigned int max_texel_count = hmax_s(weight_texel_count);
945
promise(max_texel_count > 0);
946
947
for (unsigned int j = 0; j < max_texel_count; j++)
948
{
949
const uint8_t* texel = di.weight_texels_tr[j] + i;
950
vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i);
951
952
if (!constant_wes)
953
{
954
weight_error_scale = gatherf_byte_inds<vfloat>(ei.weight_error_scale, texel);
955
}
956
957
vfloat scale = weight_error_scale * contrib_weight;
958
vfloat old_weight = gatherf_byte_inds<vfloat>(infilled_weights, texel);
959
vfloat ideal_weight = gatherf_byte_inds<vfloat>(ei.weights, texel);
960
961
error_change0 += contrib_weight * scale;
962
error_change1 += (old_weight - ideal_weight) * scale;
963
}
964
965
vfloat step = (error_change1 * chd_scale) / error_change0;
966
step = clamp(-stepsize, stepsize, step);
967
968
// Update the weight; note this can store negative values
969
storea(weight_val + step, dec_weight_ideal_value + i);
970
}
971
}
972
973
/* See header for documentation. */
974
void compute_quantized_weights_for_decimation(
975
const decimation_info& di,
976
float low_bound,
977
float high_bound,
978
const float* dec_weight_ideal_value,
979
float* weight_set_out,
980
uint8_t* quantized_weight_set,
981
quant_method quant_level
982
) {
983
int weight_count = di.weight_count;
984
promise(weight_count > 0);
985
const quant_and_transfer_table& qat = quant_and_xfer_tables[quant_level];
986
987
// The available quant levels, stored with a minus 1 bias
988
static const float quant_levels_m1[12] {
989
1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 7.0f, 9.0f, 11.0f, 15.0f, 19.0f, 23.0f, 31.0f
990
};
991
992
vint steps_m1(get_quant_level(quant_level) - 1);
993
float quant_level_m1 = quant_levels_m1[quant_level];
994
995
// Quantize the weight set using both the specified low/high bounds and standard 0..1 bounds
996
997
// TODO: Oddity to investigate; triggered by test in issue #265.
998
if (high_bound <= low_bound)
999
{
1000
low_bound = 0.0f;
1001
high_bound = 1.0f;
1002
}
1003
1004
float rscale = high_bound - low_bound;
1005
float scale = 1.0f / rscale;
1006
1007
float scaled_low_bound = low_bound * scale;
1008
rscale *= 1.0f / 64.0f;
1009
1010
vfloat scalev(scale);
1011
vfloat scaled_low_boundv(scaled_low_bound);
1012
vfloat quant_level_m1v(quant_level_m1);
1013
vfloat rscalev(rscale);
1014
vfloat low_boundv(low_bound);
1015
1016
// This runs to the rounded-up SIMD size, which is safe as the loop tail is filled with known
1017
// safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements
1018
if (get_quant_level(quant_level) <= 16)
1019
{
1020
vtable_16x8 table;
1021
vtable_prepare(table, qat.quant_to_unquant);
1022
1023
for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
1024
{
1025
vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv;
1026
ix = clampzo(ix);
1027
1028
// Look up the two closest indexes and return the one that was closest
1029
vfloat ix1 = ix * quant_level_m1v;
1030
1031
vint weightl = float_to_int(ix1);
1032
vint weighth = min(weightl + vint(1), steps_m1);
1033
1034
vint ixli = vtable_lookup_32bit(table, weightl);
1035
vint ixhi = vtable_lookup_32bit(table, weighth);
1036
1037
vfloat ixl = int_to_float(ixli);
1038
vfloat ixh = int_to_float(ixhi);
1039
1040
vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix);
1041
vint weight = select(ixli, ixhi, mask);
1042
ixl = select(ixl, ixh, mask);
1043
1044
// Invert the weight-scaling that was done initially
1045
storea(ixl * rscalev + low_boundv, weight_set_out + i);
1046
pack_and_store_low_bytes(weight, quantized_weight_set + i);
1047
}
1048
}
1049
else
1050
{
1051
vtable_32x8 table;
1052
vtable_prepare(table, qat.quant_to_unquant);
1053
1054
for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
1055
{
1056
vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv;
1057
ix = clampzo(ix);
1058
1059
// Look up the two closest indexes and return the one that was closest
1060
vfloat ix1 = ix * quant_level_m1v;
1061
1062
vint weightl = float_to_int(ix1);
1063
vint weighth = min(weightl + vint(1), steps_m1);
1064
1065
vint ixli = vtable_lookup_32bit(table, weightl);
1066
vint ixhi = vtable_lookup_32bit(table, weighth);
1067
1068
vfloat ixl = int_to_float(ixli);
1069
vfloat ixh = int_to_float(ixhi);
1070
1071
vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix);
1072
vint weight = select(ixli, ixhi, mask);
1073
ixl = select(ixl, ixh, mask);
1074
1075
// Invert the weight-scaling that was done initially
1076
storea(ixl * rscalev + low_boundv, weight_set_out + i);
1077
pack_and_store_low_bytes(weight, quantized_weight_set + i);
1078
}
1079
}
1080
}
1081
1082
/**
1083
* @brief Compute the RGB + offset for a HDR endpoint mode #7.
1084
*
1085
* Since the matrix needed has a regular structure we can simplify the inverse calculation. This
1086
* gives us ~24 multiplications vs. 96 for a generic inverse.
1087
*
1088
* mat[0] = vfloat4(rgba_ws.x, 0.0f, 0.0f, wght_ws.x);
1089
* mat[1] = vfloat4( 0.0f, rgba_ws.y, 0.0f, wght_ws.y);
1090
* mat[2] = vfloat4( 0.0f, 0.0f, rgba_ws.z, wght_ws.z);
1091
* mat[3] = vfloat4(wght_ws.x, wght_ws.y, wght_ws.z, psum);
1092
* mat = invert(mat);
1093
*
1094
* @param rgba_weight_sum Sum of partition component error weights.
1095
* @param weight_weight_sum Sum of partition component error weights * texel weight.
1096
* @param rgbq_sum Sum of partition component error weights * texel weight * color data.
1097
* @param psum Sum of RGB color weights * texel weight^2.
1098
*/
1099
static inline vfloat4 compute_rgbo_vector(
1100
vfloat4 rgba_weight_sum,
1101
vfloat4 weight_weight_sum,
1102
vfloat4 rgbq_sum,
1103
float psum
1104
) {
1105
float X = rgba_weight_sum.lane<0>();
1106
float Y = rgba_weight_sum.lane<1>();
1107
float Z = rgba_weight_sum.lane<2>();
1108
float P = weight_weight_sum.lane<0>();
1109
float Q = weight_weight_sum.lane<1>();
1110
float R = weight_weight_sum.lane<2>();
1111
float S = psum;
1112
1113
float PP = P * P;
1114
float QQ = Q * Q;
1115
float RR = R * R;
1116
1117
float SZmRR = S * Z - RR;
1118
float DT = SZmRR * Y - Z * QQ;
1119
float YP = Y * P;
1120
float QX = Q * X;
1121
float YX = Y * X;
1122
float mZYP = -Z * YP;
1123
float mZQX = -Z * QX;
1124
float mRYX = -R * YX;
1125
float ZQP = Z * Q * P;
1126
float RYP = R * YP;
1127
float RQX = R * QX;
1128
1129
// Compute the reciprocal of matrix determinant
1130
float rdet = 1.0f / (DT * X + mZYP * P);
1131
1132
// Actually compute the adjugate, and then apply 1/det separately
1133
vfloat4 mat0(DT, ZQP, RYP, mZYP);
1134
vfloat4 mat1(ZQP, SZmRR * X - Z * PP, RQX, mZQX);
1135
vfloat4 mat2(RYP, RQX, (S * Y - QQ) * X - Y * PP, mRYX);
1136
vfloat4 mat3(mZYP, mZQX, mRYX, Z * YX);
1137
vfloat4 vect = rgbq_sum * rdet;
1138
1139
return vfloat4(dot_s(mat0, vect),
1140
dot_s(mat1, vect),
1141
dot_s(mat2, vect),
1142
dot_s(mat3, vect));
1143
}
1144
1145
/* See header for documentation. */
1146
void recompute_ideal_colors_1plane(
1147
const image_block& blk,
1148
const partition_info& pi,
1149
const decimation_info& di,
1150
const uint8_t* dec_weights_uquant,
1151
endpoints& ep,
1152
vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS],
1153
vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS]
1154
) {
1155
unsigned int weight_count = di.weight_count;
1156
unsigned int total_texel_count = blk.texel_count;
1157
unsigned int partition_count = pi.partition_count;
1158
1159
promise(weight_count > 0);
1160
promise(total_texel_count > 0);
1161
promise(partition_count > 0);
1162
1163
ASTCENC_ALIGNAS float dec_weight[BLOCK_MAX_WEIGHTS];
1164
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
1165
{
1166
vint unquant_value(dec_weights_uquant + i);
1167
vfloat unquant_valuef = int_to_float(unquant_value) * vfloat(1.0f / 64.0f);
1168
storea(unquant_valuef, dec_weight + i);
1169
}
1170
1171
ASTCENC_ALIGNAS float undec_weight[BLOCK_MAX_TEXELS];
1172
float* undec_weight_ref;
1173
if (di.max_texel_weight_count == 1)
1174
{
1175
undec_weight_ref = dec_weight;
1176
}
1177
else if (di.max_texel_weight_count <= 2)
1178
{
1179
for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1180
{
1181
vfloat weight = bilinear_infill_vla_2(di, dec_weight, i);
1182
storea(weight, undec_weight + i);
1183
}
1184
1185
undec_weight_ref = undec_weight;
1186
}
1187
else
1188
{
1189
for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1190
{
1191
vfloat weight = bilinear_infill_vla(di, dec_weight, i);
1192
storea(weight, undec_weight + i);
1193
}
1194
1195
undec_weight_ref = undec_weight;
1196
}
1197
1198
vfloat4 rgba_sum(blk.data_mean * static_cast<float>(blk.texel_count));
1199
1200
for (unsigned int i = 0; i < partition_count; i++)
1201
{
1202
unsigned int texel_count = pi.partition_texel_count[i];
1203
const uint8_t *texel_indexes = pi.texels_of_partition[i];
1204
1205
// Only compute a partition mean if more than one partition
1206
if (partition_count > 1)
1207
{
1208
rgba_sum = vfloat4::zero();
1209
promise(texel_count > 0);
1210
for (unsigned int j = 0; j < texel_count; j++)
1211
{
1212
unsigned int tix = texel_indexes[j];
1213
rgba_sum += blk.texel(tix);
1214
}
1215
}
1216
1217
rgba_sum = rgba_sum * blk.channel_weight;
1218
vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f);
1219
vfloat4 scale_dir = normalize((rgba_sum / rgba_weight_sum).swz<0, 1, 2>());
1220
1221
float scale_max = 0.0f;
1222
float scale_min = 1e10f;
1223
1224
float wmin1 = 1.0f;
1225
float wmax1 = 0.0f;
1226
1227
float left_sum_s = 0.0f;
1228
float middle_sum_s = 0.0f;
1229
float right_sum_s = 0.0f;
1230
1231
vfloat4 color_vec_x = vfloat4::zero();
1232
vfloat4 color_vec_y = vfloat4::zero();
1233
1234
vfloat4 scale_vec = vfloat4::zero();
1235
1236
float weight_weight_sum_s = 1e-17f;
1237
1238
vfloat4 color_weight = blk.channel_weight;
1239
float ls_weight = hadd_rgb_s(color_weight);
1240
1241
for (unsigned int j = 0; j < texel_count; j++)
1242
{
1243
unsigned int tix = texel_indexes[j];
1244
vfloat4 rgba = blk.texel(tix);
1245
1246
float idx0 = undec_weight_ref[tix];
1247
1248
float om_idx0 = 1.0f - idx0;
1249
wmin1 = astc::min(idx0, wmin1);
1250
wmax1 = astc::max(idx0, wmax1);
1251
1252
float scale = dot3_s(scale_dir, rgba);
1253
scale_min = astc::min(scale, scale_min);
1254
scale_max = astc::max(scale, scale_max);
1255
1256
left_sum_s += om_idx0 * om_idx0;
1257
middle_sum_s += om_idx0 * idx0;
1258
right_sum_s += idx0 * idx0;
1259
weight_weight_sum_s += idx0;
1260
1261
vfloat4 color_idx(idx0);
1262
vfloat4 cwprod = rgba;
1263
vfloat4 cwiprod = cwprod * color_idx;
1264
1265
color_vec_y += cwiprod;
1266
color_vec_x += cwprod - cwiprod;
1267
1268
scale_vec += vfloat2(om_idx0, idx0) * (scale * ls_weight);
1269
}
1270
1271
vfloat4 left_sum = vfloat4(left_sum_s) * color_weight;
1272
vfloat4 middle_sum = vfloat4(middle_sum_s) * color_weight;
1273
vfloat4 right_sum = vfloat4(right_sum_s) * color_weight;
1274
vfloat4 lmrs_sum = vfloat3(left_sum_s, middle_sum_s, right_sum_s) * ls_weight;
1275
1276
color_vec_x = color_vec_x * color_weight;
1277
color_vec_y = color_vec_y * color_weight;
1278
1279
// Initialize the luminance and scale vectors with a reasonable default
1280
float scalediv = scale_min / astc::max(scale_max, 1e-10f);
1281
scalediv = astc::clamp1f(scalediv);
1282
1283
vfloat4 sds = scale_dir * scale_max;
1284
1285
rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv);
1286
1287
if (wmin1 >= wmax1 * 0.999f)
1288
{
1289
// If all weights in the partition were equal, then just take average of all colors in
1290
// the partition and use that as both endpoint colors
1291
vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
1292
1293
vmask4 notnan_mask = avg == avg;
1294
ep.endpt0[i] = select(ep.endpt0[i], avg, notnan_mask);
1295
ep.endpt1[i] = select(ep.endpt1[i], avg, notnan_mask);
1296
1297
rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f);
1298
}
1299
else
1300
{
1301
// Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
1302
// set of texel weights and pixel colors
1303
vfloat4 color_det1 = (left_sum * right_sum) - (middle_sum * middle_sum);
1304
vfloat4 color_rdet1 = 1.0f / color_det1;
1305
1306
float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>());
1307
float ls_rdet1 = 1.0f / ls_det1;
1308
1309
vfloat4 color_mss1 = (left_sum * left_sum)
1310
+ (2.0f * middle_sum * middle_sum)
1311
+ (right_sum * right_sum);
1312
1313
float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>())
1314
+ (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>())
1315
+ (lmrs_sum.lane<2>() * lmrs_sum.lane<2>());
1316
1317
vfloat4 ep0 = (right_sum * color_vec_x - middle_sum * color_vec_y) * color_rdet1;
1318
vfloat4 ep1 = (left_sum * color_vec_y - middle_sum * color_vec_x) * color_rdet1;
1319
1320
vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f);
1321
vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
1322
vmask4 full_mask = det_mask & notnan_mask;
1323
1324
ep.endpt0[i] = select(ep.endpt0[i], ep0, full_mask);
1325
ep.endpt1[i] = select(ep.endpt1[i], ep1, full_mask);
1326
1327
float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1;
1328
float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1;
1329
1330
if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
1331
{
1332
float scalediv2 = scale_ep0 / scale_ep1;
1333
vfloat4 sdsm = scale_dir * scale_ep1;
1334
rgbs_vectors[i] = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
1335
}
1336
}
1337
1338
// Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR
1339
if (blk.rgb_lns[0] || blk.alpha_lns[0])
1340
{
1341
vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight;
1342
float psum = right_sum_s * hadd_rgb_s(color_weight);
1343
1344
vfloat4 rgbq_sum = color_vec_x + color_vec_y;
1345
rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
1346
1347
vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
1348
rgbo_vectors[i] = rgbovec;
1349
1350
// We can get a failure due to the use of a singular (non-invertible) matrix
1351
// If it failed, compute rgbo_vectors[] with a different method ...
1352
if (astc::isnan(dot_s(rgbovec, rgbovec)))
1353
{
1354
vfloat4 v0 = ep.endpt0[i];
1355
vfloat4 v1 = ep.endpt1[i];
1356
1357
float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
1358
avgdif = astc::max(avgdif, 0.0f);
1359
1360
vfloat4 avg = (v0 + v1) * 0.5f;
1361
vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
1362
rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
1363
}
1364
}
1365
}
1366
}
1367
1368
/* See header for documentation. */
1369
void recompute_ideal_colors_2planes(
1370
const image_block& blk,
1371
const block_size_descriptor& bsd,
1372
const decimation_info& di,
1373
const uint8_t* dec_weights_uquant_plane1,
1374
const uint8_t* dec_weights_uquant_plane2,
1375
endpoints& ep,
1376
vfloat4& rgbs_vector,
1377
vfloat4& rgbo_vector,
1378
int plane2_component
1379
) {
1380
unsigned int weight_count = di.weight_count;
1381
unsigned int total_texel_count = blk.texel_count;
1382
1383
promise(total_texel_count > 0);
1384
promise(weight_count > 0);
1385
1386
ASTCENC_ALIGNAS float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
1387
ASTCENC_ALIGNAS float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
1388
1389
assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE);
1390
1391
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
1392
{
1393
vint unquant_value1(dec_weights_uquant_plane1 + i);
1394
vfloat unquant_value1f = int_to_float(unquant_value1) * vfloat(1.0f / 64.0f);
1395
storea(unquant_value1f, dec_weight_plane1 + i);
1396
1397
vint unquant_value2(dec_weights_uquant_plane2 + i);
1398
vfloat unquant_value2f = int_to_float(unquant_value2) * vfloat(1.0f / 64.0f);
1399
storea(unquant_value2f, dec_weight_plane2 + i);
1400
}
1401
1402
ASTCENC_ALIGNAS float undec_weight_plane1[BLOCK_MAX_TEXELS];
1403
ASTCENC_ALIGNAS float undec_weight_plane2[BLOCK_MAX_TEXELS];
1404
1405
float* undec_weight_plane1_ref;
1406
float* undec_weight_plane2_ref;
1407
1408
if (di.max_texel_weight_count == 1)
1409
{
1410
undec_weight_plane1_ref = dec_weight_plane1;
1411
undec_weight_plane2_ref = dec_weight_plane2;
1412
}
1413
else if (di.max_texel_weight_count <= 2)
1414
{
1415
for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1416
{
1417
vfloat weight = bilinear_infill_vla_2(di, dec_weight_plane1, i);
1418
storea(weight, undec_weight_plane1 + i);
1419
1420
weight = bilinear_infill_vla_2(di, dec_weight_plane2, i);
1421
storea(weight, undec_weight_plane2 + i);
1422
}
1423
1424
undec_weight_plane1_ref = undec_weight_plane1;
1425
undec_weight_plane2_ref = undec_weight_plane2;
1426
}
1427
else
1428
{
1429
for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1430
{
1431
vfloat weight = bilinear_infill_vla(di, dec_weight_plane1, i);
1432
storea(weight, undec_weight_plane1 + i);
1433
1434
weight = bilinear_infill_vla(di, dec_weight_plane2, i);
1435
storea(weight, undec_weight_plane2 + i);
1436
}
1437
1438
undec_weight_plane1_ref = undec_weight_plane1;
1439
undec_weight_plane2_ref = undec_weight_plane2;
1440
}
1441
1442
unsigned int texel_count = bsd.texel_count;
1443
vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f);
1444
vfloat4 scale_dir = normalize(blk.data_mean.swz<0, 1, 2>());
1445
1446
float scale_max = 0.0f;
1447
float scale_min = 1e10f;
1448
1449
float wmin1 = 1.0f;
1450
float wmax1 = 0.0f;
1451
1452
float wmin2 = 1.0f;
1453
float wmax2 = 0.0f;
1454
1455
float left1_sum_s = 0.0f;
1456
float middle1_sum_s = 0.0f;
1457
float right1_sum_s = 0.0f;
1458
1459
float left2_sum_s = 0.0f;
1460
float middle2_sum_s = 0.0f;
1461
float right2_sum_s = 0.0f;
1462
1463
vfloat4 color_vec_x = vfloat4::zero();
1464
vfloat4 color_vec_y = vfloat4::zero();
1465
1466
vfloat4 scale_vec = vfloat4::zero();
1467
1468
vfloat4 weight_weight_sum = vfloat4(1e-17f);
1469
1470
vmask4 p2_mask = vint4::lane_id() == vint4(plane2_component);
1471
vfloat4 color_weight = blk.channel_weight;
1472
float ls_weight = hadd_rgb_s(color_weight);
1473
1474
for (unsigned int j = 0; j < texel_count; j++)
1475
{
1476
vfloat4 rgba = blk.texel(j);
1477
1478
float idx0 = undec_weight_plane1_ref[j];
1479
1480
float om_idx0 = 1.0f - idx0;
1481
wmin1 = astc::min(idx0, wmin1);
1482
wmax1 = astc::max(idx0, wmax1);
1483
1484
float scale = dot3_s(scale_dir, rgba);
1485
scale_min = astc::min(scale, scale_min);
1486
scale_max = astc::max(scale, scale_max);
1487
1488
left1_sum_s += om_idx0 * om_idx0;
1489
middle1_sum_s += om_idx0 * idx0;
1490
right1_sum_s += idx0 * idx0;
1491
1492
float idx1 = undec_weight_plane2_ref[j];
1493
1494
float om_idx1 = 1.0f - idx1;
1495
wmin2 = astc::min(idx1, wmin2);
1496
wmax2 = astc::max(idx1, wmax2);
1497
1498
left2_sum_s += om_idx1 * om_idx1;
1499
middle2_sum_s += om_idx1 * idx1;
1500
right2_sum_s += idx1 * idx1;
1501
1502
vfloat4 color_idx = select(vfloat4(idx0), vfloat4(idx1), p2_mask);
1503
1504
vfloat4 cwprod = rgba;
1505
vfloat4 cwiprod = cwprod * color_idx;
1506
1507
color_vec_y += cwiprod;
1508
color_vec_x += cwprod - cwiprod;
1509
1510
scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale);
1511
weight_weight_sum += color_idx;
1512
}
1513
1514
vfloat4 left1_sum = vfloat4(left1_sum_s) * color_weight;
1515
vfloat4 middle1_sum = vfloat4(middle1_sum_s) * color_weight;
1516
vfloat4 right1_sum = vfloat4(right1_sum_s) * color_weight;
1517
vfloat4 lmrs_sum = vfloat3(left1_sum_s, middle1_sum_s, right1_sum_s) * ls_weight;
1518
1519
vfloat4 left2_sum = vfloat4(left2_sum_s) * color_weight;
1520
vfloat4 middle2_sum = vfloat4(middle2_sum_s) * color_weight;
1521
vfloat4 right2_sum = vfloat4(right2_sum_s) * color_weight;
1522
1523
color_vec_x = color_vec_x * color_weight;
1524
color_vec_y = color_vec_y * color_weight;
1525
1526
// Initialize the luminance and scale vectors with a reasonable default
1527
float scalediv = scale_min / astc::max(scale_max, 1e-10f);
1528
scalediv = astc::clamp1f(scalediv);
1529
1530
vfloat4 sds = scale_dir * scale_max;
1531
1532
rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv);
1533
1534
if (wmin1 >= wmax1 * 0.999f)
1535
{
1536
// If all weights in the partition were equal, then just take average of all colors in
1537
// the partition and use that as both endpoint colors
1538
vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
1539
1540
vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component);
1541
vmask4 notnan_mask = avg == avg;
1542
vmask4 full_mask = p1_mask & notnan_mask;
1543
1544
ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask);
1545
ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask);
1546
1547
rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f);
1548
}
1549
else
1550
{
1551
// Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
1552
// set of texel weights and pixel colors
1553
vfloat4 color_det1 = (left1_sum * right1_sum) - (middle1_sum * middle1_sum);
1554
vfloat4 color_rdet1 = 1.0f / color_det1;
1555
1556
float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>());
1557
float ls_rdet1 = 1.0f / ls_det1;
1558
1559
vfloat4 color_mss1 = (left1_sum * left1_sum)
1560
+ (2.0f * middle1_sum * middle1_sum)
1561
+ (right1_sum * right1_sum);
1562
1563
float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>())
1564
+ (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>())
1565
+ (lmrs_sum.lane<2>() * lmrs_sum.lane<2>());
1566
1567
vfloat4 ep0 = (right1_sum * color_vec_x - middle1_sum * color_vec_y) * color_rdet1;
1568
vfloat4 ep1 = (left1_sum * color_vec_y - middle1_sum * color_vec_x) * color_rdet1;
1569
1570
float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1;
1571
float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1;
1572
1573
vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component);
1574
vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f);
1575
vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
1576
vmask4 full_mask = p1_mask & det_mask & notnan_mask;
1577
1578
ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask);
1579
ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
1580
1581
if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
1582
{
1583
float scalediv2 = scale_ep0 / scale_ep1;
1584
vfloat4 sdsm = scale_dir * scale_ep1;
1585
rgbs_vector = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
1586
}
1587
}
1588
1589
if (wmin2 >= wmax2 * 0.999f)
1590
{
1591
// If all weights in the partition were equal, then just take average of all colors in
1592
// the partition and use that as both endpoint colors
1593
vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
1594
1595
vmask4 notnan_mask = avg == avg;
1596
vmask4 full_mask = p2_mask & notnan_mask;
1597
1598
ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask);
1599
ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask);
1600
}
1601
else
1602
{
1603
// Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
1604
// set of texel weights and pixel colors
1605
vfloat4 color_det2 = (left2_sum * right2_sum) - (middle2_sum * middle2_sum);
1606
vfloat4 color_rdet2 = 1.0f / color_det2;
1607
1608
vfloat4 color_mss2 = (left2_sum * left2_sum)
1609
+ (2.0f * middle2_sum * middle2_sum)
1610
+ (right2_sum * right2_sum);
1611
1612
vfloat4 ep0 = (right2_sum * color_vec_x - middle2_sum * color_vec_y) * color_rdet2;
1613
vfloat4 ep1 = (left2_sum * color_vec_y - middle2_sum * color_vec_x) * color_rdet2;
1614
1615
vmask4 det_mask = abs(color_det2) > (color_mss2 * 1e-4f);
1616
vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
1617
vmask4 full_mask = p2_mask & det_mask & notnan_mask;
1618
1619
ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask);
1620
ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
1621
}
1622
1623
// Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR
1624
if (blk.rgb_lns[0] || blk.alpha_lns[0])
1625
{
1626
weight_weight_sum = weight_weight_sum * color_weight;
1627
float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight);
1628
1629
vfloat4 rgbq_sum = color_vec_x + color_vec_y;
1630
rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
1631
1632
rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
1633
1634
// We can get a failure due to the use of a singular (non-invertible) matrix
1635
// If it failed, compute rgbo_vectors[] with a different method ...
1636
if (astc::isnan(dot_s(rgbo_vector, rgbo_vector)))
1637
{
1638
vfloat4 v0 = ep.endpt0[0];
1639
vfloat4 v1 = ep.endpt1[0];
1640
1641
float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
1642
avgdif = astc::max(avgdif, 0.0f);
1643
1644
vfloat4 avg = (v0 + v1) * 0.5f;
1645
vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
1646
1647
rgbo_vector = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
1648
}
1649
}
1650
}
1651
1652
#endif
1653
1654