CoCalc -- astcenc_averages_and

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/astcenc/astcenc_averages_and_directions.cpp
⁹⁹⁰² views
1
// SPDX-License-Identifier: Apache-2.0
2
// ----------------------------------------------------------------------------
3
// Copyright 2011-2025 Arm Limited
4
//
5
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6
// use this file except in compliance with the License. You may obtain a copy
7
// of the License at:
8
//
9
//     http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14
// License for the specific language governing permissions and limitations
15
// under the License.
16
// ----------------------------------------------------------------------------
17

18
/**
19
 * @brief Functions for finding dominant direction of a set of colors.
20
 */
21
#if !defined(ASTCENC_DECOMPRESS_ONLY)
22

23
#include "astcenc_internal.h"
24

25
#include <cassert>
26

27
/**
28
 * @brief Compute the average RGB color of each partition.
29
 *
30
 * The algorithm here uses a vectorized sequential scan and per-partition
31
 * color accumulators, using select() to mask texel lanes in other partitions.
32
 *
33
 * We only accumulate sums for N-1 partitions during the scan; the value for
34
 * the last partition can be computed given that we know the block-wide average
35
 * already.
36
 *
37
 * Because of this we could reduce the loop iteration count so it "just" spans
38
 * the max texel index needed for the N-1 partitions, which could need fewer
39
 * iterations than the full block texel count. However, this makes the loop
40
 * count erratic and causes more branch mispredictions so is a net loss.
41
 *
42
 * @param      pi         The partitioning to use.
43
 * @param      blk        The block data to process.
44
 * @param[out] averages   The output averages. Unused partition indices will
45
 *                        not be initialized, and lane<3> will be zero.
46
 */
47
static void compute_partition_averages_rgb(
48
	const partition_info& pi,
49
	const image_block& blk,
50
	vfloat4 averages[BLOCK_MAX_PARTITIONS]
51
) {
52
	unsigned int partition_count = pi.partition_count;
53
	size_t texel_count = blk.texel_count;
54
	promise(texel_count > 0);
55

56
	// For 1 partition just use the precomputed mean
57
	if (partition_count == 1)
58
	{
59
		averages[0] = blk.data_mean.swz<0, 1, 2>();
60
	}
61
	// For 2 partitions scan results for partition 0, compute partition 1
62
	else if (partition_count == 2)
63
	{
64
		vfloatacc pp_avg_rgb[3] {};
65

66
		vint lane_id = vint::lane_id();
67
		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
68
		{
69
			vint texel_partition(pi.partition_of_texel + i);
70

71
			vmask lane_mask = lane_id < vint_from_size(texel_count);
72
			lane_id += vint(ASTCENC_SIMD_WIDTH);
73

74
			vmask p0_mask = lane_mask & (texel_partition == vint(0));
75

76
			vfloat data_r = loada(blk.data_r + i);
77
			haccumulate(pp_avg_rgb[0], data_r, p0_mask);
78

79
			vfloat data_g = loada(blk.data_g + i);
80
			haccumulate(pp_avg_rgb[1], data_g, p0_mask);
81

82
			vfloat data_b = loada(blk.data_b + i);
83
			haccumulate(pp_avg_rgb[2], data_b, p0_mask);
84
		}
85

86
		vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
87

88
		vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0]),
89
		                           hadd_s(pp_avg_rgb[1]),
90
		                           hadd_s(pp_avg_rgb[2]));
91

92
		vfloat4 p1_total = block_total - p0_total;
93

94
		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
95
		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
96
	}
97
	// For 3 partitions scan results for partition 0/1, compute partition 2
98
	else if (partition_count == 3)
99
	{
100
		vfloatacc pp_avg_rgb[2][3] {};
101

102
		vint lane_id = vint::lane_id();
103
		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
104
		{
105
			vint texel_partition(pi.partition_of_texel + i);
106

107
			vmask lane_mask = lane_id < vint_from_size(texel_count);
108
			lane_id += vint(ASTCENC_SIMD_WIDTH);
109

110
			vmask p0_mask = lane_mask & (texel_partition == vint(0));
111
			vmask p1_mask = lane_mask & (texel_partition == vint(1));
112

113
			vfloat data_r = loada(blk.data_r + i);
114
			haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
115
			haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
116

117
			vfloat data_g = loada(blk.data_g + i);
118
			haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
119
			haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
120

121
			vfloat data_b = loada(blk.data_b + i);
122
			haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
123
			haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
124
		}
125

126
		vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
127

128
		vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
129
		                           hadd_s(pp_avg_rgb[0][1]),
130
		                           hadd_s(pp_avg_rgb[0][2]));
131

132
		vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
133
		                           hadd_s(pp_avg_rgb[1][1]),
134
		                           hadd_s(pp_avg_rgb[1][2]));
135

136
		vfloat4 p2_total = block_total - p0_total - p1_total;
137

138
		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
139
		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
140
		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
141
	}
142
	else
143
	{
144
		// For 4 partitions scan results for partition 0/1/2, compute partition 3
145
		vfloatacc pp_avg_rgb[3][3] {};
146

147
		vint lane_id = vint::lane_id();
148
		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
149
		{
150
			vint texel_partition(pi.partition_of_texel + i);
151

152
			vmask lane_mask = lane_id < vint_from_size(texel_count);
153
			lane_id += vint(ASTCENC_SIMD_WIDTH);
154

155
			vmask p0_mask = lane_mask & (texel_partition == vint(0));
156
			vmask p1_mask = lane_mask & (texel_partition == vint(1));
157
			vmask p2_mask = lane_mask & (texel_partition == vint(2));
158

159
			vfloat data_r = loada(blk.data_r + i);
160
			haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
161
			haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
162
			haccumulate(pp_avg_rgb[2][0], data_r, p2_mask);
163

164
			vfloat data_g = loada(blk.data_g + i);
165
			haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
166
			haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
167
			haccumulate(pp_avg_rgb[2][1], data_g, p2_mask);
168

169
			vfloat data_b = loada(blk.data_b + i);
170
			haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
171
			haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
172
			haccumulate(pp_avg_rgb[2][2], data_b, p2_mask);
173
		}
174

175
		vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
176

177
		vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
178
		                           hadd_s(pp_avg_rgb[0][1]),
179
		                           hadd_s(pp_avg_rgb[0][2]));
180

181
		vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
182
		                           hadd_s(pp_avg_rgb[1][1]),
183
		                           hadd_s(pp_avg_rgb[1][2]));
184

185
		vfloat4 p2_total = vfloat3(hadd_s(pp_avg_rgb[2][0]),
186
		                           hadd_s(pp_avg_rgb[2][1]),
187
		                           hadd_s(pp_avg_rgb[2][2]));
188

189
		vfloat4 p3_total = block_total - p0_total - p1_total- p2_total;
190

191
		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
192
		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
193
		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
194
		averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
195
	}
196
}
197

198
/**
199
 * @brief Compute the average RGBA color of each partition.
200
 *
201
 * The algorithm here uses a vectorized sequential scan and per-partition
202
 * color accumulators, using select() to mask texel lanes in other partitions.
203
 *
204
 * We only accumulate sums for N-1 partitions during the scan; the value for
205
 * the last partition can be computed given that we know the block-wide average
206
 * already.
207
 *
208
 * Because of this we could reduce the loop iteration count so it "just" spans
209
 * the max texel index needed for the N-1 partitions, which could need fewer
210
 * iterations than the full block texel count. However, this makes the loop
211
 * count erratic and causes more branch mispredictions so is a net loss.
212
 *
213
 * @param      pi         The partitioning to use.
214
 * @param      blk        The block data to process.
215
 * @param[out] averages   The output averages. Unused partition indices will
216
 *                        not be initialized.
217
 */
218
static void compute_partition_averages_rgba(
219
	const partition_info& pi,
220
	const image_block& blk,
221
	vfloat4 averages[BLOCK_MAX_PARTITIONS]
222
) {
223
	unsigned int partition_count = pi.partition_count;
224
	size_t texel_count = blk.texel_count;
225
	promise(texel_count > 0);
226

227
	// For 1 partition just use the precomputed mean
228
	if (partition_count == 1)
229
	{
230
		averages[0] = blk.data_mean;
231
	}
232
	// For 2 partitions scan results for partition 0, compute partition 1
233
	else if (partition_count == 2)
234
	{
235
		vfloat4 pp_avg_rgba[4] {};
236

237
		vint lane_id = vint::lane_id();
238
		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
239
		{
240
			vint texel_partition(pi.partition_of_texel + i);
241

242
			vmask lane_mask = lane_id < vint_from_size(texel_count);
243
			lane_id += vint(ASTCENC_SIMD_WIDTH);
244

245
			vmask p0_mask = lane_mask & (texel_partition == vint(0));
246

247
			vfloat data_r = loada(blk.data_r + i);
248
			haccumulate(pp_avg_rgba[0], data_r, p0_mask);
249

250
			vfloat data_g = loada(blk.data_g + i);
251
			haccumulate(pp_avg_rgba[1], data_g, p0_mask);
252

253
			vfloat data_b = loada(blk.data_b + i);
254
			haccumulate(pp_avg_rgba[2], data_b, p0_mask);
255

256
			vfloat data_a = loada(blk.data_a + i);
257
			haccumulate(pp_avg_rgba[3], data_a, p0_mask);
258
		}
259

260
		vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
261

262
		vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0]),
263
		                           hadd_s(pp_avg_rgba[1]),
264
		                           hadd_s(pp_avg_rgba[2]),
265
		                           hadd_s(pp_avg_rgba[3]));
266

267
		vfloat4 p1_total = block_total - p0_total;
268

269
		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
270
		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
271
	}
272
	// For 3 partitions scan results for partition 0/1, compute partition 2
273
	else if (partition_count == 3)
274
	{
275
		vfloat4 pp_avg_rgba[2][4] {};
276

277
		vint lane_id = vint::lane_id();
278
		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
279
		{
280
			vint texel_partition(pi.partition_of_texel + i);
281

282
			vmask lane_mask = lane_id < vint_from_size(texel_count);
283
			lane_id += vint(ASTCENC_SIMD_WIDTH);
284

285
			vmask p0_mask = lane_mask & (texel_partition == vint(0));
286
			vmask p1_mask = lane_mask & (texel_partition == vint(1));
287

288
			vfloat data_r = loada(blk.data_r + i);
289
			haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
290
			haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
291

292
			vfloat data_g = loada(blk.data_g + i);
293
			haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
294
			haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
295

296
			vfloat data_b = loada(blk.data_b + i);
297
			haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
298
			haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
299

300
			vfloat data_a = loada(blk.data_a + i);
301
			haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
302
			haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
303
		}
304

305
		vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
306

307
		vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
308
		                           hadd_s(pp_avg_rgba[0][1]),
309
		                           hadd_s(pp_avg_rgba[0][2]),
310
		                           hadd_s(pp_avg_rgba[0][3]));
311

312
		vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
313
		                           hadd_s(pp_avg_rgba[1][1]),
314
		                           hadd_s(pp_avg_rgba[1][2]),
315
		                           hadd_s(pp_avg_rgba[1][3]));
316

317
		vfloat4 p2_total = block_total - p0_total - p1_total;
318

319
		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
320
		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
321
		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
322
	}
323
	else
324
	{
325
		// For 4 partitions scan results for partition 0/1/2, compute partition 3
326
		vfloat4 pp_avg_rgba[3][4] {};
327

328
		vint lane_id = vint::lane_id();
329
		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
330
		{
331
			vint texel_partition(pi.partition_of_texel + i);
332

333
			vmask lane_mask = lane_id < vint_from_size(texel_count);
334
			lane_id += vint(ASTCENC_SIMD_WIDTH);
335

336
			vmask p0_mask = lane_mask & (texel_partition == vint(0));
337
			vmask p1_mask = lane_mask & (texel_partition == vint(1));
338
			vmask p2_mask = lane_mask & (texel_partition == vint(2));
339

340
			vfloat data_r = loada(blk.data_r + i);
341
			haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
342
			haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
343
			haccumulate(pp_avg_rgba[2][0], data_r, p2_mask);
344

345
			vfloat data_g = loada(blk.data_g + i);
346
			haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
347
			haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
348
			haccumulate(pp_avg_rgba[2][1], data_g, p2_mask);
349

350
			vfloat data_b = loada(blk.data_b + i);
351
			haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
352
			haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
353
			haccumulate(pp_avg_rgba[2][2], data_b, p2_mask);
354

355
			vfloat data_a = loada(blk.data_a + i);
356
			haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
357
			haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
358
			haccumulate(pp_avg_rgba[2][3], data_a, p2_mask);
359
		}
360

361
		vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
362

363
		vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
364
		                           hadd_s(pp_avg_rgba[0][1]),
365
		                           hadd_s(pp_avg_rgba[0][2]),
366
		                           hadd_s(pp_avg_rgba[0][3]));
367

368
		vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
369
		                           hadd_s(pp_avg_rgba[1][1]),
370
		                           hadd_s(pp_avg_rgba[1][2]),
371
		                           hadd_s(pp_avg_rgba[1][3]));
372

373
		vfloat4 p2_total = vfloat4(hadd_s(pp_avg_rgba[2][0]),
374
		                           hadd_s(pp_avg_rgba[2][1]),
375
		                           hadd_s(pp_avg_rgba[2][2]),
376
		                           hadd_s(pp_avg_rgba[2][3]));
377

378
		vfloat4 p3_total = block_total - p0_total - p1_total- p2_total;
379

380
		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
381
		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
382
		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
383
		averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
384
	}
385
}
386

387
/* See header for documentation. */
388
void compute_avgs_and_dirs_4_comp(
389
	const partition_info& pi,
390
	const image_block& blk,
391
	partition_metrics pm[BLOCK_MAX_PARTITIONS]
392
) {
393
	size_t partition_count = pi.partition_count;
394
	promise(partition_count > 0);
395

396
	// Pre-compute partition_averages
397
	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
398
	compute_partition_averages_rgba(pi, blk, partition_averages);
399

400
	for (size_t partition = 0; partition < partition_count; partition++)
401
	{
402
		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
403
		size_t texel_count = pi.partition_texel_count[partition];
404
		promise(texel_count > 0);
405

406
		vfloat4 average = partition_averages[partition];
407
		pm[partition].avg = average;
408

409
		vfloat4 sum_xp = vfloat4::zero();
410
		vfloat4 sum_yp = vfloat4::zero();
411
		vfloat4 sum_zp = vfloat4::zero();
412
		vfloat4 sum_wp = vfloat4::zero();
413

414
		for (size_t i = 0; i < texel_count; i++)
415
		{
416
			unsigned int iwt = texel_indexes[i];
417
			vfloat4 texel_datum = blk.texel(iwt);
418
			texel_datum = texel_datum - average;
419

420
			vfloat4 zero = vfloat4::zero();
421

422
			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
423
			sum_xp += select(zero, texel_datum, tdm0);
424

425
			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
426
			sum_yp += select(zero, texel_datum, tdm1);
427

428
			vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
429
			sum_zp += select(zero, texel_datum, tdm2);
430

431
			vmask4 tdm3 = texel_datum.swz<3,3,3,3>() > zero;
432
			sum_wp += select(zero, texel_datum, tdm3);
433
		}
434

435
		vfloat4 prod_xp = dot(sum_xp, sum_xp);
436
		vfloat4 prod_yp = dot(sum_yp, sum_yp);
437
		vfloat4 prod_zp = dot(sum_zp, sum_zp);
438
		vfloat4 prod_wp = dot(sum_wp, sum_wp);
439

440
		vfloat4 best_vector = sum_xp;
441
		vfloat4 best_sum = prod_xp;
442

443
		vmask4 mask = prod_yp > best_sum;
444
		best_vector = select(best_vector, sum_yp, mask);
445
		best_sum = select(best_sum, prod_yp, mask);
446

447
		mask = prod_zp > best_sum;
448
		best_vector = select(best_vector, sum_zp, mask);
449
		best_sum = select(best_sum, prod_zp, mask);
450

451
		mask = prod_wp > best_sum;
452
		best_vector = select(best_vector, sum_wp, mask);
453

454
		pm[partition].dir = best_vector;
455
	}
456
}
457

458
/* See header for documentation. */
459
void compute_avgs_and_dirs_3_comp(
460
	const partition_info& pi,
461
	const image_block& blk,
462
	unsigned int omitted_component,
463
	partition_metrics pm[BLOCK_MAX_PARTITIONS]
464
) {
465
	// Pre-compute partition_averages
466
	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
467
	compute_partition_averages_rgba(pi, blk, partition_averages);
468

469
	const float* data_vr = blk.data_r;
470
	const float* data_vg = blk.data_g;
471
	const float* data_vb = blk.data_b;
472

473
	// TODO: Data-driven permute would be useful to avoid this ...
474
	if (omitted_component == 0)
475
	{
476
		partition_averages[0] = partition_averages[0].swz<1, 2, 3>();
477
		partition_averages[1] = partition_averages[1].swz<1, 2, 3>();
478
		partition_averages[2] = partition_averages[2].swz<1, 2, 3>();
479
		partition_averages[3] = partition_averages[3].swz<1, 2, 3>();
480

481
		data_vr = blk.data_g;
482
		data_vg = blk.data_b;
483
		data_vb = blk.data_a;
484
	}
485
	else if (omitted_component == 1)
486
	{
487
		partition_averages[0] = partition_averages[0].swz<0, 2, 3>();
488
		partition_averages[1] = partition_averages[1].swz<0, 2, 3>();
489
		partition_averages[2] = partition_averages[2].swz<0, 2, 3>();
490
		partition_averages[3] = partition_averages[3].swz<0, 2, 3>();
491

492
		data_vg = blk.data_b;
493
		data_vb = blk.data_a;
494
	}
495
	else if (omitted_component == 2)
496
	{
497
		partition_averages[0] = partition_averages[0].swz<0, 1, 3>();
498
		partition_averages[1] = partition_averages[1].swz<0, 1, 3>();
499
		partition_averages[2] = partition_averages[2].swz<0, 1, 3>();
500
		partition_averages[3] = partition_averages[3].swz<0, 1, 3>();
501

502
		data_vb = blk.data_a;
503
	}
504
	else
505
	{
506
		partition_averages[0] = partition_averages[0].swz<0, 1, 2>();
507
		partition_averages[1] = partition_averages[1].swz<0, 1, 2>();
508
		partition_averages[2] = partition_averages[2].swz<0, 1, 2>();
509
		partition_averages[3] = partition_averages[3].swz<0, 1, 2>();
510
	}
511

512
	size_t partition_count = pi.partition_count;
513
	promise(partition_count > 0);
514

515
	for (size_t partition = 0; partition < partition_count; partition++)
516
	{
517
		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
518
		size_t texel_count = pi.partition_texel_count[partition];
519
		promise(texel_count > 0);
520

521
		vfloat4 average = partition_averages[partition];
522
		pm[partition].avg = average;
523

524
		vfloat4 sum_xp = vfloat4::zero();
525
		vfloat4 sum_yp = vfloat4::zero();
526
		vfloat4 sum_zp = vfloat4::zero();
527

528
		for (size_t i = 0; i < texel_count; i++)
529
		{
530
			unsigned int iwt = texel_indexes[i];
531

532
			vfloat4 texel_datum = vfloat3(data_vr[iwt],
533
			                              data_vg[iwt],
534
			                              data_vb[iwt]);
535
			texel_datum = texel_datum - average;
536

537
			vfloat4 zero = vfloat4::zero();
538

539
			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
540
			sum_xp += select(zero, texel_datum, tdm0);
541

542
			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
543
			sum_yp += select(zero, texel_datum, tdm1);
544

545
			vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
546
			sum_zp += select(zero, texel_datum, tdm2);
547
		}
548

549
		vfloat4 prod_xp = dot(sum_xp, sum_xp);
550
		vfloat4 prod_yp = dot(sum_yp, sum_yp);
551
		vfloat4 prod_zp = dot(sum_zp, sum_zp);
552

553
		vfloat4 best_vector = sum_xp;
554
		vfloat4 best_sum = prod_xp;
555

556
		vmask4 mask = prod_yp > best_sum;
557
		best_vector = select(best_vector, sum_yp, mask);
558
		best_sum = select(best_sum, prod_yp, mask);
559

560
		mask = prod_zp > best_sum;
561
		best_vector = select(best_vector, sum_zp, mask);
562

563
		pm[partition].dir = best_vector;
564
	}
565
}
566

567
/* See header for documentation. */
568
void compute_avgs_and_dirs_3_comp_rgb(
569
	const partition_info& pi,
570
	const image_block& blk,
571
	partition_metrics pm[BLOCK_MAX_PARTITIONS]
572
) {
573
	size_t partition_count = pi.partition_count;
574
	promise(partition_count > 0);
575

576
	// Pre-compute partition_averages
577
	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
578
	compute_partition_averages_rgb(pi, blk, partition_averages);
579

580
	for (size_t partition = 0; partition < partition_count; partition++)
581
	{
582
		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
583
		size_t texel_count = pi.partition_texel_count[partition];
584
		promise(texel_count > 0);
585

586
		vfloat4 average = partition_averages[partition];
587
		pm[partition].avg = average;
588

589
		vfloat4 sum_xp = vfloat4::zero();
590
		vfloat4 sum_yp = vfloat4::zero();
591
		vfloat4 sum_zp = vfloat4::zero();
592

593
		for (size_t i = 0; i < texel_count; i++)
594
		{
595
			unsigned int iwt = texel_indexes[i];
596

597
			vfloat4 texel_datum = blk.texel3(iwt);
598
			texel_datum = texel_datum - average;
599

600
			vfloat4 zero = vfloat4::zero();
601

602
			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
603
			sum_xp += select(zero, texel_datum, tdm0);
604

605
			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
606
			sum_yp += select(zero, texel_datum, tdm1);
607

608
			vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
609
			sum_zp += select(zero, texel_datum, tdm2);
610
		}
611

612
		vfloat4 prod_xp = dot(sum_xp, sum_xp);
613
		vfloat4 prod_yp = dot(sum_yp, sum_yp);
614
		vfloat4 prod_zp = dot(sum_zp, sum_zp);
615

616
		vfloat4 best_vector = sum_xp;
617
		vfloat4 best_sum = prod_xp;
618

619
		vmask4 mask = prod_yp > best_sum;
620
		best_vector = select(best_vector, sum_yp, mask);
621
		best_sum = select(best_sum, prod_yp, mask);
622

623
		mask = prod_zp > best_sum;
624
		best_vector = select(best_vector, sum_zp, mask);
625

626
		pm[partition].dir = best_vector;
627
	}
628
}
629

630
/* See header for documentation. */
631
void compute_avgs_and_dirs_2_comp(
632
	const partition_info& pt,
633
	const image_block& blk,
634
	unsigned int component1,
635
	unsigned int component2,
636
	partition_metrics pm[BLOCK_MAX_PARTITIONS]
637
) {
638
	vfloat4 average;
639

640
	const float* data_vr = nullptr;
641
	const float* data_vg = nullptr;
642

643
	if (component1 == 0 && component2 == 1)
644
	{
645
		average = blk.data_mean.swz<0, 1>();
646

647
		data_vr = blk.data_r;
648
		data_vg = blk.data_g;
649
	}
650
	else if (component1 == 0 && component2 == 2)
651
	{
652
		average = blk.data_mean.swz<0, 2>();
653

654
		data_vr = blk.data_r;
655
		data_vg = blk.data_b;
656
	}
657
	else // (component1 == 1 && component2 == 2)
658
	{
659
		assert(component1 == 1 && component2 == 2);
660

661
		average = blk.data_mean.swz<1, 2>();
662

663
		data_vr = blk.data_g;
664
		data_vg = blk.data_b;
665
	}
666

667
	size_t partition_count = pt.partition_count;
668
	promise(partition_count > 0);
669

670
	for (size_t partition = 0; partition < partition_count; partition++)
671
	{
672
		const uint8_t *texel_indexes = pt.texels_of_partition[partition];
673
		size_t texel_count = pt.partition_texel_count[partition];
674
		promise(texel_count > 0);
675

676
		// Only compute a partition mean if more than one partition
677
		if (partition_count > 1)
678
		{
679
			average = vfloat4::zero();
680
			for (size_t i = 0; i < texel_count; i++)
681
			{
682
				unsigned int iwt = texel_indexes[i];
683
				average += vfloat2(data_vr[iwt], data_vg[iwt]);
684
			}
685

686
			average = average / static_cast<float>(texel_count);
687
		}
688

689
		pm[partition].avg = average;
690

691
		vfloat4 sum_xp = vfloat4::zero();
692
		vfloat4 sum_yp = vfloat4::zero();
693

694
		for (size_t i = 0; i < texel_count; i++)
695
		{
696
			unsigned int iwt = texel_indexes[i];
697
			vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]);
698
			texel_datum = texel_datum - average;
699

700
			vfloat4 zero = vfloat4::zero();
701

702
			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
703
			sum_xp += select(zero, texel_datum, tdm0);
704

705
			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
706
			sum_yp += select(zero, texel_datum, tdm1);
707
		}
708

709
		vfloat4 prod_xp = dot(sum_xp, sum_xp);
710
		vfloat4 prod_yp = dot(sum_yp, sum_yp);
711

712
		vfloat4 best_vector = sum_xp;
713
		vfloat4 best_sum = prod_xp;
714

715
		vmask4 mask = prod_yp > best_sum;
716
		best_vector = select(best_vector, sum_yp, mask);
717

718
		pm[partition].dir = best_vector;
719
	}
720
}
721

722
/* See header for documentation. */
723
void compute_error_squared_rgba(
724
	const partition_info& pi,
725
	const image_block& blk,
726
	const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
727
	const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
728
	float line_lengths[BLOCK_MAX_PARTITIONS],
729
	float& uncor_error,
730
	float& samec_error
731
) {
732
	size_t partition_count = pi.partition_count;
733
	promise(partition_count > 0);
734

735
	vfloatacc uncor_errorsumv = vfloatacc::zero();
736
	vfloatacc samec_errorsumv = vfloatacc::zero();
737

738
	for (size_t partition = 0; partition < partition_count; partition++)
739
	{
740
		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
741

742
		processed_line4 l_uncor = uncor_plines[partition];
743
		processed_line4 l_samec = samec_plines[partition];
744

745
		size_t texel_count = pi.partition_texel_count[partition];
746
		promise(texel_count > 0);
747

748
		// Vectorize some useful scalar inputs
749
		vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
750
		vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
751
		vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
752
		vfloat l_uncor_bs3(l_uncor.bs.lane<3>());
753

754
		vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
755
		vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
756
		vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
757
		vfloat l_uncor_amod3(l_uncor.amod.lane<3>());
758

759
		vfloat l_samec_bs0(l_samec.bs.lane<0>());
760
		vfloat l_samec_bs1(l_samec.bs.lane<1>());
761
		vfloat l_samec_bs2(l_samec.bs.lane<2>());
762
		vfloat l_samec_bs3(l_samec.bs.lane<3>());
763

764
		assert(all(l_samec.amod == vfloat4(0.0f)));
765

766
		vfloat uncor_loparamv(1e10f);
767
		vfloat uncor_hiparamv(-1e10f);
768

769
		vfloat ew_r(blk.channel_weight.lane<0>());
770
		vfloat ew_g(blk.channel_weight.lane<1>());
771
		vfloat ew_b(blk.channel_weight.lane<2>());
772
		vfloat ew_a(blk.channel_weight.lane<3>());
773

774
		// This implementation over-shoots, but this is safe as we initialize the texel_indexes
775
		// array to extend the last value. This means min/max are not impacted, but we need to mask
776
		// out the dummy values when we compute the line weighting.
777
		vint lane_ids = vint::lane_id();
778
		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
779
		{
780
			vmask mask = lane_ids < vint_from_size(texel_count);
781
			const uint8_t* texel_idxs = texel_indexes + i;
782

783
			vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
784
			vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
785
			vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);
786
			vfloat data_a = gatherf_byte_inds<vfloat>(blk.data_a, texel_idxs);
787

788
			vfloat uncor_param = (data_r * l_uncor_bs0)
789
			                   + (data_g * l_uncor_bs1)
790
			                   + (data_b * l_uncor_bs2)
791
			                   + (data_a * l_uncor_bs3);
792

793
			uncor_loparamv = min(uncor_param, uncor_loparamv);
794
			uncor_hiparamv = max(uncor_param, uncor_hiparamv);
795

796
			vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
797
			                   + (uncor_param * l_uncor_bs0);
798
			vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
799
			                   + (uncor_param * l_uncor_bs1);
800
			vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
801
			                   + (uncor_param * l_uncor_bs2);
802
			vfloat uncor_dist3 = (l_uncor_amod3 - data_a)
803
			                   + (uncor_param * l_uncor_bs3);
804

805
			vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
806
			                 + (ew_g * uncor_dist1 * uncor_dist1)
807
			                 + (ew_b * uncor_dist2 * uncor_dist2)
808
			                 + (ew_a * uncor_dist3 * uncor_dist3);
809

810
			haccumulate(uncor_errorsumv, uncor_err, mask);
811

812
			// Process samechroma data
813
			vfloat samec_param = (data_r * l_samec_bs0)
814
			                   + (data_g * l_samec_bs1)
815
			                   + (data_b * l_samec_bs2)
816
			                   + (data_a * l_samec_bs3);
817

818
			vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
819
			vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
820
			vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
821
			vfloat samec_dist3 = samec_param * l_samec_bs3 - data_a;
822

823
			vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
824
			                 + (ew_g * samec_dist1 * samec_dist1)
825
			                 + (ew_b * samec_dist2 * samec_dist2)
826
			                 + (ew_a * samec_dist3 * samec_dist3);
827

828
			haccumulate(samec_errorsumv, samec_err, mask);
829

830
			lane_ids += vint(ASTCENC_SIMD_WIDTH);
831
		}
832

833
		// Turn very small numbers and NaNs into a small number
834
		float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
835
		line_lengths[partition] = astc::max(uncor_linelen, 1e-7f);
836
	}
837

838
	uncor_error = hadd_s(uncor_errorsumv);
839
	samec_error = hadd_s(samec_errorsumv);
840
}
841

842
/* See header for documentation. */
843
void compute_error_squared_rgb(
844
	const partition_info& pi,
845
	const image_block& blk,
846
	partition_lines3 plines[BLOCK_MAX_PARTITIONS],
847
	float& uncor_error,
848
	float& samec_error
849
) {
850
	size_t partition_count = pi.partition_count;
851
	promise(partition_count > 0);
852

853
	vfloatacc uncor_errorsumv = vfloatacc::zero();
854
	vfloatacc samec_errorsumv = vfloatacc::zero();
855

856
	for (size_t partition = 0; partition < partition_count; partition++)
857
	{
858
		partition_lines3& pl = plines[partition];
859
		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
860
		size_t texel_count = pi.partition_texel_count[partition];
861
		promise(texel_count > 0);
862

863
		processed_line3 l_uncor = pl.uncor_pline;
864
		processed_line3 l_samec = pl.samec_pline;
865

866
		// Vectorize some useful scalar inputs
867
		vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
868
		vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
869
		vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
870

871
		vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
872
		vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
873
		vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
874

875
		vfloat l_samec_bs0(l_samec.bs.lane<0>());
876
		vfloat l_samec_bs1(l_samec.bs.lane<1>());
877
		vfloat l_samec_bs2(l_samec.bs.lane<2>());
878

879
		assert(all(l_samec.amod == vfloat4(0.0f)));
880

881
		vfloat uncor_loparamv(1e10f);
882
		vfloat uncor_hiparamv(-1e10f);
883

884
		vfloat ew_r(blk.channel_weight.lane<0>());
885
		vfloat ew_g(blk.channel_weight.lane<1>());
886
		vfloat ew_b(blk.channel_weight.lane<2>());
887

888
		// This implementation over-shoots, but this is safe as we initialize the weights array
889
		// to extend the last value. This means min/max are not impacted, but we need to mask
890
		// out the dummy values when we compute the line weighting.
891
		vint lane_ids = vint::lane_id();
892
		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
893
		{
894
			vmask mask = lane_ids < vint_from_size(texel_count);
895
			const uint8_t* texel_idxs = texel_indexes + i;
896

897
			vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
898
			vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
899
			vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);
900

901
			vfloat uncor_param = (data_r * l_uncor_bs0)
902
			                   + (data_g * l_uncor_bs1)
903
			                   + (data_b * l_uncor_bs2);
904

905
			uncor_loparamv = min(uncor_param, uncor_loparamv);
906
			uncor_hiparamv = max(uncor_param, uncor_hiparamv);
907

908
			vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
909
			                   + (uncor_param * l_uncor_bs0);
910
			vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
911
			                   + (uncor_param * l_uncor_bs1);
912
			vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
913
			                   + (uncor_param * l_uncor_bs2);
914

915
			vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
916
			                 + (ew_g * uncor_dist1 * uncor_dist1)
917
			                 + (ew_b * uncor_dist2 * uncor_dist2);
918

919
			haccumulate(uncor_errorsumv, uncor_err, mask);
920

921
			// Process samechroma data
922
			vfloat samec_param = (data_r * l_samec_bs0)
923
			                   + (data_g * l_samec_bs1)
924
			                   + (data_b * l_samec_bs2);
925

926
			vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
927
			vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
928
			vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
929

930
			vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
931
			                 + (ew_g * samec_dist1 * samec_dist1)
932
			                 + (ew_b * samec_dist2 * samec_dist2);
933

934
			haccumulate(samec_errorsumv, samec_err, mask);
935

936
			lane_ids += vint(ASTCENC_SIMD_WIDTH);
937
		}
938

939
		// Turn very small numbers and NaNs into a small number
940
		float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
941
		pl.line_length = astc::max(uncor_linelen, 1e-7f);
942
	}
943

944
	uncor_error = hadd_s(uncor_errorsumv);
945
	samec_error = hadd_s(samec_errorsumv);
946
}
947

948
#endif
949

950
Product

Resources

Company