CoCalc -- astcenc_decompress

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/astcenc/astcenc_decompress_symbolic.cpp
21790 views
1
// SPDX-License-Identifier: Apache-2.0
2
// ----------------------------------------------------------------------------
3
// Copyright 2011-2024 Arm Limited
4
//
5
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6
// use this file except in compliance with the License. You may obtain a copy
7
// of the License at:
8
//
9
//     http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14
// License for the specific language governing permissions and limitations
15
// under the License.
16
// ----------------------------------------------------------------------------
17

18
/**
19
 * @brief Functions to decompress a symbolic block.
20
 */
21

22
#include "astcenc_internal.h"
23

24
#include <stdio.h>
25
#include <assert.h>
26

27
/**
28
 * @brief Compute the integer linear interpolation of two color endpoints.
29
 *
30
 * @param u8_mask       The mask for lanes using decode_unorm8 rather than decode_f16.
31
 * @param color0        The endpoint0 color.
32
 * @param color1        The endpoint1 color.
33
 * @param weights       The interpolation weight (between 0 and 64).
34
 *
35
 * @return The interpolated color.
36
 */
37
static vint4 lerp_color_int(
38
	vmask4 u8_mask,
39
	vint4 color0,
40
	vint4 color1,
41
	vint4 weights
42
) {
43
	vint4 weight1 = weights;
44
	vint4 weight0 = vint4(64) - weight1;
45

46
	vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32);
47
	color = asr<6>(color);
48

49
	// For decode_unorm8 values force the codec to bit replicate. This allows the
50
	// rest of the codec to assume the full 0xFFFF range for everything and ignore
51
	// the decode_mode setting
52
	vint4 color_u8 = asr<8>(color) * vint4(257);
53
	color = select(color, color_u8, u8_mask);
54

55
	return color;
56
}
57

58
/**
59
 * @brief Convert integer color value into a float value for the decoder.
60
 *
61
 * @param data       The integer color value post-interpolation.
62
 * @param lns_mask   If set treat lane as HDR (LNS) else LDR (unorm16).
63
 *
64
 * @return The float color value.
65
 */
66
static inline vfloat4 decode_texel(
67
	vint4 data,
68
	vmask4 lns_mask
69
) {
70
	vint4 color_lns = vint4::zero();
71
	vint4 color_unorm = vint4::zero();
72

73
	if (any(lns_mask))
74
	{
75
		color_lns = lns_to_sf16(data);
76
	}
77

78
	if (!all(lns_mask))
79
	{
80
		color_unorm = unorm16_to_sf16(data);
81
	}
82

83
	// Pick components and then convert to FP16
84
	vint4 datai = select(color_unorm, color_lns, lns_mask);
85
	return float16_to_float(datai);
86
}
87

88
/* See header for documentation. */
89
void unpack_weights(
90
	const block_size_descriptor& bsd,
91
	const symbolic_compressed_block& scb,
92
	const decimation_info& di,
93
	bool is_dual_plane,
94
	int weights_plane1[BLOCK_MAX_TEXELS],
95
	int weights_plane2[BLOCK_MAX_TEXELS]
96
) {
97
	// Safe to overshoot as all arrays are allocated to full size
98
	if (!is_dual_plane)
99
	{
100
		// Build full 64-entry weight lookup table
101
		vtable_64x8 table;
102
		vtable_prepare(table, scb.weights);
103

104
		for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
105
		{
106
			vint summed_value(8);
107
			vint weight_count(di.texel_weight_count + i);
108
			int max_weight_count = hmax_s(weight_count);
109

110
			promise(max_weight_count > 0);
111
			for (int j = 0; j < max_weight_count; j++)
112
			{
113
				vint texel_weights(di.texel_weights_tr[j] + i);
114
				vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
115

116
				summed_value += vtable_lookup_32bit(table, texel_weights) * texel_weights_int;
117
			}
118

119
			store(lsr<4>(summed_value), weights_plane1 + i);
120
		}
121
	}
122
	else
123
	{
124
		// Build a 32-entry weight lookup table per plane
125
		// Plane 1
126
		vtable_32x8 tab_plane1;
127
		vtable_prepare(tab_plane1, scb.weights);
128

129
		// Plane 2
130
		vtable_32x8 tab_plane2;
131
		vtable_prepare(tab_plane2, scb.weights + 32);
132

133
		for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
134
		{
135
			vint sum_plane1(8);
136
			vint sum_plane2(8);
137

138
			vint weight_count(di.texel_weight_count + i);
139
			int max_weight_count = hmax_s(weight_count);
140

141
			promise(max_weight_count > 0);
142
			for (int j = 0; j < max_weight_count; j++)
143
			{
144
				vint texel_weights(di.texel_weights_tr[j] + i);
145
				vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
146

147
				sum_plane1 += vtable_lookup_32bit(tab_plane1, texel_weights) * texel_weights_int;
148
				sum_plane2 += vtable_lookup_32bit(tab_plane2, texel_weights) * texel_weights_int;
149
			}
150

151
			store(lsr<4>(sum_plane1), weights_plane1 + i);
152
			store(lsr<4>(sum_plane2), weights_plane2 + i);
153
		}
154
	}
155
}
156

157
/**
158
 * @brief Return an FP32 NaN value for use in error colors.
159
 *
160
 * This NaN encoding will turn into 0xFFFF when converted to an FP16 NaN.
161
 *
162
 * @return The float color value.
163
 */
164
static float error_color_nan()
165
{
166
	if32 v;
167
	v.u = 0xFFFFE000U;
168
	return v.f;
169
}
170

171
/* See header for documentation. */
172
void decompress_symbolic_block(
173
	astcenc_profile decode_mode,
174
	const block_size_descriptor& bsd,
175
	int xpos,
176
	int ypos,
177
	int zpos,
178
	const symbolic_compressed_block& scb,
179
	image_block& blk
180
) {
181
	blk.xpos = xpos;
182
	blk.ypos = ypos;
183
	blk.zpos = zpos;
184

185
	blk.data_min = vfloat4::zero();
186
	blk.data_mean = vfloat4::zero();
187
	blk.data_max = vfloat4::zero();
188
	blk.grayscale = false;
189

190
	// If we detected an error-block, blow up immediately.
191
	if (scb.block_type == SYM_BTYPE_ERROR)
192
	{
193
		for (unsigned int i = 0; i < bsd.texel_count; i++)
194
		{
195
			blk.data_r[i] = error_color_nan();
196
			blk.data_g[i] = error_color_nan();
197
			blk.data_b[i] = error_color_nan();
198
			blk.data_a[i] = error_color_nan();
199
			blk.rgb_lns[i] = 0;
200
			blk.alpha_lns[i] = 0;
201
		}
202

203
		return;
204
	}
205

206
	if ((scb.block_type == SYM_BTYPE_CONST_F16) ||
207
	    (scb.block_type == SYM_BTYPE_CONST_U16))
208
	{
209
		vfloat4 color;
210
		uint8_t use_lns = 0;
211

212
		// UNORM16 constant color block
213
		if (scb.block_type == SYM_BTYPE_CONST_U16)
214
		{
215
			vint4 colori(scb.constant_color);
216

217
			// Determine the UNORM8 rounding on the decode
218
			vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
219

220
			// The real decoder would just use the top 8 bits, but we rescale
221
			// in to a 16-bit value that rounds correctly.
222
			vint4 colori_u8 = asr<8>(colori) * 257;
223
			colori = select(colori, colori_u8, u8_mask);
224

225
			vint4 colorf16 = unorm16_to_sf16(colori);
226
			color = float16_to_float(colorf16);
227
		}
228
		// FLOAT16 constant color block
229
		else
230
		{
231
			switch (decode_mode)
232
			{
233
			case ASTCENC_PRF_LDR_SRGB:
234
			case ASTCENC_PRF_LDR:
235
				color = vfloat4(error_color_nan());
236
				break;
237
			case ASTCENC_PRF_HDR_RGB_LDR_A:
238
			case ASTCENC_PRF_HDR:
239
				// Constant-color block; unpack from FP16 to FP32.
240
				color = float16_to_float(vint4(scb.constant_color));
241
				use_lns = 1;
242
				break;
243
			}
244
		}
245

246
		for (unsigned int i = 0; i < bsd.texel_count; i++)
247
		{
248
			blk.data_r[i] = color.lane<0>();
249
			blk.data_g[i] = color.lane<1>();
250
			blk.data_b[i] = color.lane<2>();
251
			blk.data_a[i] = color.lane<3>();
252
			blk.rgb_lns[i] = use_lns;
253
			blk.alpha_lns[i] = use_lns;
254
		}
255

256
		return;
257
	}
258

259
	// Get the appropriate partition-table entry
260
	int partition_count = scb.partition_count;
261
	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
262

263
	// Get the appropriate block descriptors
264
	const auto& bm = bsd.get_block_mode(scb.block_mode);
265
	const auto& di = bsd.get_decimation_info(bm.decimation_mode);
266

267
	bool is_dual_plane = static_cast<bool>(bm.is_dual_plane);
268

269
	// Unquantize and undecimate the weights
270
	int plane1_weights[BLOCK_MAX_TEXELS];
271
	int plane2_weights[BLOCK_MAX_TEXELS];
272
	unpack_weights(bsd, scb, di, is_dual_plane, plane1_weights, plane2_weights);
273

274
	// Now that we have endpoint colors and weights, we can unpack texel colors
275
	int plane2_component = scb.plane2_component;
276
	vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);
277

278
	vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
279

280
	for (int i = 0; i < partition_count; i++)
281
	{
282
		// Decode the color endpoints for this partition
283
		vint4 ep0;
284
		vint4 ep1;
285
		bool rgb_lns;
286
		bool a_lns;
287

288
		unpack_color_endpoints(decode_mode,
289
		                       scb.color_formats[i],
290
		                       scb.color_values[i],
291
		                       rgb_lns, a_lns,
292
		                       ep0, ep1);
293

294
		vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns);
295

296
		int texel_count = pi.partition_texel_count[i];
297
		for (int j = 0; j < texel_count; j++)
298
		{
299
			int tix = pi.texels_of_partition[i][j];
300
			vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
301
			vint4 color = lerp_color_int(u8_mask, ep0, ep1, weight);
302
			vfloat4 colorf = decode_texel(color, lns_mask);
303

304
			blk.data_r[tix] = colorf.lane<0>();
305
			blk.data_g[tix] = colorf.lane<1>();
306
			blk.data_b[tix] = colorf.lane<2>();
307
			blk.data_a[tix] = colorf.lane<3>();
308
		}
309
	}
310
}
311

312
#if !defined(ASTCENC_DECOMPRESS_ONLY)
313

314
/* See header for documentation. */
315
float compute_symbolic_block_difference_2plane(
316
	const astcenc_config& config,
317
	const block_size_descriptor& bsd,
318
	const symbolic_compressed_block& scb,
319
	const image_block& blk
320
) {
321
	// If we detected an error-block, blow up immediately.
322
	if (scb.block_type == SYM_BTYPE_ERROR)
323
	{
324
		return ERROR_CALC_DEFAULT;
325
	}
326

327
	assert(scb.block_mode >= 0);
328
	assert(scb.partition_count == 1);
329
	assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 1);
330

331
	// Get the appropriate block descriptor
332
	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
333
	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
334

335
	// Unquantize and undecimate the weights
336
	int plane1_weights[BLOCK_MAX_TEXELS];
337
	int plane2_weights[BLOCK_MAX_TEXELS];
338
	unpack_weights(bsd, scb, di, true, plane1_weights, plane2_weights);
339

340
	vmask4 plane2_mask = vint4::lane_id() == vint4(scb.plane2_component);
341

342
	vfloat4 summa = vfloat4::zero();
343

344
	// Decode the color endpoints for this partition
345
	vint4 ep0;
346
	vint4 ep1;
347
	bool rgb_lns;
348
	bool a_lns;
349

350
	unpack_color_endpoints(config.profile,
351
	                       scb.color_formats[0],
352
	                       scb.color_values[0],
353
	                       rgb_lns, a_lns,
354
	                       ep0, ep1);
355

356
	vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
357

358
	// Unpack and compute error for each texel in the partition
359
	unsigned int texel_count = bsd.texel_count;
360
	for (unsigned int i = 0; i < texel_count; i++)
361
	{
362
		vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
363
		vint4 colori = lerp_color_int(u8_mask, ep0, ep1, weight);
364

365
		vfloat4 color = int_to_float(colori);
366
		vfloat4 oldColor = blk.texel(i);
367

368
		// Compare error using a perceptual decode metric for RGBM textures
369
		if (config.flags & ASTCENC_FLG_MAP_RGBM)
370
		{
371
			// Fail encodings that result in zero weight M pixels. Note that this can cause
372
			// "interesting" artifacts if we reject all useful encodings - we typically get max
373
			// brightness encodings instead which look just as bad. We recommend users apply a
374
			// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
375
			// getting small M values post-quantization, but we can't prove it would never
376
			// happen, especially at low bit rates ...
377
			if (color.lane<3>() == 0.0f)
378
			{
379
				return -ERROR_CALC_DEFAULT;
380
			}
381

382
			// Compute error based on decoded RGBM color
383
			color = vfloat4(
384
				color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
385
				color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
386
				color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
387
				1.0f
388
			);
389

390
			oldColor = vfloat4(
391
				oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
392
				oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
393
				oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
394
				1.0f
395
			);
396
		}
397

398
		vfloat4 error = oldColor - color;
399
		error = min(abs(error), 1e15f);
400
		error = error * error;
401

402
		summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
403
	}
404

405
	return summa.lane<0>();
406
}
407

408
/* See header for documentation. */
409
float compute_symbolic_block_difference_1plane(
410
	const astcenc_config& config,
411
	const block_size_descriptor& bsd,
412
	const symbolic_compressed_block& scb,
413
	const image_block& blk
414
) {
415
	assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 0);
416

417
	// If we detected an error-block, blow up immediately.
418
	if (scb.block_type == SYM_BTYPE_ERROR)
419
	{
420
		return ERROR_CALC_DEFAULT;
421
	}
422

423
	assert(scb.block_mode >= 0);
424

425
	// Get the appropriate partition-table entry
426
	unsigned int partition_count = scb.partition_count;
427
	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
428

429
	// Get the appropriate block descriptor
430
	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
431
	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
432

433
	// Unquantize and undecimate the weights
434
	int plane1_weights[BLOCK_MAX_TEXELS];
435
	unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
436

437
	vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
438

439
	vfloat4 summa = vfloat4::zero();
440
	for (unsigned int i = 0; i < partition_count; i++)
441
	{
442
		// Decode the color endpoints for this partition
443
		vint4 ep0;
444
		vint4 ep1;
445
		bool rgb_lns;
446
		bool a_lns;
447

448
		unpack_color_endpoints(config.profile,
449
		                       scb.color_formats[i],
450
		                       scb.color_values[i],
451
		                       rgb_lns, a_lns,
452
		                       ep0, ep1);
453

454
		// Unpack and compute error for each texel in the partition
455
		unsigned int texel_count = pi.partition_texel_count[i];
456
		for (unsigned int j = 0; j < texel_count; j++)
457
		{
458
			unsigned int tix = pi.texels_of_partition[i][j];
459
			vint4 colori = lerp_color_int(u8_mask, ep0, ep1,
460
			                              vint4(plane1_weights[tix]));
461

462
			vfloat4 color = int_to_float(colori);
463
			vfloat4 oldColor = blk.texel(tix);
464

465
			// Compare error using a perceptual decode metric for RGBM textures
466
			if (config.flags & ASTCENC_FLG_MAP_RGBM)
467
			{
468
				// Fail encodings that result in zero weight M pixels. Note that this can cause
469
				// "interesting" artifacts if we reject all useful encodings - we typically get max
470
				// brightness encodings instead which look just as bad. We recommend users apply a
471
				// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
472
				// getting small M values post-quantization, but we can't prove it would never
473
				// happen, especially at low bit rates ...
474
				if (color.lane<3>() == 0.0f)
475
				{
476
					return -ERROR_CALC_DEFAULT;
477
				}
478

479
				// Compute error based on decoded RGBM color
480
				color = vfloat4(
481
					color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
482
					color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
483
					color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
484
					1.0f
485
				);
486

487
				oldColor = vfloat4(
488
					oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
489
					oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
490
					oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
491
					1.0f
492
				);
493
			}
494

495
			vfloat4 error = oldColor - color;
496
			error = min(abs(error), 1e15f);
497
			error = error * error;
498

499
			summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
500
		}
501
	}
502

503
	return summa.lane<0>();
504
}
505

506
/* See header for documentation. */
507
float compute_symbolic_block_difference_1plane_1partition(
508
	const astcenc_config& config,
509
	const block_size_descriptor& bsd,
510
	const symbolic_compressed_block& scb,
511
	const image_block& blk
512
) {
513
	// If we detected an error-block, blow up immediately.
514
	if (scb.block_type == SYM_BTYPE_ERROR)
515
	{
516
		return ERROR_CALC_DEFAULT;
517
	}
518

519
	assert(scb.block_mode >= 0);
520
	assert(bsd.get_partition_info(scb.partition_count, scb.partition_index).partition_count == 1);
521

522
	// Get the appropriate block descriptor
523
	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
524
	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
525

526
	// Unquantize and undecimate the weights
527
	ASTCENC_ALIGNAS int plane1_weights[BLOCK_MAX_TEXELS];
528
	unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
529

530
	// Decode the color endpoints for this partition
531
	vint4 ep0;
532
	vint4 ep1;
533
	bool rgb_lns;
534
	bool a_lns;
535

536
	unpack_color_endpoints(config.profile,
537
	                       scb.color_formats[0],
538
	                       scb.color_values[0],
539
	                       rgb_lns, a_lns,
540
	                       ep0, ep1);
541

542
	vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
543

544
	// Unpack and compute error for each texel in the partition
545
	vfloatacc summav = vfloatacc::zero();
546

547
	vint lane_id = vint::lane_id();
548

549
	unsigned int texel_count = bsd.texel_count;
550
	for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
551
	{
552
		// Compute EP1 contribution
553
		vint weight1 = vint::loada(plane1_weights + i);
554
		vint ep1_r = vint(ep1.lane<0>()) * weight1;
555
		vint ep1_g = vint(ep1.lane<1>()) * weight1;
556
		vint ep1_b = vint(ep1.lane<2>()) * weight1;
557
		vint ep1_a = vint(ep1.lane<3>()) * weight1;
558

559
		// Compute EP0 contribution
560
		vint weight0 = vint(64) - weight1;
561
		vint ep0_r = vint(ep0.lane<0>()) * weight0;
562
		vint ep0_g = vint(ep0.lane<1>()) * weight0;
563
		vint ep0_b = vint(ep0.lane<2>()) * weight0;
564
		vint ep0_a = vint(ep0.lane<3>()) * weight0;
565

566
		// Combine contributions
567
		vint colori_r = asr<6>(ep0_r + ep1_r + vint(32));
568
		vint colori_g = asr<6>(ep0_g + ep1_g + vint(32));
569
		vint colori_b = asr<6>(ep0_b + ep1_b + vint(32));
570
		vint colori_a = asr<6>(ep0_a + ep1_a + vint(32));
571

572
		// If using a U8 decode mode bit replicate top 8 bits
573
		// so rest of codec can assume 0xFFFF max range everywhere
574
		vint colori_r8 = asr<8>(colori_r) * vint(257);
575
		colori_r = select(colori_r, colori_r8, vmask(u8_mask.lane<0>()));
576

577
		vint colori_g8 = asr<8>(colori_g) * vint(257);
578
		colori_g = select(colori_g, colori_g8, vmask(u8_mask.lane<1>()));
579

580
		vint colori_b8 = asr<8>(colori_b) * vint(257);
581
		colori_b = select(colori_b, colori_b8, vmask(u8_mask.lane<2>()));
582

583
		vint colori_a8 = asr<8>(colori_a) * vint(257);
584
		colori_a = select(colori_a, colori_a8, vmask(u8_mask.lane<3>()));
585

586
		// Compute color diff
587
		vfloat color_r = int_to_float(colori_r);
588
		vfloat color_g = int_to_float(colori_g);
589
		vfloat color_b = int_to_float(colori_b);
590
		vfloat color_a = int_to_float(colori_a);
591

592
		vfloat color_orig_r = loada(blk.data_r + i);
593
		vfloat color_orig_g = loada(blk.data_g + i);
594
		vfloat color_orig_b = loada(blk.data_b + i);
595
		vfloat color_orig_a = loada(blk.data_a + i);
596

597
		vfloat color_error_r = min(abs(color_orig_r - color_r), vfloat(1e15f));
598
		vfloat color_error_g = min(abs(color_orig_g - color_g), vfloat(1e15f));
599
		vfloat color_error_b = min(abs(color_orig_b - color_b), vfloat(1e15f));
600
		vfloat color_error_a = min(abs(color_orig_a - color_a), vfloat(1e15f));
601

602
		// Compute squared error metric
603
		color_error_r = color_error_r * color_error_r;
604
		color_error_g = color_error_g * color_error_g;
605
		color_error_b = color_error_b * color_error_b;
606
		color_error_a = color_error_a * color_error_a;
607

608
		vfloat metric = color_error_r * blk.channel_weight.lane<0>()
609
		              + color_error_g * blk.channel_weight.lane<1>()
610
		              + color_error_b * blk.channel_weight.lane<2>()
611
		              + color_error_a * blk.channel_weight.lane<3>();
612

613
		// Mask off bad lanes
614
		vmask mask = lane_id < vint(texel_count);
615
		lane_id += vint(ASTCENC_SIMD_WIDTH);
616
		haccumulate(summav, metric, mask);
617
	}
618

619
	return hadd_s(summav);
620
}
621

622
#endif
623

624
Product

Resources

Company