GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/etcpak/ProcessRGB.cpp
1
#include <array>
2
#include <string.h>
3
#include <limits>
4
#ifdef __ARM_NEON
5
# include <arm_neon.h>
6
#endif
7
8
#include "Dither.hpp"
9
#include "ForceInline.hpp"
10
#include "Math.hpp"
11
#include "ProcessCommon.hpp"
12
#include "ProcessRGB.hpp"
13
#include "Tables.hpp"
14
#include "Vector.hpp"
15
#if defined __SSE4_1__ || defined __AVX2__ || defined _MSC_VER
16
# ifdef _MSC_VER
17
# include <intrin.h>
18
# include <Windows.h>
19
# define _bswap(x) _byteswap_ulong(x)
20
# define _bswap64(x) _byteswap_uint64(x)
21
# else
22
# include <x86intrin.h>
23
# endif
24
#endif
25
26
#ifndef _bswap
27
# define _bswap(x) __builtin_bswap32(x)
28
# define _bswap64(x) __builtin_bswap64(x)
29
#endif
30
31
static const uint32_t MaxError = 1065369600; // ((38+76+14) * 255)^2
32
// common T-/H-mode table
33
static uint8_t tableTH[8] = { 3, 6, 11, 16, 23, 32, 41, 64 };
34
35
// thresholds for the early compression-mode decision scheme
36
// default: 0.03, 0.09, and 0.38
37
float ecmd_threshold[3] = { 0.03f, 0.09f, 0.38f };
38
39
static const uint8_t ModeUndecided = 0;
40
static const uint8_t ModePlanar = 0x1;
41
static const uint8_t ModeTH = 0x2;
42
43
const unsigned int R = 2;
44
const unsigned int G = 1;
45
const unsigned int B = 0;
46
47
struct Luma
48
{
49
#ifdef __AVX2__
50
float max, min;
51
uint8_t minIdx = 255, maxIdx = 255;
52
__m128i luma8;
53
#elif defined __ARM_NEON && defined __aarch64__
54
float max, min;
55
uint8_t minIdx = 255, maxIdx = 255;
56
uint8x16_t luma8;
57
#else
58
uint8_t max = 0, min = 255, maxIdx = 0, minIdx = 0;
59
uint8_t val[16];
60
#endif
61
};
62
63
#ifdef __AVX2__
64
struct Plane
65
{
66
uint64_t plane;
67
uint64_t error;
68
__m256i sum4;
69
};
70
#endif
71
72
#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__)
73
struct Channels
74
{
75
#ifdef __AVX2__
76
__m128i r8, g8, b8;
77
#elif defined __ARM_NEON && defined __aarch64__
78
uint8x16x2_t r, g, b;
79
#endif
80
};
81
#endif
82
83
namespace
84
{
85
static etcpak_force_inline uint8_t clamp( uint8_t min, int16_t val, uint8_t max )
86
{
87
return val < min ? min : ( val > max ? max : val );
88
}
89
90
static etcpak_force_inline uint8_t clampMin( uint8_t min, int16_t val )
91
{
92
return val < min ? min : val;
93
}
94
95
static etcpak_force_inline uint8_t clampMax( int16_t val, uint8_t max )
96
{
97
return val > max ? max : val;
98
}
99
100
// slightly faster than std::sort
101
static void insertionSort( uint8_t* arr1, uint8_t* arr2 )
102
{
103
for( uint8_t i = 1; i < 16; ++i )
104
{
105
uint8_t value = arr1[i];
106
uint8_t hole = i;
107
108
for( ; hole > 0 && value < arr1[hole - 1]; --hole )
109
{
110
arr1[hole] = arr1[hole - 1];
111
arr2[hole] = arr2[hole - 1];
112
}
113
arr1[hole] = value;
114
arr2[hole] = i;
115
}
116
}
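// Illustrative usage sketch, not part of the encoder: the second array is kept
// paired with the first while sorting, so if it starts out as the identity
// permutation it ends up holding the original position of each sorted value.
#if 0
static void sortLumaExample( const uint8_t luma[16], uint8_t sorted[16], uint8_t order[16] )
{
    for( uint8_t i = 0; i < 16; ++i )
    {
        sorted[i] = luma[i];
        order[i] = i;                        // identity permutation, stays paired with sorted[]
    }
    insertionSort( sorted, order );
    // sorted[0]/order[0]   -> smallest luma value and the pixel it came from
    // sorted[15]/order[15] -> largest luma value and the pixel it came from
}
#endif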
117
118
// Converts indices from |a0|a1|e0|e1|i0|i1|m0|m1|b0|b1|f0|f1|j0|j1|n0|n1|c0|c1|g0|g1|k0|k1|o0|o1|d0|d1|h0|h1|l0|l1|p0|p1|, previously used by the T- and H-modes,
119
// into |p0|o0|n0|m0|l0|k0|j0|i0|h0|g0|f0|e0|d0|c0|b0|a0|p1|o1|n1|m1|l1|k1|j1|i1|h1|g1|f1|e1|d1|c1|b1|a1| which should be used for all modes.
120
// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
121
static etcpak_force_inline int indexConversion( int pixelIndices )
122
{
123
int correctIndices = 0;
124
int LSB[4][4];
125
int MSB[4][4];
126
int shift = 0;
127
for( int y = 3; y >= 0; y-- )
128
{
129
for( int x = 3; x >= 0; x-- )
130
{
131
LSB[x][y] = ( pixelIndices >> shift ) & 1;
132
shift++;
133
MSB[x][y] = ( pixelIndices >> shift ) & 1;
134
shift++;
135
}
136
}
137
shift = 0;
138
for( int x = 0; x < 4; x++ )
139
{
140
for( int y = 0; y < 4; y++ )
141
{
142
correctIndices |= ( LSB[x][y] << shift );
143
correctIndices |= ( MSB[x][y] << ( 16 + shift ) );
144
shift++;
145
}
146
}
147
return correctIndices;
148
}
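// Equivalent compact sketch of the conversion above (illustrative only, not used
// by the encoder): pair p of the packed (LSB,MSB) input corresponds to pixel
// ( x, y ) = ( 3 - p % 4, 3 - p / 4 ), and output bit s corresponds to
// ( x, y ) = ( s / 4, s % 4 ), with all LSBs in bits 0-15 and all MSBs in bits 16-31.
#if 0
static int indexConversionCompact( int pixelIndices )
{
    int out = 0;
    for( int s = 0; s < 16; s++ )
    {
        const int p = 4 * ( 3 - s % 4 ) + ( 3 - s / 4 );                 // source (LSB,MSB) pair
        out |= ( ( pixelIndices >> ( 2 * p ) ) & 1 ) << s;               // LSB plane
        out |= ( ( pixelIndices >> ( 2 * p + 1 ) ) & 1 ) << ( 16 + s );  // MSB plane
    }
    return out;
}
#endif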
149
150
// Swapping two RGB-colors
151
// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
152
static etcpak_force_inline void swapColors( uint8_t( colors )[2][3] )
153
{
154
uint8_t temp = colors[0][R];
155
colors[0][R] = colors[1][R];
156
colors[1][R] = temp;
157
158
temp = colors[0][G];
159
colors[0][G] = colors[1][G];
160
colors[1][G] = temp;
161
162
temp = colors[0][B];
163
colors[0][B] = colors[1][B];
164
colors[1][B] = temp;
165
}
166
167
168
// calculates quantized colors for T or H modes
169
void compressColor( uint8_t( currColor )[2][3], uint8_t( quantColor )[2][3], bool t_mode )
170
{
171
if( t_mode )
172
{
173
quantColor[0][R] = clampMax( 15 * ( currColor[0][R] + 8 ) / 255, 15 );
174
quantColor[0][G] = clampMax( 15 * ( currColor[0][G] + 8 ) / 255, 15 );
175
quantColor[0][B] = clampMax( 15 * ( currColor[0][B] + 8 ) / 255, 15 );
176
}
177
else // clamped to [1,14] to get a wider range
178
{
179
quantColor[0][R] = clamp( 1, 15 * ( currColor[0][R] + 8 ) / 255, 14 );
180
quantColor[0][G] = clamp( 1, 15 * ( currColor[0][G] + 8 ) / 255, 14 );
181
quantColor[0][B] = clamp( 1, 15 * ( currColor[0][B] + 8 ) / 255, 14 );
182
}
183
184
// clamped to [1,14] to get a wider range
185
quantColor[1][R] = clamp( 1, 15 * ( currColor[1][R] + 8 ) / 255, 14 );
186
quantColor[1][G] = clamp( 1, 15 * ( currColor[1][G] + 8 ) / 255, 14 );
187
quantColor[1][B] = clamp( 1, 15 * ( currColor[1][B] + 8 ) / 255, 14 );
188
}
189
190
// The following three decoding functions come from ETCPACK v2.74 and are slightly changed.
191
static etcpak_force_inline void decompressColor( uint8_t( colorsRGB444 )[2][3], uint8_t( colors )[2][3] )
192
{
193
// The color should be retrieved as:
194
//
195
// c = round(255/(2^r_bits-1))*comp_color
196
//
197
// This is similar to bit replication
198
//
199
// Note -- this code only works for bit replication from 4 bits and up --- 3 bits needs
200
// two copy operations.
201
colors[0][R] = ( colorsRGB444[0][R] << 4 ) | colorsRGB444[0][R];
202
colors[0][G] = ( colorsRGB444[0][G] << 4 ) | colorsRGB444[0][G];
203
colors[0][B] = ( colorsRGB444[0][B] << 4 ) | colorsRGB444[0][B];
204
colors[1][R] = ( colorsRGB444[1][R] << 4 ) | colorsRGB444[1][R];
205
colors[1][G] = ( colorsRGB444[1][G] << 4 ) | colorsRGB444[1][G];
206
colors[1][B] = ( colorsRGB444[1][B] << 4 ) | colorsRGB444[1][B];
207
}
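// Round-trip sketch for the two helpers above (illustrative only, not used by the
// encoder): compressColor() quantizes each 8-bit channel to 4 bits with
// 15 * ( c + 8 ) / 255, which is approximately round( c * 15 / 255 ), and
// decompressColor() expands a 4-bit value q back to 8 bits by nibble replication,
// ( q << 4 ) | q == q * 17.
#if 0
static void rgb444RoundTripExample()
{
    uint8_t base[2][3] = { { 200, 100, 50 }, { 30, 60, 90 } };
    uint8_t quant[2][3];
    uint8_t expanded[2][3];

    compressColor( base, quant, true );  // T-mode: first color in [0,15], second clamped to [1,14]
    decompressColor( quant, expanded );  // e.g. 200 -> 12 -> ( 12 << 4 ) | 12 == 204
}
#endif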
208
209
// calculates the paint colors from the block colors
210
// using a distance d and one of the H- or T-patterns.
211
static void calculatePaintColors59T( uint8_t d, uint8_t( colors )[2][3], uint8_t( pColors )[4][3] )
212
{
213
//////////////////////////////////////////////
214
//
215
// C3 C1 C4----C1---C2
216
// | | |
217
// | | |
218
// |-------| |
219
// | | |
220
// | | |
221
// C4 C2 C3
222
//
223
//////////////////////////////////////////////
224
225
// C4
226
pColors[3][R] = clampMin( 0, colors[1][R] - tableTH[d] );
227
pColors[3][G] = clampMin( 0, colors[1][G] - tableTH[d] );
228
pColors[3][B] = clampMin( 0, colors[1][B] - tableTH[d] );
229
230
// C3
231
pColors[0][R] = colors[0][R];
232
pColors[0][G] = colors[0][G];
233
pColors[0][B] = colors[0][B];
234
// C2
235
pColors[1][R] = clampMax( colors[1][R] + tableTH[d], 255 );
236
pColors[1][G] = clampMax( colors[1][G] + tableTH[d], 255 );
237
pColors[1][B] = clampMax( colors[1][B] + tableTH[d], 255 );
238
// C1
239
pColors[2][R] = colors[1][R];
240
pColors[2][G] = colors[1][G];
241
pColors[2][B] = colors[1][B];
242
}
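// Worked example for the T-mode helper above (illustrative only, not used by the
// encoder): with distance index d == 2 ( tableTH[2] == 11 ), each channel of the
// second base color is spread into a +/-11 pair while the first base color is
// kept as-is.
#if 0
static void paintColors59TExample()
{
    uint8_t base[2][3] = { { 50, 100, 200 }, { 90, 60, 30 } };
    uint8_t paint[4][3];
    calculatePaintColors59T( 2, base, paint );
    // paint[0] == base[0]                      (C3)
    // paint[1] == base[1] + 11, saturated      (C2), e.g. 60 -> 71
    // paint[2] == base[1]                      (C1)
    // paint[3] == base[1] - 11, clamped at 0   (C4), e.g. 60 -> 49
}
#endif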
243
244
static void calculatePaintColors58H( uint8_t d, uint8_t( colors )[2][3], uint8_t( pColors )[4][3] )
245
{
246
pColors[3][R] = clampMin( 0, colors[1][R] - tableTH[d] );
247
pColors[3][G] = clampMin( 0, colors[1][G] - tableTH[d] );
248
pColors[3][B] = clampMin( 0, colors[1][B] - tableTH[d] );
249
250
// C1
251
pColors[0][R] = clampMax( colors[0][R] + tableTH[d], 255 );
252
pColors[0][G] = clampMax( colors[0][G] + tableTH[d], 255 );
253
pColors[0][B] = clampMax( colors[0][B] + tableTH[d], 255 );
254
// C2
255
pColors[1][R] = clampMin( 0, colors[0][R] - tableTH[d] );
256
pColors[1][G] = clampMin( 0, colors[0][G] - tableTH[d] );
257
pColors[1][B] = clampMin( 0, colors[0][B] - tableTH[d] );
258
// C3
259
pColors[2][R] = clampMax( colors[1][R] + tableTH[d], 255 );
260
pColors[2][G] = clampMax( colors[1][G] + tableTH[d], 255 );
261
pColors[2][B] = clampMax( colors[1][B] + tableTH[d], 255 );
262
}
263
264
#if defined _MSC_VER && !defined __clang__
265
static etcpak_force_inline unsigned long _bit_scan_forward( unsigned long mask )
266
{
267
unsigned long ret;
268
_BitScanForward( &ret, mask );
269
return ret;
270
}
271
#endif
272
273
typedef std::array<uint16_t, 4> v4i;
274
275
#ifdef __AVX2__
276
static etcpak_force_inline __m256i Sum4_AVX2( const uint8_t* data) noexcept
277
{
278
__m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0);
279
__m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1);
280
__m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2);
281
__m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3);
282
283
__m128i dm0 = _mm_and_si128(d0, _mm_set1_epi32(0x00FFFFFF));
284
__m128i dm1 = _mm_and_si128(d1, _mm_set1_epi32(0x00FFFFFF));
285
__m128i dm2 = _mm_and_si128(d2, _mm_set1_epi32(0x00FFFFFF));
286
__m128i dm3 = _mm_and_si128(d3, _mm_set1_epi32(0x00FFFFFF));
287
288
__m256i t0 = _mm256_cvtepu8_epi16(dm0);
289
__m256i t1 = _mm256_cvtepu8_epi16(dm1);
290
__m256i t2 = _mm256_cvtepu8_epi16(dm2);
291
__m256i t3 = _mm256_cvtepu8_epi16(dm3);
292
293
__m256i sum0 = _mm256_add_epi16(t0, t1);
294
__m256i sum1 = _mm256_add_epi16(t2, t3);
295
296
__m256i s0 = _mm256_permute2x128_si256(sum0, sum1, (0) | (3 << 4)); // 0, 0, 3, 3
297
__m256i s1 = _mm256_permute2x128_si256(sum0, sum1, (1) | (2 << 4)); // 1, 1, 2, 2
298
299
__m256i s2 = _mm256_permute4x64_epi64(s0, _MM_SHUFFLE(1, 3, 0, 2));
300
__m256i s3 = _mm256_permute4x64_epi64(s0, _MM_SHUFFLE(0, 2, 1, 3));
301
__m256i s4 = _mm256_permute4x64_epi64(s1, _MM_SHUFFLE(3, 1, 0, 2));
302
__m256i s5 = _mm256_permute4x64_epi64(s1, _MM_SHUFFLE(2, 0, 1, 3));
303
304
__m256i sum5 = _mm256_add_epi16(s2, s3); // 3, 0, 3, 0
305
__m256i sum6 = _mm256_add_epi16(s4, s5); // 2, 1, 1, 2
306
return _mm256_add_epi16(sum5, sum6); // 3+2, 0+1, 3+1, 3+2
307
}
308
309
static etcpak_force_inline __m256i Average_AVX2( const __m256i data) noexcept
310
{
311
__m256i a = _mm256_add_epi16(data, _mm256_set1_epi16(4));
312
313
return _mm256_srli_epi16(a, 3);
314
}
315
316
static etcpak_force_inline __m128i CalcErrorBlock_AVX2( const __m256i data, const v4i a[8]) noexcept
317
{
318
//
319
__m256i a0 = _mm256_load_si256((__m256i*)a[0].data());
320
__m256i a1 = _mm256_load_si256((__m256i*)a[4].data());
321
322
// err = 8 * ( sq( average[0] ) + sq( average[1] ) + sq( average[2] ) );
323
__m256i a4 = _mm256_madd_epi16(a0, a0);
324
__m256i a5 = _mm256_madd_epi16(a1, a1);
325
326
__m256i a6 = _mm256_hadd_epi32(a4, a5);
327
__m256i a7 = _mm256_slli_epi32(a6, 3);
328
329
__m256i a8 = _mm256_add_epi32(a7, _mm256_set1_epi32(0x3FFFFFFF)); // Big value to prevent negative values, but small enough to prevent overflow
330
331
// average is not swapped
332
// err -= block[0] * 2 * average[0];
333
// err -= block[1] * 2 * average[1];
334
// err -= block[2] * 2 * average[2];
335
__m256i a2 = _mm256_slli_epi16(a0, 1);
336
__m256i a3 = _mm256_slli_epi16(a1, 1);
337
__m256i b0 = _mm256_madd_epi16(a2, data);
338
__m256i b1 = _mm256_madd_epi16(a3, data);
339
340
__m256i b2 = _mm256_hadd_epi32(b0, b1);
341
__m256i b3 = _mm256_sub_epi32(a8, b2);
342
__m256i b4 = _mm256_hadd_epi32(b3, b3);
343
344
__m256i b5 = _mm256_permutevar8x32_epi32(b4, _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0));
345
346
return _mm256_castsi256_si128(b5);
347
}
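// The error terms built above come from expanding the sum of squared differences
// against a candidate average a over the n == 8 pixels of a half-block:
//
//     sum_i ( p_i - a )^2  ==  sum_i p_i^2  -  2 * a * sum_i p_i  +  n * a^2
//
// The first term is independent of the candidate, so it can be dropped when the
// errors are only compared against each other; what is accumulated per channel is
// n * a^2 - 2 * a * sum_i p_i, plus the 0x3FFFFFFF bias that keeps the packed
// intermediate values non-negative. The scalar CalcError() further down performs
// the same computation one channel at a time.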
348
349
static etcpak_force_inline void ProcessAverages_AVX2(const __m256i d, v4i a[8] ) noexcept
350
{
351
__m256i t = _mm256_add_epi16(_mm256_mullo_epi16(d, _mm256_set1_epi16(31)), _mm256_set1_epi16(128));
352
353
__m256i c = _mm256_srli_epi16(_mm256_add_epi16(t, _mm256_srli_epi16(t, 8)), 8);
354
355
__m256i c1 = _mm256_shuffle_epi32(c, _MM_SHUFFLE(3, 2, 3, 2));
356
__m256i diff = _mm256_sub_epi16(c, c1);
357
diff = _mm256_max_epi16(diff, _mm256_set1_epi16(-4));
358
diff = _mm256_min_epi16(diff, _mm256_set1_epi16(3));
359
360
__m256i co = _mm256_add_epi16(c1, diff);
361
362
c = _mm256_blend_epi16(co, c, 0xF0);
363
364
__m256i a0 = _mm256_or_si256(_mm256_slli_epi16(c, 3), _mm256_srli_epi16(c, 2));
365
366
_mm256_store_si256((__m256i*)a[4].data(), a0);
367
368
__m256i t0 = _mm256_add_epi16(_mm256_mullo_epi16(d, _mm256_set1_epi16(15)), _mm256_set1_epi16(128));
369
__m256i t1 = _mm256_srli_epi16(_mm256_add_epi16(t0, _mm256_srli_epi16(t0, 8)), 8);
370
371
__m256i t2 = _mm256_or_si256(t1, _mm256_slli_epi16(t1, 4));
372
373
_mm256_store_si256((__m256i*)a[0].data(), t2);
374
}
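// Scalar sketch of the quantization done above (illustrative only, not used by
// the encoder): with t = v * mul + 128, the expression ( t + ( t >> 8 ) ) >> 8 is
// the usual "divide by 255 with rounding" trick, i.e. round( v * mul / 255 ); the
// quantized value is then expanded back to 8 bits by bit replication. The [-4,3]
// clamp on the difference of the two averages matches the 3-bit signed delta of
// the ETC1 differential mode.
#if 0
static uint8_t quantize555ScalarExample( uint8_t v )
{
    const uint16_t t = v * 31 + 128;
    const uint8_t q = uint8_t( ( t + ( t >> 8 ) ) >> 8 );   // round( v * 31 / 255 ), 5 bits
    return uint8_t( ( q << 3 ) | ( q >> 2 ) );              // expand 5 -> 8 bits
}
#endif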
375
376
static etcpak_force_inline uint64_t EncodeAverages_AVX2( const v4i a[8], size_t idx ) noexcept
377
{
378
uint64_t d = ( idx << 24 );
379
size_t base = idx << 1;
380
381
__m128i a0 = _mm_load_si128((const __m128i*)a[base].data());
382
383
__m128i r0, r1;
384
385
if( ( idx & 0x2 ) == 0 )
386
{
387
r0 = _mm_srli_epi16(a0, 4);
388
389
__m128i a1 = _mm_unpackhi_epi64(r0, r0);
390
r1 = _mm_slli_epi16(a1, 4);
391
}
392
else
393
{
394
__m128i a1 = _mm_and_si128(a0, _mm_set1_epi16(-8));
395
396
r0 = _mm_unpackhi_epi64(a1, a1);
397
__m128i a2 = _mm_sub_epi16(a1, r0);
398
__m128i a3 = _mm_srai_epi16(a2, 3);
399
r1 = _mm_and_si128(a3, _mm_set1_epi16(0x07));
400
}
401
402
__m128i r2 = _mm_or_si128(r0, r1);
403
// do missing swap for average values
404
__m128i r3 = _mm_shufflelo_epi16(r2, _MM_SHUFFLE(3, 0, 1, 2));
405
__m128i r4 = _mm_packus_epi16(r3, _mm_setzero_si128());
406
d |= _mm_cvtsi128_si32(r4);
407
408
return d;
409
}
410
411
static etcpak_force_inline uint64_t CheckSolid_AVX2( const uint8_t* src ) noexcept
412
{
413
__m256i d0 = _mm256_loadu_si256(((__m256i*)src) + 0);
414
__m256i d1 = _mm256_loadu_si256(((__m256i*)src) + 1);
415
416
__m256i c = _mm256_broadcastd_epi32(_mm256_castsi256_si128(d0));
417
418
__m256i c0 = _mm256_cmpeq_epi8(d0, c);
419
__m256i c1 = _mm256_cmpeq_epi8(d1, c);
420
421
__m256i m = _mm256_and_si256(c0, c1);
422
423
if (!_mm256_testc_si256(m, _mm256_set1_epi32(-1)))
424
{
425
return 0;
426
}
427
428
return 0x02000000 |
429
( (unsigned int)( src[0] & 0xF8 ) << 16 ) |
430
( (unsigned int)( src[1] & 0xF8 ) << 8 ) |
431
( (unsigned int)( src[2] & 0xF8 ) );
432
}
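// Reading of the early-out above: when all 16 pixels are identical the block is
// emitted directly. In the packing used by this file, 0x02000000 appears to set
// the differential-mode bit (with the flip bit left at 0), the 0xF8 masks keep
// the top five bits of each channel of the first pixel as the shared base color,
// and the deltas, table codewords and selector bits are all left at zero.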
433
434
static etcpak_force_inline __m128i PrepareAverages_AVX2( v4i a[8], const uint8_t* src) noexcept
435
{
436
__m256i sum4 = Sum4_AVX2( src );
437
438
ProcessAverages_AVX2(Average_AVX2( sum4 ), a );
439
440
return CalcErrorBlock_AVX2( sum4, a);
441
}
442
443
static etcpak_force_inline __m128i PrepareAverages_AVX2( v4i a[8], const __m256i sum4) noexcept
444
{
445
ProcessAverages_AVX2(Average_AVX2( sum4 ), a );
446
447
return CalcErrorBlock_AVX2( sum4, a);
448
}
449
450
static etcpak_force_inline void FindBestFit_4x2_AVX2( uint32_t terr[2][8], uint32_t tsel[8], v4i a[8], const uint32_t offset, const uint8_t* data) noexcept
451
{
452
__m256i sel0 = _mm256_setzero_si256();
453
__m256i sel1 = _mm256_setzero_si256();
454
455
for (unsigned int j = 0; j < 2; ++j)
456
{
457
unsigned int bid = offset + 1 - j;
458
459
__m256i squareErrorSum = _mm256_setzero_si256();
460
461
__m128i a0 = _mm_loadl_epi64((const __m128i*)a[bid].data());
462
__m256i a1 = _mm256_broadcastq_epi64(a0);
463
464
// Processing one full row each iteration
465
for (size_t i = 0; i < 8; i += 4)
466
{
467
__m128i rgb = _mm_loadu_si128((const __m128i*)(data + i * 4));
468
469
__m256i rgb16 = _mm256_cvtepu8_epi16(rgb);
470
__m256i d = _mm256_sub_epi16(a1, rgb16);
471
472
// The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16
473
// This produces slightly different results, but is significantly faster
474
__m256i pixel0 = _mm256_madd_epi16(d, _mm256_set_epi16(0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14));
475
__m256i pixel1 = _mm256_packs_epi32(pixel0, pixel0);
476
__m256i pixel2 = _mm256_hadd_epi16(pixel1, pixel1);
477
__m128i pixel3 = _mm256_castsi256_si128(pixel2);
478
479
__m128i pix0 = _mm_broadcastw_epi16(pixel3);
480
__m128i pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16));
481
__m256i pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
482
483
// Processing first two pixels of the row
484
{
485
__m256i pix = _mm256_abs_epi16(pixel);
486
487
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
488
// Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
489
__m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0])));
490
__m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1])));
491
492
__m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1));
493
__m256i minError = _mm256_min_epi16(error0, error1);
494
495
// Exploiting the symmetry of the selector table and using the sign bit
496
// This produces slightly different results, but is significantly faster
497
__m256i minIndex1 = _mm256_srli_epi16(pixel, 15);
498
499
// Interleaving values so madd instruction can be used
500
__m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0));
501
__m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2));
502
503
__m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi);
504
// Squaring the minimum error to produce correct values when adding
505
__m256i squareError = _mm256_madd_epi16(minError2, minError2);
506
507
squareErrorSum = _mm256_add_epi32(squareErrorSum, squareError);
508
509
// Packing selector bits
510
__m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i + j * 8));
511
__m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i + j * 8));
512
513
sel0 = _mm256_or_si256(sel0, minIndexLo2);
514
sel1 = _mm256_or_si256(sel1, minIndexHi2);
515
}
516
517
pixel3 = _mm256_extracti128_si256(pixel2, 1);
518
pix0 = _mm_broadcastw_epi16(pixel3);
519
pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16));
520
pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
521
522
// Processing second two pixels of the row
523
{
524
__m256i pix = _mm256_abs_epi16(pixel);
525
526
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
527
// Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
528
__m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0])));
529
__m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1])));
530
531
__m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1));
532
__m256i minError = _mm256_min_epi16(error0, error1);
533
534
// Exploiting the symmetry of the selector table and using the sign bit
535
__m256i minIndex1 = _mm256_srli_epi16(pixel, 15);
536
537
// Interleaving values so madd instruction can be used
538
__m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0));
539
__m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2));
540
541
__m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi);
542
// Squaring the minimum error to produce correct values when adding
543
__m256i squareError = _mm256_madd_epi16(minError2, minError2);
544
545
squareErrorSum = _mm256_add_epi32(squareErrorSum, squareError);
546
547
// Packing selector bits
548
__m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i + j * 8));
549
__m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i + j * 8));
550
__m256i minIndexLo3 = _mm256_slli_epi16(minIndexLo2, 2);
551
__m256i minIndexHi3 = _mm256_slli_epi16(minIndexHi2, 2);
552
553
sel0 = _mm256_or_si256(sel0, minIndexLo3);
554
sel1 = _mm256_or_si256(sel1, minIndexHi3);
555
}
556
}
557
558
data += 8 * 4;
559
560
_mm256_store_si256((__m256i*)terr[1 - j], squareErrorSum);
561
}
562
563
// Interleave selector bits
564
__m256i minIndexLo0 = _mm256_unpacklo_epi16(sel0, sel1);
565
__m256i minIndexHi0 = _mm256_unpackhi_epi16(sel0, sel1);
566
567
__m256i minIndexLo1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (0) | (2 << 4));
568
__m256i minIndexHi1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (1) | (3 << 4));
569
570
__m256i minIndexHi2 = _mm256_slli_epi32(minIndexHi1, 1);
571
572
__m256i sel = _mm256_or_si256(minIndexLo1, minIndexHi2);
573
574
_mm256_store_si256((__m256i*)tsel, sel);
575
}
576
577
static etcpak_force_inline void FindBestFit_2x4_AVX2( uint32_t terr[2][8], uint32_t tsel[8], v4i a[8], const uint32_t offset, const uint8_t* data) noexcept
578
{
579
__m256i sel0 = _mm256_setzero_si256();
580
__m256i sel1 = _mm256_setzero_si256();
581
582
__m256i squareErrorSum0 = _mm256_setzero_si256();
583
__m256i squareErrorSum1 = _mm256_setzero_si256();
584
585
__m128i a0 = _mm_loadl_epi64((const __m128i*)a[offset + 1].data());
586
__m128i a1 = _mm_loadl_epi64((const __m128i*)a[offset + 0].data());
587
588
__m128i a2 = _mm_broadcastq_epi64(a0);
589
__m128i a3 = _mm_broadcastq_epi64(a1);
590
__m256i a4 = _mm256_insertf128_si256(_mm256_castsi128_si256(a2), a3, 1);
591
592
// Processing one full row each iteration
593
for (size_t i = 0; i < 16; i += 4)
594
{
595
__m128i rgb = _mm_loadu_si128((const __m128i*)(data + i * 4));
596
597
__m256i rgb16 = _mm256_cvtepu8_epi16(rgb);
598
__m256i d = _mm256_sub_epi16(a4, rgb16);
599
600
// The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16
601
// This produces slightly different results, but is significantly faster
602
__m256i pixel0 = _mm256_madd_epi16(d, _mm256_set_epi16(0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14));
603
__m256i pixel1 = _mm256_packs_epi32(pixel0, pixel0);
604
__m256i pixel2 = _mm256_hadd_epi16(pixel1, pixel1);
605
__m128i pixel3 = _mm256_castsi256_si128(pixel2);
606
607
__m128i pix0 = _mm_broadcastw_epi16(pixel3);
608
__m128i pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16));
609
__m256i pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
610
611
// Processing first two pixels of the row
612
{
613
__m256i pix = _mm256_abs_epi16(pixel);
614
615
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
616
// Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
617
__m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0])));
618
__m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1])));
619
620
__m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1));
621
__m256i minError = _mm256_min_epi16(error0, error1);
622
623
// Exploiting the symmetry of the selector table and using the sign bit
624
__m256i minIndex1 = _mm256_srli_epi16(pixel, 15);
625
626
// Interleaving values so madd instruction can be used
627
__m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0));
628
__m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2));
629
630
__m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi);
631
// Squaring the minimum error to produce correct values when adding
632
__m256i squareError = _mm256_madd_epi16(minError2, minError2);
633
634
squareErrorSum0 = _mm256_add_epi32(squareErrorSum0, squareError);
635
636
// Packing selector bits
637
__m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i));
638
__m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i));
639
640
sel0 = _mm256_or_si256(sel0, minIndexLo2);
641
sel1 = _mm256_or_si256(sel1, minIndexHi2);
642
}
643
644
pixel3 = _mm256_extracti128_si256(pixel2, 1);
645
pix0 = _mm_broadcastw_epi16(pixel3);
646
pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16));
647
pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
648
649
// Processing second two pixels of the row
650
{
651
__m256i pix = _mm256_abs_epi16(pixel);
652
653
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
654
// Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
655
__m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0])));
656
__m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1])));
657
658
__m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1));
659
__m256i minError = _mm256_min_epi16(error0, error1);
660
661
// Exploiting the symmetry of the selector table and using the sign bit
662
__m256i minIndex1 = _mm256_srli_epi16(pixel, 15);
663
664
// Interleaving values so madd instruction can be used
665
__m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0));
666
__m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2));
667
668
__m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi);
669
// Squaring the minimum error to produce correct values when adding
670
__m256i squareError = _mm256_madd_epi16(minError2, minError2);
671
672
squareErrorSum1 = _mm256_add_epi32(squareErrorSum1, squareError);
673
674
// Packing selector bits
675
__m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i));
676
__m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i));
677
__m256i minIndexLo3 = _mm256_slli_epi16(minIndexLo2, 2);
678
__m256i minIndexHi3 = _mm256_slli_epi16(minIndexHi2, 2);
679
680
sel0 = _mm256_or_si256(sel0, minIndexLo3);
681
sel1 = _mm256_or_si256(sel1, minIndexHi3);
682
}
683
}
684
685
_mm256_store_si256((__m256i*)terr[1], squareErrorSum0);
686
_mm256_store_si256((__m256i*)terr[0], squareErrorSum1);
687
688
// Interleave selector bits
689
__m256i minIndexLo0 = _mm256_unpacklo_epi16(sel0, sel1);
690
__m256i minIndexHi0 = _mm256_unpackhi_epi16(sel0, sel1);
691
692
__m256i minIndexLo1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (0) | (2 << 4));
693
__m256i minIndexHi1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (1) | (3 << 4));
694
695
__m256i minIndexHi2 = _mm256_slli_epi32(minIndexHi1, 1);
696
697
__m256i sel = _mm256_or_si256(minIndexLo1, minIndexHi2);
698
699
_mm256_store_si256((__m256i*)tsel, sel);
700
}
701
702
static etcpak_force_inline uint64_t EncodeSelectors_AVX2( uint64_t d, const uint32_t terr[2][8], const uint32_t tsel[8], const bool rotate) noexcept
703
{
704
size_t tidx[2];
705
706
// Get index of minimum error (terr[0] and terr[1])
707
__m256i err0 = _mm256_load_si256((const __m256i*)terr[0]);
708
__m256i err1 = _mm256_load_si256((const __m256i*)terr[1]);
709
710
__m256i errLo = _mm256_permute2x128_si256(err0, err1, (0) | (2 << 4));
711
__m256i errHi = _mm256_permute2x128_si256(err0, err1, (1) | (3 << 4));
712
713
__m256i errMin0 = _mm256_min_epu32(errLo, errHi);
714
715
__m256i errMin1 = _mm256_shuffle_epi32(errMin0, _MM_SHUFFLE(2, 3, 0, 1));
716
__m256i errMin2 = _mm256_min_epu32(errMin0, errMin1);
717
718
__m256i errMin3 = _mm256_shuffle_epi32(errMin2, _MM_SHUFFLE(1, 0, 3, 2));
719
__m256i errMin4 = _mm256_min_epu32(errMin3, errMin2);
720
721
__m256i errMin5 = _mm256_permute2x128_si256(errMin4, errMin4, (0) | (0 << 4));
722
__m256i errMin6 = _mm256_permute2x128_si256(errMin4, errMin4, (1) | (1 << 4));
723
724
__m256i errMask0 = _mm256_cmpeq_epi32(errMin5, err0);
725
__m256i errMask1 = _mm256_cmpeq_epi32(errMin6, err1);
726
727
uint32_t mask0 = _mm256_movemask_epi8(errMask0);
728
uint32_t mask1 = _mm256_movemask_epi8(errMask1);
729
730
tidx[0] = _bit_scan_forward(mask0) >> 2;
731
tidx[1] = _bit_scan_forward(mask1) >> 2;
732
733
d |= tidx[0] << 26;
734
d |= tidx[1] << 29;
735
736
unsigned int t0 = tsel[tidx[0]];
737
unsigned int t1 = tsel[tidx[1]];
738
739
if (!rotate)
740
{
741
t0 &= 0xFF00FF00;
742
t1 &= 0x00FF00FF;
743
}
744
else
745
{
746
t0 &= 0xCCCCCCCC;
747
t1 &= 0x33333333;
748
}
749
750
// Flip selectors from sign bit
751
unsigned int t2 = (t0 | t1) ^ 0xFFFF0000;
752
753
return d | static_cast<uint64_t>(_bswap(t2)) << 32;
754
}
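// Scalar equivalent of the min / compare / movemask / bit-scan sequence above
// (illustrative only, not used by the encoder): it finds the index of the smallest
// of the eight accumulated table errors for one half of the block, preferring the
// lowest index on ties, exactly like the forward bit scan.
#if 0
static size_t argMinErrorExample( const uint32_t err[8] )
{
    size_t best = 0;
    for( size_t i = 1; i < 8; i++ )
    {
        if( err[i] < err[best] ) best = i;
    }
    return best;
}
#endif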
755
756
static etcpak_force_inline __m128i r6g7b6_AVX2(__m128 cof, __m128 chf, __m128 cvf) noexcept
757
{
758
__m128i co = _mm_cvttps_epi32(cof);
759
__m128i ch = _mm_cvttps_epi32(chf);
760
__m128i cv = _mm_cvttps_epi32(cvf);
761
762
__m128i coh = _mm_packus_epi32(co, ch);
763
__m128i cv0 = _mm_packus_epi32(cv, _mm_setzero_si128());
764
765
__m256i cohv0 = _mm256_inserti128_si256(_mm256_castsi128_si256(coh), cv0, 1);
766
__m256i cohv1 = _mm256_min_epu16(cohv0, _mm256_set1_epi16(1023));
767
768
__m256i cohv2 = _mm256_sub_epi16(cohv1, _mm256_set1_epi16(15));
769
__m256i cohv3 = _mm256_srai_epi16(cohv2, 1);
770
771
__m256i cohvrb0 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(11));
772
__m256i cohvrb1 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(4));
773
__m256i cohvg0 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(9));
774
__m256i cohvg1 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(6));
775
776
__m256i cohvrb2 = _mm256_srai_epi16(cohvrb0, 7);
777
__m256i cohvrb3 = _mm256_srai_epi16(cohvrb1, 7);
778
__m256i cohvg2 = _mm256_srai_epi16(cohvg0, 8);
779
__m256i cohvg3 = _mm256_srai_epi16(cohvg1, 8);
780
781
__m256i cohvrb4 = _mm256_sub_epi16(cohvrb0, cohvrb2);
782
__m256i cohvrb5 = _mm256_sub_epi16(cohvrb4, cohvrb3);
783
__m256i cohvg4 = _mm256_sub_epi16(cohvg0, cohvg2);
784
__m256i cohvg5 = _mm256_sub_epi16(cohvg4, cohvg3);
785
786
__m256i cohvrb6 = _mm256_srai_epi16(cohvrb5, 3);
787
__m256i cohvg6 = _mm256_srai_epi16(cohvg5, 2);
788
789
__m256i cohv4 = _mm256_blend_epi16(cohvg6, cohvrb6, 0x55);
790
791
__m128i cohv5 = _mm_packus_epi16(_mm256_castsi256_si128(cohv4), _mm256_extracti128_si256(cohv4, 1));
792
return _mm_shuffle_epi8(cohv5, _mm_setr_epi8(6, 5, 4, -1, 2, 1, 0, -1, 10, 9, 8, -1, -1, -1, -1, -1));
793
}
794
795
static etcpak_force_inline Plane Planar_AVX2( const Channels& ch, uint8_t& mode, bool useHeuristics )
796
{
797
__m128i t0 = _mm_sad_epu8( ch.r8, _mm_setzero_si128() );
798
__m128i t1 = _mm_sad_epu8( ch.g8, _mm_setzero_si128() );
799
__m128i t2 = _mm_sad_epu8( ch.b8, _mm_setzero_si128() );
800
801
__m128i r8s = _mm_shuffle_epi8( ch.r8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) );
802
__m128i g8s = _mm_shuffle_epi8( ch.g8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) );
803
__m128i b8s = _mm_shuffle_epi8( ch.b8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) );
804
805
__m128i s0 = _mm_sad_epu8( r8s, _mm_setzero_si128() );
806
__m128i s1 = _mm_sad_epu8( g8s, _mm_setzero_si128() );
807
__m128i s2 = _mm_sad_epu8( b8s, _mm_setzero_si128() );
808
809
__m256i sr0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t0 ), s0, 1 );
810
__m256i sg0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t1 ), s1, 1 );
811
__m256i sb0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t2 ), s2, 1 );
812
813
__m256i sr1 = _mm256_slli_epi64( sr0, 32 );
814
__m256i sg1 = _mm256_slli_epi64( sg0, 16 );
815
816
__m256i srb = _mm256_or_si256( sr1, sb0 );
817
__m256i srgb = _mm256_or_si256( srb, sg1 );
818
819
if( mode != ModePlanar && useHeuristics )
820
{
821
Plane plane;
822
plane.sum4 = _mm256_permute4x64_epi64( srgb, _MM_SHUFFLE( 2, 3, 0, 1 ) );
823
return plane;
824
}
825
826
__m128i t3 = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( t0 ), _mm_castsi128_ps( t1 ), _MM_SHUFFLE( 2, 0, 2, 0 ) ) );
827
__m128i t4 = _mm_shuffle_epi32( t2, _MM_SHUFFLE( 3, 1, 2, 0 ) );
828
__m128i t5 = _mm_hadd_epi32( t3, t4 );
829
__m128i t6 = _mm_shuffle_epi32( t5, _MM_SHUFFLE( 1, 1, 1, 1 ) );
830
__m128i t7 = _mm_shuffle_epi32( t5, _MM_SHUFFLE( 2, 2, 2, 2 ) );
831
832
__m256i sr = _mm256_broadcastw_epi16( t5 );
833
__m256i sg = _mm256_broadcastw_epi16( t6 );
834
__m256i sb = _mm256_broadcastw_epi16( t7 );
835
836
__m256i r08 = _mm256_cvtepu8_epi16( ch.r8 );
837
__m256i g08 = _mm256_cvtepu8_epi16( ch.g8 );
838
__m256i b08 = _mm256_cvtepu8_epi16( ch.b8 );
839
840
__m256i r16 = _mm256_slli_epi16( r08, 4 );
841
__m256i g16 = _mm256_slli_epi16( g08, 4 );
842
__m256i b16 = _mm256_slli_epi16( b08, 4 );
843
844
__m256i difR0 = _mm256_sub_epi16( r16, sr );
845
__m256i difG0 = _mm256_sub_epi16( g16, sg );
846
__m256i difB0 = _mm256_sub_epi16( b16, sb );
847
848
__m256i difRyz = _mm256_madd_epi16( difR0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) );
849
__m256i difGyz = _mm256_madd_epi16( difG0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) );
850
__m256i difByz = _mm256_madd_epi16( difB0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) );
851
852
__m256i difRxz = _mm256_madd_epi16( difR0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) );
853
__m256i difGxz = _mm256_madd_epi16( difG0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) );
854
__m256i difBxz = _mm256_madd_epi16( difB0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) );
855
856
__m256i difRGyz = _mm256_hadd_epi32( difRyz, difGyz );
857
__m256i difByzxz = _mm256_hadd_epi32( difByz, difBxz );
858
859
__m256i difRGxz = _mm256_hadd_epi32( difRxz, difGxz );
860
861
__m128i sumRGyz = _mm_add_epi32( _mm256_castsi256_si128( difRGyz ), _mm256_extracti128_si256( difRGyz, 1 ) );
862
__m128i sumByzxz = _mm_add_epi32( _mm256_castsi256_si128( difByzxz ), _mm256_extracti128_si256( difByzxz, 1 ) );
863
__m128i sumRGxz = _mm_add_epi32( _mm256_castsi256_si128( difRGxz ), _mm256_extracti128_si256( difRGxz, 1 ) );
864
865
__m128i sumRGByz = _mm_hadd_epi32( sumRGyz, sumByzxz );
866
__m128i sumRGByzxz = _mm_hadd_epi32( sumRGxz, sumByzxz );
867
868
__m128i sumRGBxz = _mm_shuffle_epi32( sumRGByzxz, _MM_SHUFFLE( 2, 3, 1, 0 ) );
869
870
__m128 sumRGByzf = _mm_cvtepi32_ps( sumRGByz );
871
__m128 sumRGBxzf = _mm_cvtepi32_ps( sumRGBxz );
872
873
const float value = ( 255 * 255 * 8.0f + 85 * 85 * 8.0f ) * 16.0f;
874
875
__m128 scale = _mm_set1_ps( -4.0f / value );
876
877
__m128 af = _mm_mul_ps( sumRGBxzf, scale );
878
__m128 bf = _mm_mul_ps( sumRGByzf, scale );
879
880
__m128 df = _mm_mul_ps( _mm_cvtepi32_ps( t5 ), _mm_set1_ps( 4.0f / 16.0f ) );
881
882
// calculating the three colors RGBO, RGBH, and RGBV. RGB = df - af * x - bf * y;
883
__m128 cof0 = _mm_fnmadd_ps( af, _mm_set1_ps( -255.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( -255.0f ), df ) );
884
__m128 chf0 = _mm_fnmadd_ps( af, _mm_set1_ps( 425.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( -255.0f ), df ) );
885
__m128 cvf0 = _mm_fnmadd_ps( af, _mm_set1_ps( -255.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( 425.0f ), df ) );
886
887
// convert to r6g7b6
888
__m128i cohv = r6g7b6_AVX2( cof0, chf0, cvf0 );
889
890
uint64_t rgbho = _mm_extract_epi64( cohv, 0 );
891
uint32_t rgbv0 = _mm_extract_epi32( cohv, 2 );
892
893
// Error calculation
894
uint64_t error = 0;
895
if( !useHeuristics )
896
{
897
auto ro0 = ( rgbho >> 48 ) & 0x3F;
898
auto go0 = ( rgbho >> 40 ) & 0x7F;
899
auto bo0 = ( rgbho >> 32 ) & 0x3F;
900
auto ro1 = ( ro0 >> 4 ) | ( ro0 << 2 );
901
auto go1 = ( go0 >> 6 ) | ( go0 << 1 );
902
auto bo1 = ( bo0 >> 4 ) | ( bo0 << 2 );
903
auto ro2 = ( ro1 << 2 ) + 2;
904
auto go2 = ( go1 << 2 ) + 2;
905
auto bo2 = ( bo1 << 2 ) + 2;
906
907
__m256i ro3 = _mm256_set1_epi16( ro2 );
908
__m256i go3 = _mm256_set1_epi16( go2 );
909
__m256i bo3 = _mm256_set1_epi16( bo2 );
910
911
auto rh0 = ( rgbho >> 16 ) & 0x3F;
912
auto gh0 = ( rgbho >> 8 ) & 0x7F;
913
auto bh0 = ( rgbho >> 0 ) & 0x3F;
914
auto rh1 = ( rh0 >> 4 ) | ( rh0 << 2 );
915
auto gh1 = ( gh0 >> 6 ) | ( gh0 << 1 );
916
auto bh1 = ( bh0 >> 4 ) | ( bh0 << 2 );
917
918
auto rh2 = rh1 - ro1;
919
auto gh2 = gh1 - go1;
920
auto bh2 = bh1 - bo1;
921
922
__m256i rh3 = _mm256_set1_epi16( rh2 );
923
__m256i gh3 = _mm256_set1_epi16( gh2 );
924
__m256i bh3 = _mm256_set1_epi16( bh2 );
925
926
auto rv0 = ( rgbv0 >> 16 ) & 0x3F;
927
auto gv0 = ( rgbv0 >> 8 ) & 0x7F;
928
auto bv0 = ( rgbv0 >> 0 ) & 0x3F;
929
auto rv1 = ( rv0 >> 4 ) | ( rv0 << 2 );
930
auto gv1 = ( gv0 >> 6 ) | ( gv0 << 1 );
931
auto bv1 = ( bv0 >> 4 ) | ( bv0 << 2 );
932
933
auto rv2 = rv1 - ro1;
934
auto gv2 = gv1 - go1;
935
auto bv2 = bv1 - bo1;
936
937
__m256i rv3 = _mm256_set1_epi16( rv2 );
938
__m256i gv3 = _mm256_set1_epi16( gv2 );
939
__m256i bv3 = _mm256_set1_epi16( bv2 );
940
941
__m256i x = _mm256_set_epi16( 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 );
942
943
__m256i rh4 = _mm256_mullo_epi16( rh3, x );
944
__m256i gh4 = _mm256_mullo_epi16( gh3, x );
945
__m256i bh4 = _mm256_mullo_epi16( bh3, x );
946
947
__m256i y = _mm256_set_epi16( 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0 );
948
949
__m256i rv4 = _mm256_mullo_epi16( rv3, y );
950
__m256i gv4 = _mm256_mullo_epi16( gv3, y );
951
__m256i bv4 = _mm256_mullo_epi16( bv3, y );
952
953
__m256i rxy = _mm256_add_epi16( rh4, rv4 );
954
__m256i gxy = _mm256_add_epi16( gh4, gv4 );
955
__m256i bxy = _mm256_add_epi16( bh4, bv4 );
956
957
__m256i rp0 = _mm256_add_epi16( rxy, ro3 );
958
__m256i gp0 = _mm256_add_epi16( gxy, go3 );
959
__m256i bp0 = _mm256_add_epi16( bxy, bo3 );
960
961
__m256i rp1 = _mm256_srai_epi16( rp0, 2 );
962
__m256i gp1 = _mm256_srai_epi16( gp0, 2 );
963
__m256i bp1 = _mm256_srai_epi16( bp0, 2 );
964
965
__m256i rp2 = _mm256_max_epi16( _mm256_min_epi16( rp1, _mm256_set1_epi16( 255 ) ), _mm256_setzero_si256() );
966
__m256i gp2 = _mm256_max_epi16( _mm256_min_epi16( gp1, _mm256_set1_epi16( 255 ) ), _mm256_setzero_si256() );
967
__m256i bp2 = _mm256_max_epi16( _mm256_min_epi16( bp1, _mm256_set1_epi16( 255 ) ), _mm256_setzero_si256() );
968
969
__m256i rdif = _mm256_sub_epi16( r08, rp2 );
970
__m256i gdif = _mm256_sub_epi16( g08, gp2 );
971
__m256i bdif = _mm256_sub_epi16( b08, bp2 );
972
973
__m256i rerr = _mm256_mullo_epi16( rdif, _mm256_set1_epi16( 38 ) );
974
__m256i gerr = _mm256_mullo_epi16( gdif, _mm256_set1_epi16( 76 ) );
975
__m256i berr = _mm256_mullo_epi16( bdif, _mm256_set1_epi16( 14 ) );
976
977
__m256i sum0 = _mm256_add_epi16( rerr, gerr );
978
__m256i sum1 = _mm256_add_epi16( sum0, berr );
979
980
__m256i sum2 = _mm256_madd_epi16( sum1, sum1 );
981
982
__m128i sum3 = _mm_add_epi32( _mm256_castsi256_si128( sum2 ), _mm256_extracti128_si256( sum2, 1 ) );
983
984
uint32_t err0 = _mm_extract_epi32( sum3, 0 );
985
uint32_t err1 = _mm_extract_epi32( sum3, 1 );
986
uint32_t err2 = _mm_extract_epi32( sum3, 2 );
987
uint32_t err3 = _mm_extract_epi32( sum3, 3 );
988
989
error = err0 + err1 + err2 + err3;
990
}
991
/**/
992
993
uint32_t rgbv = ( rgbv0 & 0x3F ) | ( ( rgbv0 >> 2 ) & 0x1FC0 ) | ( ( rgbv0 >> 3 ) & 0x7E000 );
994
uint64_t rgbho0_ = ( rgbho & 0x3F0000003F ) | ( ( rgbho >> 2 ) & 0x1FC000001FC0 ) | ( ( rgbho >> 3 ) & 0x7E0000007E000 );
995
uint64_t rgbho0 = ( rgbho0_ & 0x7FFFF ) | ( ( rgbho0_ >> 13 ) & 0x3FFFF80000 );
996
997
uint32_t hi = rgbv | ((rgbho0 & 0x1FFF) << 19);
998
rgbho0 >>= 13;
999
uint32_t lo = ( rgbho0 & 0x1 ) | ( ( rgbho0 & 0x1FE ) << 1 ) | ( ( rgbho0 & 0x600 ) << 2 ) | ( ( rgbho0 & 0x3F800 ) << 5 ) | ( ( rgbho0 & 0x1FC0000 ) << 6 );
1000
1001
uint32_t idx = ( ( rgbho >> 33 ) & 0xF ) | ( ( rgbho >> 41 ) & 0x10 ) | ( ( rgbho >> 48 ) & 0x20 );
1002
lo |= g_flags[idx];
1003
uint64_t result = static_cast<uint32_t>(_bswap(lo));
1004
result |= static_cast<uint64_t>(static_cast<uint32_t>(_bswap(hi))) << 32;
1005
1006
Plane plane;
1007
1008
plane.plane = result;
1009
if( useHeuristics )
1010
{
1011
plane.error = 0;
1012
mode = ModePlanar;
1013
}
1014
else
1015
{
1016
plane.error = error;
1017
}
1018
plane.sum4 = _mm256_permute4x64_epi64(srgb, _MM_SHUFFLE(2, 3, 0, 1));
1019
1020
return plane;
1021
}
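// The error loop above evaluates the planar predictor in the form
//
//     P( x, y ) = clamp8( ( x * ( H - O ) + y * ( V - O ) + 4 * O + 2 ) >> 2 )   per channel,
//
// where O, H and V are the 6/7/6-bit endpoint colors expanded back to 8 bits;
// ro2 = ( ro1 << 2 ) + 2, rh2 = rh1 - ro1, rv2 = rv1 - ro1 and the final shift by
// 2 implement exactly this, and the 38/76/14-weighted sum of the per-channel
// differences is then squared and accumulated as the block error.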
1022
1023
static etcpak_force_inline uint64_t EncodeSelectors_AVX2( uint64_t d, const uint32_t terr[2][8], const uint32_t tsel[8], const bool rotate, const uint64_t value, const uint32_t error) noexcept
1024
{
1025
size_t tidx[2];
1026
1027
// Get index of minimum error (terr[0] and terr[1])
1028
__m256i err0 = _mm256_load_si256((const __m256i*)terr[0]);
1029
__m256i err1 = _mm256_load_si256((const __m256i*)terr[1]);
1030
1031
__m256i errLo = _mm256_permute2x128_si256(err0, err1, (0) | (2 << 4));
1032
__m256i errHi = _mm256_permute2x128_si256(err0, err1, (1) | (3 << 4));
1033
1034
__m256i errMin0 = _mm256_min_epu32(errLo, errHi);
1035
1036
__m256i errMin1 = _mm256_shuffle_epi32(errMin0, _MM_SHUFFLE(2, 3, 0, 1));
1037
__m256i errMin2 = _mm256_min_epu32(errMin0, errMin1);
1038
1039
__m256i errMin3 = _mm256_shuffle_epi32(errMin2, _MM_SHUFFLE(1, 0, 3, 2));
1040
__m256i errMin4 = _mm256_min_epu32(errMin3, errMin2);
1041
1042
__m256i errMin5 = _mm256_permute2x128_si256(errMin4, errMin4, (0) | (0 << 4));
1043
__m256i errMin6 = _mm256_permute2x128_si256(errMin4, errMin4, (1) | (1 << 4));
1044
1045
__m256i errMask0 = _mm256_cmpeq_epi32(errMin5, err0);
1046
__m256i errMask1 = _mm256_cmpeq_epi32(errMin6, err1);
1047
1048
uint32_t mask0 = _mm256_movemask_epi8(errMask0);
1049
uint32_t mask1 = _mm256_movemask_epi8(errMask1);
1050
1051
tidx[0] = _bit_scan_forward(mask0) >> 2;
1052
tidx[1] = _bit_scan_forward(mask1) >> 2;
1053
1054
if ((terr[0][tidx[0]] + terr[1][tidx[1]]) >= error)
1055
{
1056
return value;
1057
}
1058
1059
d |= tidx[0] << 26;
1060
d |= tidx[1] << 29;
1061
1062
unsigned int t0 = tsel[tidx[0]];
1063
unsigned int t1 = tsel[tidx[1]];
1064
1065
if (!rotate)
1066
{
1067
t0 &= 0xFF00FF00;
1068
t1 &= 0x00FF00FF;
1069
}
1070
else
1071
{
1072
t0 &= 0xCCCCCCCC;
1073
t1 &= 0x33333333;
1074
}
1075
1076
// Flip selectors from sign bit
1077
unsigned int t2 = (t0 | t1) ^ 0xFFFF0000;
1078
1079
return d | static_cast<uint64_t>(_bswap(t2)) << 32;
1080
}
1081
1082
#endif
1083
1084
static etcpak_force_inline void Average( const uint8_t* data, v4i* a )
1085
{
1086
#ifdef __SSE4_1__
1087
__m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0);
1088
__m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1);
1089
__m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2);
1090
__m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3);
1091
1092
__m128i d0l = _mm_unpacklo_epi8(d0, _mm_setzero_si128());
1093
__m128i d0h = _mm_unpackhi_epi8(d0, _mm_setzero_si128());
1094
__m128i d1l = _mm_unpacklo_epi8(d1, _mm_setzero_si128());
1095
__m128i d1h = _mm_unpackhi_epi8(d1, _mm_setzero_si128());
1096
__m128i d2l = _mm_unpacklo_epi8(d2, _mm_setzero_si128());
1097
__m128i d2h = _mm_unpackhi_epi8(d2, _mm_setzero_si128());
1098
__m128i d3l = _mm_unpacklo_epi8(d3, _mm_setzero_si128());
1099
__m128i d3h = _mm_unpackhi_epi8(d3, _mm_setzero_si128());
1100
1101
__m128i sum0 = _mm_add_epi16(d0l, d1l);
1102
__m128i sum1 = _mm_add_epi16(d0h, d1h);
1103
__m128i sum2 = _mm_add_epi16(d2l, d3l);
1104
__m128i sum3 = _mm_add_epi16(d2h, d3h);
1105
1106
__m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128());
1107
__m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128());
1108
__m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128());
1109
__m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128());
1110
__m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128());
1111
__m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128());
1112
__m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128());
1113
__m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128());
1114
1115
__m128i b0 = _mm_add_epi32(sum0l, sum0h);
1116
__m128i b1 = _mm_add_epi32(sum1l, sum1h);
1117
__m128i b2 = _mm_add_epi32(sum2l, sum2h);
1118
__m128i b3 = _mm_add_epi32(sum3l, sum3h);
1119
1120
__m128i a0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b2, b3), _mm_set1_epi32(4)), 3);
1121
__m128i a1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b1), _mm_set1_epi32(4)), 3);
1122
__m128i a2 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b1, b3), _mm_set1_epi32(4)), 3);
1123
__m128i a3 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b2), _mm_set1_epi32(4)), 3);
1124
1125
_mm_storeu_si128((__m128i*)&a[0], _mm_packus_epi32(_mm_shuffle_epi32(a0, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a1, _MM_SHUFFLE(3, 0, 1, 2))));
1126
_mm_storeu_si128((__m128i*)&a[2], _mm_packus_epi32(_mm_shuffle_epi32(a2, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a3, _MM_SHUFFLE(3, 0, 1, 2))));
1127
#elif defined __ARM_NEON
1128
uint8x16x2_t t0 = vzipq_u8(vld1q_u8(data + 0), uint8x16_t());
1129
uint8x16x2_t t1 = vzipq_u8(vld1q_u8(data + 16), uint8x16_t());
1130
uint8x16x2_t t2 = vzipq_u8(vld1q_u8(data + 32), uint8x16_t());
1131
uint8x16x2_t t3 = vzipq_u8(vld1q_u8(data + 48), uint8x16_t());
1132
1133
uint16x8x2_t d0 = { vreinterpretq_u16_u8(t0.val[0]), vreinterpretq_u16_u8(t0.val[1]) };
1134
uint16x8x2_t d1 = { vreinterpretq_u16_u8(t1.val[0]), vreinterpretq_u16_u8(t1.val[1]) };
1135
uint16x8x2_t d2 = { vreinterpretq_u16_u8(t2.val[0]), vreinterpretq_u16_u8(t2.val[1]) };
1136
uint16x8x2_t d3 = { vreinterpretq_u16_u8(t3.val[0]), vreinterpretq_u16_u8(t3.val[1]) };
1137
1138
uint16x8x2_t s0 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[0] ), vreinterpretq_s16_u16( d1.val[0] ) ) ), uint16x8_t());
1139
uint16x8x2_t s1 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[1] ), vreinterpretq_s16_u16( d1.val[1] ) ) ), uint16x8_t());
1140
uint16x8x2_t s2 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[0] ), vreinterpretq_s16_u16( d3.val[0] ) ) ), uint16x8_t());
1141
uint16x8x2_t s3 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[1] ), vreinterpretq_s16_u16( d3.val[1] ) ) ), uint16x8_t());
1142
1143
uint32x4x2_t sum0 = { vreinterpretq_u32_u16(s0.val[0]), vreinterpretq_u32_u16(s0.val[1]) };
1144
uint32x4x2_t sum1 = { vreinterpretq_u32_u16(s1.val[0]), vreinterpretq_u32_u16(s1.val[1]) };
1145
uint32x4x2_t sum2 = { vreinterpretq_u32_u16(s2.val[0]), vreinterpretq_u32_u16(s2.val[1]) };
1146
uint32x4x2_t sum3 = { vreinterpretq_u32_u16(s3.val[0]), vreinterpretq_u32_u16(s3.val[1]) };
1147
1148
uint32x4_t b0 = vaddq_u32(sum0.val[0], sum0.val[1]);
1149
uint32x4_t b1 = vaddq_u32(sum1.val[0], sum1.val[1]);
1150
uint32x4_t b2 = vaddq_u32(sum2.val[0], sum2.val[1]);
1151
uint32x4_t b3 = vaddq_u32(sum3.val[0], sum3.val[1]);
1152
1153
uint32x4_t a0 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b2, b3), vdupq_n_u32(4)), 3);
1154
uint32x4_t a1 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b0, b1), vdupq_n_u32(4)), 3);
1155
uint32x4_t a2 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b1, b3), vdupq_n_u32(4)), 3);
1156
uint32x4_t a3 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b0, b2), vdupq_n_u32(4)), 3);
1157
1158
uint16x8_t o0 = vcombine_u16(vqmovun_s32(vreinterpretq_s32_u32( a0 )), vqmovun_s32(vreinterpretq_s32_u32( a1 )));
1159
uint16x8_t o1 = vcombine_u16(vqmovun_s32(vreinterpretq_s32_u32( a2 )), vqmovun_s32(vreinterpretq_s32_u32( a3 )));
1160
1161
a[0] = v4i{o0[2], o0[1], o0[0], 0};
1162
a[1] = v4i{o0[6], o0[5], o0[4], 0};
1163
a[2] = v4i{o1[2], o1[1], o1[0], 0};
1164
a[3] = v4i{o1[6], o1[5], o1[4], 0};
1165
#else
1166
uint32_t r[4];
1167
uint32_t g[4];
1168
uint32_t b[4];
1169
1170
memset(r, 0, sizeof(r));
1171
memset(g, 0, sizeof(g));
1172
memset(b, 0, sizeof(b));
1173
1174
for( int j=0; j<4; j++ )
1175
{
1176
for( int i=0; i<4; i++ )
1177
{
1178
int index = (j & 2) + (i >> 1);
1179
b[index] += *data++;
1180
g[index] += *data++;
1181
r[index] += *data++;
1182
data++;
1183
}
1184
}
1185
1186
a[0] = v4i{ uint16_t( (r[2] + r[3] + 4) / 8 ), uint16_t( (g[2] + g[3] + 4) / 8 ), uint16_t( (b[2] + b[3] + 4) / 8 ), 0};
1187
a[1] = v4i{ uint16_t( (r[0] + r[1] + 4) / 8 ), uint16_t( (g[0] + g[1] + 4) / 8 ), uint16_t( (b[0] + b[1] + 4) / 8 ), 0};
1188
a[2] = v4i{ uint16_t( (r[1] + r[3] + 4) / 8 ), uint16_t( (g[1] + g[3] + 4) / 8 ), uint16_t( (b[1] + b[3] + 4) / 8 ), 0};
1189
a[3] = v4i{ uint16_t( (r[0] + r[2] + 4) / 8 ), uint16_t( (g[0] + g[2] + 4) / 8 ), uint16_t( (b[0] + b[2] + 4) / 8 ), 0};
1190
#endif
1191
}
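// In the scalar path above, ( j & 2 ) + ( i >> 1 ) splits the 4x4 block into four
// 2x2 quadrants (0|1 on top, 2|3 on the bottom). a[0] and a[1] are then the
// averages of the bottom and top 8-pixel halves, a[2] and a[3] those of the right
// and left halves, each divided by 8 with +4 for rounding; these are the candidate
// base colors for the two possible ETC1 sub-block orientations.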
1192
1193
static etcpak_force_inline void CalcErrorBlock( const uint8_t* data, unsigned int err[4][4] )
1194
{
1195
#ifdef __SSE4_1__
1196
__m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0);
1197
__m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1);
1198
__m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2);
1199
__m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3);
1200
1201
__m128i dm0 = _mm_and_si128(d0, _mm_set1_epi32(0x00FFFFFF));
1202
__m128i dm1 = _mm_and_si128(d1, _mm_set1_epi32(0x00FFFFFF));
1203
__m128i dm2 = _mm_and_si128(d2, _mm_set1_epi32(0x00FFFFFF));
1204
__m128i dm3 = _mm_and_si128(d3, _mm_set1_epi32(0x00FFFFFF));
1205
1206
__m128i d0l = _mm_unpacklo_epi8(dm0, _mm_setzero_si128());
1207
__m128i d0h = _mm_unpackhi_epi8(dm0, _mm_setzero_si128());
1208
__m128i d1l = _mm_unpacklo_epi8(dm1, _mm_setzero_si128());
1209
__m128i d1h = _mm_unpackhi_epi8(dm1, _mm_setzero_si128());
1210
__m128i d2l = _mm_unpacklo_epi8(dm2, _mm_setzero_si128());
1211
__m128i d2h = _mm_unpackhi_epi8(dm2, _mm_setzero_si128());
1212
__m128i d3l = _mm_unpacklo_epi8(dm3, _mm_setzero_si128());
1213
__m128i d3h = _mm_unpackhi_epi8(dm3, _mm_setzero_si128());
1214
1215
__m128i sum0 = _mm_add_epi16(d0l, d1l);
1216
__m128i sum1 = _mm_add_epi16(d0h, d1h);
1217
__m128i sum2 = _mm_add_epi16(d2l, d3l);
1218
__m128i sum3 = _mm_add_epi16(d2h, d3h);
1219
1220
__m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128());
1221
__m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128());
1222
__m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128());
1223
__m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128());
1224
__m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128());
1225
__m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128());
1226
__m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128());
1227
__m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128());
1228
1229
__m128i b0 = _mm_add_epi32(sum0l, sum0h);
1230
__m128i b1 = _mm_add_epi32(sum1l, sum1h);
1231
__m128i b2 = _mm_add_epi32(sum2l, sum2h);
1232
__m128i b3 = _mm_add_epi32(sum3l, sum3h);
1233
1234
__m128i a0 = _mm_add_epi32(b2, b3);
1235
__m128i a1 = _mm_add_epi32(b0, b1);
1236
__m128i a2 = _mm_add_epi32(b1, b3);
1237
__m128i a3 = _mm_add_epi32(b0, b2);
1238
1239
_mm_storeu_si128((__m128i*)&err[0], a0);
1240
_mm_storeu_si128((__m128i*)&err[1], a1);
1241
_mm_storeu_si128((__m128i*)&err[2], a2);
1242
_mm_storeu_si128((__m128i*)&err[3], a3);
1243
#elif defined __ARM_NEON
1244
uint8x16x2_t t0 = vzipq_u8(vld1q_u8(data + 0), uint8x16_t());
1245
uint8x16x2_t t1 = vzipq_u8(vld1q_u8(data + 16), uint8x16_t());
1246
uint8x16x2_t t2 = vzipq_u8(vld1q_u8(data + 32), uint8x16_t());
1247
uint8x16x2_t t3 = vzipq_u8(vld1q_u8(data + 48), uint8x16_t());
1248
1249
uint16x8x2_t d0 = { vreinterpretq_u16_u8(t0.val[0]), vreinterpretq_u16_u8(t0.val[1]) };
1250
uint16x8x2_t d1 = { vreinterpretq_u16_u8(t1.val[0]), vreinterpretq_u16_u8(t1.val[1]) };
1251
uint16x8x2_t d2 = { vreinterpretq_u16_u8(t2.val[0]), vreinterpretq_u16_u8(t2.val[1]) };
1252
uint16x8x2_t d3 = { vreinterpretq_u16_u8(t3.val[0]), vreinterpretq_u16_u8(t3.val[1]) };
1253
1254
uint16x8x2_t s0 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[0] ), vreinterpretq_s16_u16( d1.val[0] ))), uint16x8_t());
1255
uint16x8x2_t s1 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[1] ), vreinterpretq_s16_u16( d1.val[1] ))), uint16x8_t());
1256
uint16x8x2_t s2 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[0] ), vreinterpretq_s16_u16( d3.val[0] ))), uint16x8_t());
1257
uint16x8x2_t s3 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[1] ), vreinterpretq_s16_u16( d3.val[1] ))), uint16x8_t());
1258
1259
uint32x4x2_t sum0 = { vreinterpretq_u32_u16(s0.val[0]), vreinterpretq_u32_u16(s0.val[1]) };
1260
uint32x4x2_t sum1 = { vreinterpretq_u32_u16(s1.val[0]), vreinterpretq_u32_u16(s1.val[1]) };
1261
uint32x4x2_t sum2 = { vreinterpretq_u32_u16(s2.val[0]), vreinterpretq_u32_u16(s2.val[1]) };
1262
uint32x4x2_t sum3 = { vreinterpretq_u32_u16(s3.val[0]), vreinterpretq_u32_u16(s3.val[1]) };
1263
1264
uint32x4_t b0 = vaddq_u32(sum0.val[0], sum0.val[1]);
1265
uint32x4_t b1 = vaddq_u32(sum1.val[0], sum1.val[1]);
1266
uint32x4_t b2 = vaddq_u32(sum2.val[0], sum2.val[1]);
1267
uint32x4_t b3 = vaddq_u32(sum3.val[0], sum3.val[1]);
1268
1269
uint32x4_t a0 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b2, b3) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) );
1270
uint32x4_t a1 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b0, b1) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) );
1271
uint32x4_t a2 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b1, b3) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) );
1272
uint32x4_t a3 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b0, b2) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) );
1273
1274
vst1q_u32(err[0], a0);
1275
vst1q_u32(err[1], a1);
1276
vst1q_u32(err[2], a2);
1277
vst1q_u32(err[3], a3);
1278
#else
1279
unsigned int terr[4][4];
1280
1281
memset(terr, 0, 16 * sizeof(unsigned int));
1282
1283
for( int j=0; j<4; j++ )
1284
{
1285
for( int i=0; i<4; i++ )
1286
{
1287
int index = (j & 2) + (i >> 1);
1288
unsigned int d = *data++;
1289
terr[index][0] += d;
1290
d = *data++;
1291
terr[index][1] += d;
1292
d = *data++;
1293
terr[index][2] += d;
1294
data++;
1295
}
1296
}
1297
1298
for( int i=0; i<3; i++ )
1299
{
1300
err[0][i] = terr[2][i] + terr[3][i];
1301
err[1][i] = terr[0][i] + terr[1][i];
1302
err[2][i] = terr[1][i] + terr[3][i];
1303
err[3][i] = terr[0][i] + terr[2][i];
1304
}
1305
for( int i=0; i<4; i++ )
1306
{
1307
err[i][3] = 0;
1308
}
1309
#endif
1310
}
1311
1312
static etcpak_force_inline unsigned int CalcError( const unsigned int block[4], const v4i& average )
1313
{
1314
unsigned int err = 0x3FFFFFFF; // Big value to prevent negative values, but small enough to prevent overflow
1315
err -= block[0] * 2 * average[2];
1316
err -= block[1] * 2 * average[1];
1317
err -= block[2] * 2 * average[0];
1318
err += 8 * ( sq( average[0] ) + sq( average[1] ) + sq( average[2] ) );
1319
return err;
1320
}
1321
1322
static etcpak_force_inline void ProcessAverages( v4i* a )
1323
{
1324
#ifdef __SSE4_1__
1325
for( int i=0; i<2; i++ )
1326
{
1327
__m128i d = _mm_loadu_si128((__m128i*)a[i*2].data());
1328
1329
__m128i t = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(31)), _mm_set1_epi16(128));
1330
1331
__m128i c = _mm_srli_epi16(_mm_add_epi16(t, _mm_srli_epi16(t, 8)), 8);
1332
1333
__m128i c1 = _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 2, 3, 2));
1334
__m128i diff = _mm_sub_epi16(c, c1);
1335
diff = _mm_max_epi16(diff, _mm_set1_epi16(-4));
1336
diff = _mm_min_epi16(diff, _mm_set1_epi16(3));
1337
1338
__m128i co = _mm_add_epi16(c1, diff);
1339
1340
c = _mm_blend_epi16(co, c, 0xF0);
1341
1342
__m128i a0 = _mm_or_si128(_mm_slli_epi16(c, 3), _mm_srli_epi16(c, 2));
1343
1344
_mm_storeu_si128((__m128i*)a[4+i*2].data(), a0);
1345
}
1346
1347
for( int i=0; i<2; i++ )
1348
{
1349
__m128i d = _mm_loadu_si128((__m128i*)a[i*2].data());
1350
1351
__m128i t0 = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(15)), _mm_set1_epi16(128));
1352
__m128i t1 = _mm_srli_epi16(_mm_add_epi16(t0, _mm_srli_epi16(t0, 8)), 8);
1353
1354
__m128i t2 = _mm_or_si128(t1, _mm_slli_epi16(t1, 4));
1355
1356
_mm_storeu_si128((__m128i*)a[i*2].data(), t2);
1357
}
1358
#elif defined __ARM_NEON
1359
for( int i=0; i<2; i++ )
1360
{
1361
int16x8_t d = vld1q_s16((int16_t*)&a[i*2]);
1362
int16x8_t t = vaddq_s16(vmulq_s16(d, vdupq_n_s16(31)), vdupq_n_s16(128));
1363
int16x8_t c = vshrq_n_s16(vaddq_s16(t, vshrq_n_s16(t, 8)), 8);
1364
1365
int16x8_t c1 = vcombine_s16(vget_high_s16(c), vget_high_s16(c));
1366
int16x8_t diff = vsubq_s16(c, c1);
1367
diff = vmaxq_s16(diff, vdupq_n_s16(-4));
1368
diff = vminq_s16(diff, vdupq_n_s16(3));
1369
1370
int16x8_t co = vaddq_s16(c1, diff);
1371
1372
c = vcombine_s16(vget_low_s16(co), vget_high_s16(c));
1373
1374
int16x8_t a0 = vorrq_s16(vshlq_n_s16(c, 3), vshrq_n_s16(c, 2));
1375
1376
vst1q_s16((int16_t*)&a[4+i*2], a0);
1377
}
1378
1379
for( int i=0; i<2; i++ )
1380
{
1381
int16x8_t d = vld1q_s16((int16_t*)&a[i*2]);
1382
1383
int16x8_t t0 = vaddq_s16(vmulq_s16(d, vdupq_n_s16(15)), vdupq_n_s16(128));
1384
int16x8_t t1 = vshrq_n_s16(vaddq_s16(t0, vshrq_n_s16(t0, 8)), 8);
1385
1386
int16x8_t t2 = vorrq_s16(t1, vshlq_n_s16(t1, 4));
1387
1388
vst1q_s16((int16_t*)&a[i*2], t2);
1389
}
1390
#else
1391
for( int i=0; i<2; i++ )
1392
{
1393
for( int j=0; j<3; j++ )
1394
{
1395
int32_t c1 = mul8bit( a[i*2+1][j], 31 );
1396
int32_t c2 = mul8bit( a[i*2][j], 31 );
1397
1398
int32_t diff = c2 - c1;
1399
if( diff > 3 ) diff = 3;
1400
else if( diff < -4 ) diff = -4;
1401
1402
int32_t co = c1 + diff;
1403
1404
a[5+i*2][j] = ( c1 << 3 ) | ( c1 >> 2 );
1405
a[4+i*2][j] = ( co << 3 ) | ( co >> 2 );
1406
}
1407
}
1408
1409
for( int i=0; i<4; i++ )
1410
{
1411
a[i][0] = g_avg2[mul8bit( a[i][0], 15 )];
1412
a[i][1] = g_avg2[mul8bit( a[i][1], 15 )];
1413
a[i][2] = g_avg2[mul8bit( a[i][2], 15 )];
1414
}
1415
#endif
1416
}
1417
1418
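// Note: idx is the index of the winning error configuration and is written straight into the block
// header via (idx << 24); bit 0 appears to be the flip bit (split orientation) and bit 1 the
// differential-mode bit. In individual mode the two 4-bit base colors share each channel byte as
// nibbles; in differential mode the 5-bit color derived from a[base+1] goes into the top five bits
// of each channel byte and a 3-bit two's-complement delta towards a[base+0] into the low three bits
// (the `c &= ~0xFFFFFFF8` keeps just those 3 bits).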
static etcpak_force_inline void EncodeAverages( uint64_t& _d, const v4i* a, size_t idx )
1419
{
1420
auto d = _d;
1421
d |= ( idx << 24 );
1422
size_t base = idx << 1;
1423
1424
if( ( idx & 0x2 ) == 0 )
1425
{
1426
for( int i=0; i<3; i++ )
1427
{
1428
d |= uint64_t( a[base+0][i] >> 4 ) << ( i*8 );
1429
d |= uint64_t( a[base+1][i] >> 4 ) << ( i*8 + 4 );
1430
}
1431
}
1432
else
1433
{
1434
for( int i=0; i<3; i++ )
1435
{
1436
d |= uint64_t( a[base+1][i] & 0xF8 ) << ( i*8 );
1437
int32_t c = ( ( a[base+0][i] & 0xF8 ) - ( a[base+1][i] & 0xF8 ) ) >> 3;
1438
c &= ~0xFFFFFFF8;
1439
d |= ((uint64_t)c) << ( i*8 );
1440
}
1441
}
1442
_d = d;
1443
}
1444
1445
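// Note: a uniform block can be emitted immediately. The value returned below sets the
// differential-mode bit (0x02000000), stores the first pixel truncated to 5 bits per channel as the
// base color, and leaves the delta and all selector bits at zero. A return value of 0 means the
// block is not uniform and the full search has to run.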
static etcpak_force_inline uint64_t CheckSolid( const uint8_t* src )
1446
{
1447
#ifdef __SSE4_1__
1448
__m128i d0 = _mm_loadu_si128(((__m128i*)src) + 0);
1449
__m128i d1 = _mm_loadu_si128(((__m128i*)src) + 1);
1450
__m128i d2 = _mm_loadu_si128(((__m128i*)src) + 2);
1451
__m128i d3 = _mm_loadu_si128(((__m128i*)src) + 3);
1452
1453
__m128i c = _mm_shuffle_epi32(d0, _MM_SHUFFLE(0, 0, 0, 0));
1454
1455
__m128i c0 = _mm_cmpeq_epi8(d0, c);
1456
__m128i c1 = _mm_cmpeq_epi8(d1, c);
1457
__m128i c2 = _mm_cmpeq_epi8(d2, c);
1458
__m128i c3 = _mm_cmpeq_epi8(d3, c);
1459
1460
__m128i m0 = _mm_and_si128(c0, c1);
1461
__m128i m1 = _mm_and_si128(c2, c3);
1462
__m128i m = _mm_and_si128(m0, m1);
1463
1464
if (!_mm_testc_si128(m, _mm_set1_epi32(-1)))
1465
{
1466
return 0;
1467
}
1468
#elif defined __ARM_NEON
1469
int32x4_t d0 = vld1q_s32((int32_t*)src + 0);
1470
int32x4_t d1 = vld1q_s32((int32_t*)src + 4);
1471
int32x4_t d2 = vld1q_s32((int32_t*)src + 8);
1472
int32x4_t d3 = vld1q_s32((int32_t*)src + 12);
1473
1474
int32x4_t c = vdupq_n_s32(d0[0]);
1475
1476
int32x4_t c0 = vreinterpretq_s32_u32(vceqq_s32(d0, c));
1477
int32x4_t c1 = vreinterpretq_s32_u32(vceqq_s32(d1, c));
1478
int32x4_t c2 = vreinterpretq_s32_u32(vceqq_s32(d2, c));
1479
int32x4_t c3 = vreinterpretq_s32_u32(vceqq_s32(d3, c));
1480
1481
int32x4_t m0 = vandq_s32(c0, c1);
1482
int32x4_t m1 = vandq_s32(c2, c3);
1483
int64x2_t m = vreinterpretq_s64_s32(vandq_s32(m0, m1));
1484
1485
if (m[0] != -1 || m[1] != -1)
1486
{
1487
return 0;
1488
}
1489
#else
1490
const uint8_t* ptr = src + 4;
1491
for( int i=1; i<16; i++ )
1492
{
1493
if( memcmp( src, ptr, 4 ) != 0 )
1494
{
1495
return 0;
1496
}
1497
ptr += 4;
1498
}
1499
#endif
1500
return 0x02000000 |
1501
( (unsigned int)( src[0] & 0xF8 ) << 16 ) |
1502
( (unsigned int)( src[1] & 0xF8 ) << 8 ) |
1503
( (unsigned int)( src[2] & 0xF8 ) );
1504
}
1505
1506
static etcpak_force_inline void PrepareAverages( v4i a[8], const uint8_t* src, unsigned int err[4] )
1507
{
1508
Average( src, a );
1509
ProcessAverages( a );
1510
1511
unsigned int errblock[4][4];
1512
CalcErrorBlock( src, errblock );
1513
1514
for( int i=0; i<4; i++ )
1515
{
1516
err[i/2] += CalcError( errblock[i], a[i] );
1517
err[2+i/2] += CalcError( errblock[i], a[i+4] );
1518
}
1519
}
1520
1521
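// Note: for every pixel the error against each candidate modifier is measured along the luma axis
// only: dr*77 + dg*151 + db*28 projects the RGB difference onto luma (the weights sum to 256). For
// each of the 8 ETC1 modifier tables the best 2-bit selector is stored in tsel[pixel][table] and the
// squared luma error is accumulated per subblock in terr[subblock][table]; EncodeSelectors later
// picks the table with the smallest total and emits the matching selectors.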
static etcpak_force_inline void FindBestFit( uint64_t terr[2][8], uint16_t tsel[16][8], v4i a[8], const uint32_t* id, const uint8_t* data )
1522
{
1523
for( size_t i=0; i<16; i++ )
1524
{
1525
uint16_t* sel = tsel[i];
1526
unsigned int bid = id[i];
1527
uint64_t* ter = terr[bid%2];
1528
1529
uint8_t b = *data++;
1530
uint8_t g = *data++;
1531
uint8_t r = *data++;
1532
data++;
1533
1534
int dr = a[bid][0] - r;
1535
int dg = a[bid][1] - g;
1536
int db = a[bid][2] - b;
1537
1538
#ifdef __SSE4_1__
1539
// Reference implementation
1540
1541
__m128i pix = _mm_set1_epi32(dr * 77 + dg * 151 + db * 28);
1542
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
1543
__m128i error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[0]));
1544
__m128i error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[1]));
1545
__m128i error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[0]));
1546
__m128i error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[1]));
1547
1548
__m128i index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1));
1549
__m128i minError0 = _mm_min_epi32(error0, error1);
1550
1551
__m128i index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2));
1552
__m128i minError1 = _mm_min_epi32(error2, error3);
1553
1554
__m128i minIndex0 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0));
1555
__m128i minError = _mm_min_epi32(minError0, minError1);
1556
1557
// Squaring the minimum error to produce correct values when adding
1558
__m128i minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0));
1559
__m128i squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow);
1560
squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0));
1561
_mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow);
1562
__m128i minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2));
1563
__m128i squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh);
1564
squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1));
1565
_mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh);
1566
1567
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
1568
error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[2]));
1569
error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[3]));
1570
error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[2]));
1571
error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[3]));
1572
1573
index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1));
1574
minError0 = _mm_min_epi32(error0, error1);
1575
1576
index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2));
1577
minError1 = _mm_min_epi32(error2, error3);
1578
1579
__m128i minIndex1 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0));
1580
minError = _mm_min_epi32(minError0, minError1);
1581
1582
// Squaring the minimum error to produce correct values when adding
1583
minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0));
1584
squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow);
1585
squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 2));
1586
_mm_storeu_si128(((__m128i*)ter) + 2, squareErrorLow);
1587
minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2));
1588
squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh);
1589
squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 3));
1590
_mm_storeu_si128(((__m128i*)ter) + 3, squareErrorHigh);
1591
__m128i minIndex = _mm_packs_epi32(minIndex0, minIndex1);
1592
_mm_storeu_si128((__m128i*)sel, minIndex);
1593
#elif defined __ARM_NEON
1594
int32x4_t pix = vdupq_n_s32(dr * 77 + dg * 151 + db * 28);
1595
1596
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
1597
uint32x4_t error0 = vreinterpretq_u32_s32(vabsq_s32(vaddq_s32(pix, g_table256_NEON[0])));
1598
uint32x4_t error1 = vreinterpretq_u32_s32(vabsq_s32(vaddq_s32(pix, g_table256_NEON[1])));
1599
uint32x4_t error2 = vreinterpretq_u32_s32(vabsq_s32(vsubq_s32(pix, g_table256_NEON[0])));
1600
uint32x4_t error3 = vreinterpretq_u32_s32(vabsq_s32(vsubq_s32(pix, g_table256_NEON[1])));
1601
1602
uint32x4_t index0 = vandq_u32(vcltq_u32(error1, error0), vdupq_n_u32(1));
1603
uint32x4_t minError0 = vminq_u32(error0, error1);
1604
1605
uint32x4_t index1 = vreinterpretq_u32_s32(vsubq_s32(vdupq_n_s32(2), vreinterpretq_s32_u32(vcltq_u32(error3, error2))));
1606
uint32x4_t minError1 = vminq_u32(error2, error3);
1607
1608
uint32x4_t blendMask = vcltq_u32(minError1, minError0);
1609
uint32x4_t minIndex0 = vorrq_u32(vbicq_u32(index0, blendMask), vandq_u32(index1, blendMask));
1610
uint32x4_t minError = vminq_u32(minError0, minError1);
1611
1612
// Squaring the minimum error to produce correct values when adding
1613
uint32x4_t squareErrorLow = vmulq_u32(minError, minError);
1614
uint32x4_t squareErrorHigh = vshrq_n_u32(vreinterpretq_u32_s32(vqdmulhq_s32(vreinterpretq_s32_u32(minError), vreinterpretq_s32_u32(minError))), 1);
1615
uint32x4x2_t squareErrorZip = vzipq_u32(squareErrorLow, squareErrorHigh);
1616
uint64x2x2_t squareError = { vreinterpretq_u64_u32(squareErrorZip.val[0]), vreinterpretq_u64_u32(squareErrorZip.val[1]) };
1617
squareError.val[0] = vaddq_u64(squareError.val[0], vld1q_u64(ter + 0));
1618
squareError.val[1] = vaddq_u64(squareError.val[1], vld1q_u64(ter + 2));
1619
vst1q_u64(ter + 0, squareError.val[0]);
1620
vst1q_u64(ter + 2, squareError.val[1]);
1621
1622
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
1623
error0 = vreinterpretq_u32_s32( vabsq_s32(vaddq_s32(pix, g_table256_NEON[2])));
1624
error1 = vreinterpretq_u32_s32( vabsq_s32(vaddq_s32(pix, g_table256_NEON[3])));
1625
error2 = vreinterpretq_u32_s32( vabsq_s32(vsubq_s32(pix, g_table256_NEON[2])));
1626
error3 = vreinterpretq_u32_s32( vabsq_s32(vsubq_s32(pix, g_table256_NEON[3])));
1627
1628
index0 = vandq_u32(vcltq_u32(error1, error0), vdupq_n_u32(1));
1629
minError0 = vminq_u32(error0, error1);
1630
1631
index1 = vreinterpretq_u32_s32( vsubq_s32(vdupq_n_s32(2), vreinterpretq_s32_u32(vcltq_u32(error3, error2))) );
1632
minError1 = vminq_u32(error2, error3);
1633
1634
blendMask = vcltq_u32(minError1, minError0);
1635
uint32x4_t minIndex1 = vorrq_u32(vbicq_u32(index0, blendMask), vandq_u32(index1, blendMask));
1636
minError = vminq_u32(minError0, minError1);
1637
1638
// Squaring the minimum error to produce correct values when adding
1639
squareErrorLow = vmulq_u32(minError, minError);
1640
squareErrorHigh = vshrq_n_u32(vreinterpretq_u32_s32( vqdmulhq_s32(vreinterpretq_s32_u32(minError), vreinterpretq_s32_u32(minError)) ), 1 );
1641
squareErrorZip = vzipq_u32(squareErrorLow, squareErrorHigh);
1642
squareError.val[0] = vaddq_u64(vreinterpretq_u64_u32( squareErrorZip.val[0] ), vld1q_u64(ter + 4));
1643
squareError.val[1] = vaddq_u64(vreinterpretq_u64_u32( squareErrorZip.val[1] ), vld1q_u64(ter + 6));
1644
vst1q_u64(ter + 4, squareError.val[0]);
1645
vst1q_u64(ter + 6, squareError.val[1]);
1646
1647
uint16x8_t minIndex = vcombine_u16(vqmovn_u32(minIndex0), vqmovn_u32(minIndex1));
1648
vst1q_u16(sel, minIndex);
1649
#else
1650
int pix = dr * 77 + dg * 151 + db * 28;
1651
1652
for( int t=0; t<8; t++ )
1653
{
1654
const int64_t* tab = g_table256[t];
1655
unsigned int idx = 0;
1656
uint64_t err = sq( tab[0] + pix );
1657
for( int j=1; j<4; j++ )
1658
{
1659
uint64_t local = sq( tab[j] + pix );
1660
if( local < err )
1661
{
1662
err = local;
1663
idx = j;
1664
}
1665
}
1666
*sel++ = idx;
1667
*ter++ += err;
1668
}
1669
#endif
1670
}
1671
}
1672
1673
#if defined __SSE4_1__ || defined __ARM_NEON
1674
// Non-reference implementation, but faster. Produces the same results as the AVX2 version
1675
static etcpak_force_inline void FindBestFit( uint32_t terr[2][8], uint16_t tsel[16][8], v4i a[8], const uint32_t* id, const uint8_t* data )
1676
{
1677
for( size_t i=0; i<16; i++ )
1678
{
1679
uint16_t* sel = tsel[i];
1680
unsigned int bid = id[i];
1681
uint32_t* ter = terr[bid%2];
1682
1683
uint8_t b = *data++;
1684
uint8_t g = *data++;
1685
uint8_t r = *data++;
1686
data++;
1687
1688
int dr = a[bid][0] - r;
1689
int dg = a[bid][1] - g;
1690
int db = a[bid][2] - b;
1691
1692
#ifdef __SSE4_1__
1693
// The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16
1694
// This produces slightly different results, but is significantly faster
1695
__m128i pixel = _mm_set1_epi16(dr * 38 + dg * 76 + db * 14);
1696
__m128i pix = _mm_abs_epi16(pixel);
1697
1698
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
1699
// Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
1700
__m128i error0 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[0]));
1701
__m128i error1 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[1]));
1702
1703
__m128i index = _mm_and_si128(_mm_cmplt_epi16(error1, error0), _mm_set1_epi16(1));
1704
__m128i minError = _mm_min_epi16(error0, error1);
1705
1706
// Exploit the symmetry of the selector table and use the sign bit
1707
// This produces slightly different results, but is needed to match the AVX2 implementation
1708
__m128i indexBit = _mm_andnot_si128(_mm_srli_epi16(pixel, 15), _mm_set1_epi8(-1));
1709
__m128i minIndex = _mm_or_si128(index, _mm_add_epi16(indexBit, indexBit));
1710
1711
// Squaring the minimum error to produce correct values when adding
1712
__m128i squareErrorLo = _mm_mullo_epi16(minError, minError);
1713
__m128i squareErrorHi = _mm_mulhi_epi16(minError, minError);
1714
1715
__m128i squareErrorLow = _mm_unpacklo_epi16(squareErrorLo, squareErrorHi);
1716
__m128i squareErrorHigh = _mm_unpackhi_epi16(squareErrorLo, squareErrorHi);
1717
1718
squareErrorLow = _mm_add_epi32(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0));
1719
_mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow);
1720
squareErrorHigh = _mm_add_epi32(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1));
1721
_mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh);
1722
1723
_mm_storeu_si128((__m128i*)sel, minIndex);
1724
#elif defined __ARM_NEON
1725
int16x8_t pixel = vdupq_n_s16( dr * 38 + dg * 76 + db * 14 );
1726
int16x8_t pix = vabsq_s16( pixel );
1727
1728
int16x8_t error0 = vabsq_s16( vsubq_s16( pix, g_table128_NEON[0] ) );
1729
int16x8_t error1 = vabsq_s16( vsubq_s16( pix, g_table128_NEON[1] ) );
1730
1731
int16x8_t index = vandq_s16( vreinterpretq_s16_u16( vcltq_s16( error1, error0 ) ), vdupq_n_s16( 1 ) );
1732
int16x8_t minError = vminq_s16( error0, error1 );
1733
1734
int16x8_t indexBit = vandq_s16( vmvnq_s16( vshrq_n_s16( pixel, 15 ) ), vdupq_n_s16( -1 ) );
1735
int16x8_t minIndex = vorrq_s16( index, vaddq_s16( indexBit, indexBit ) );
1736
1737
int16x4_t minErrorLow = vget_low_s16( minError );
1738
int16x4_t minErrorHigh = vget_high_s16( minError );
1739
1740
int32x4_t squareErrorLow = vmull_s16( minErrorLow, minErrorLow );
1741
int32x4_t squareErrorHigh = vmull_s16( minErrorHigh, minErrorHigh );
1742
1743
int32x4_t squareErrorSumLow = vaddq_s32( squareErrorLow, vld1q_s32( (int32_t*)ter ) );
1744
int32x4_t squareErrorSumHigh = vaddq_s32( squareErrorHigh, vld1q_s32( (int32_t*)ter + 4 ) );
1745
1746
vst1q_s32( (int32_t*)ter, squareErrorSumLow );
1747
vst1q_s32( (int32_t*)ter + 4, squareErrorSumHigh );
1748
1749
vst1q_s16( (int16_t*)sel, minIndex );
1750
#endif
1751
}
1752
}
1753
#endif
1754
1755
static etcpak_force_inline uint8_t convert6(float f)
1756
{
1757
int i = (std::min(std::max(static_cast<int>(f), 0), 1023) - 15) >> 1;
1758
return (i + 11 - ((i + 11) >> 7) - ((i + 4) >> 7)) >> 3;
1759
}
1760
1761
static etcpak_force_inline uint8_t convert7(float f)
1762
{
1763
int i = (std::min(std::max(static_cast<int>(f), 0), 1023) - 15) >> 1;
1764
return (i + 9 - ((i + 9) >> 8) - ((i + 6) >> 8)) >> 2;
1765
}
1766
1767
static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar( const uint8_t* src, const uint8_t mode, bool useHeuristics )
1768
{
1769
int32_t r = 0;
1770
int32_t g = 0;
1771
int32_t b = 0;
1772
1773
for( int i = 0; i < 16; ++i )
1774
{
1775
b += src[i * 4 + 0];
1776
g += src[i * 4 + 1];
1777
r += src[i * 4 + 2];
1778
}
1779
1780
int32_t difRyz = 0;
1781
int32_t difGyz = 0;
1782
int32_t difByz = 0;
1783
int32_t difRxz = 0;
1784
int32_t difGxz = 0;
1785
int32_t difBxz = 0;
1786
1787
const int32_t scaling[] = { -255, -85, 85, 255 };
1788
1789
for (int i = 0; i < 16; ++i)
1790
{
1791
int32_t difB = (static_cast<int>(src[i * 4 + 0]) << 4) - b;
1792
int32_t difG = (static_cast<int>(src[i * 4 + 1]) << 4) - g;
1793
int32_t difR = (static_cast<int>(src[i * 4 + 2]) << 4) - r;
1794
1795
difRyz += difR * scaling[i % 4];
1796
difGyz += difG * scaling[i % 4];
1797
difByz += difB * scaling[i % 4];
1798
1799
difRxz += difR * scaling[i / 4];
1800
difGxz += difG * scaling[i / 4];
1801
difBxz += difB * scaling[i / 4];
1802
}
1803
1804
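// Note: this is a per-channel least-squares fit of a plane to the block. scaling[] takes each of
// +-255 and +-85 four times along either axis, so the sum of squared weights over the 16 pixels is
// 8*(255^2 + 85^2); the extra /16 in `scale` compensates for difR/difG/difB above being 16 times the
// centered pixel values. The fitted colors are kept at roughly 4x 8-bit precision (hence
// dR = r * 4/16 below and the 0..1023 clamp inside convert6/convert7).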
const float scale = -4.0f / ((255 * 255 * 8.0f + 85 * 85 * 8.0f) * 16.0f);
1805
1806
float aR = difRxz * scale;
1807
float aG = difGxz * scale;
1808
float aB = difBxz * scale;
1809
1810
float bR = difRyz * scale;
1811
float bG = difGyz * scale;
1812
float bB = difByz * scale;
1813
1814
float dR = r * (4.0f / 16.0f);
1815
float dG = g * (4.0f / 16.0f);
1816
float dB = b * (4.0f / 16.0f);
1817
1818
// calculating the three colors RGBO, RGBH, and RGBV. RGB = df - af * x - bf * y;
1819
float cofR = std::fma(aR, 255.0f, std::fma(bR, 255.0f, dR));
1820
float cofG = std::fma(aG, 255.0f, std::fma(bG, 255.0f, dG));
1821
float cofB = std::fma(aB, 255.0f, std::fma(bB, 255.0f, dB));
1822
float chfR = std::fma(aR, -425.0f, std::fma(bR, 255.0f, dR));
1823
float chfG = std::fma(aG, -425.0f, std::fma(bG, 255.0f, dG));
1824
float chfB = std::fma(aB, -425.0f, std::fma(bB, 255.0f, dB));
1825
float cvfR = std::fma(aR, 255.0f, std::fma(bR, -425.0f, dR));
1826
float cvfG = std::fma(aG, 255.0f, std::fma(bG, -425.0f, dG));
1827
float cvfB = std::fma(aB, 255.0f, std::fma(bB, -425.0f, dB));
1828
1829
// convert to r6g7b6
1830
int32_t coR = convert6(cofR);
1831
int32_t coG = convert7(cofG);
1832
int32_t coB = convert6(cofB);
1833
int32_t chR = convert6(chfR);
1834
int32_t chG = convert7(chfG);
1835
int32_t chB = convert6(chfB);
1836
int32_t cvR = convert6(cvfR);
1837
int32_t cvG = convert7(cvfG);
1838
int32_t cvB = convert6(cvfB);
1839
1840
// Error calculation
1841
uint64_t error = 0;
1842
if( ModePlanar != mode && useHeuristics )
1843
{
1844
auto ro0 = coR;
1845
auto go0 = coG;
1846
auto bo0 = coB;
1847
auto ro1 = ( ro0 >> 4 ) | ( ro0 << 2 );
1848
auto go1 = ( go0 >> 6 ) | ( go0 << 1 );
1849
auto bo1 = ( bo0 >> 4 ) | ( bo0 << 2 );
1850
auto ro2 = ( ro1 << 2 ) + 2;
1851
auto go2 = ( go1 << 2 ) + 2;
1852
auto bo2 = ( bo1 << 2 ) + 2;
1853
1854
auto rh0 = chR;
1855
auto gh0 = chG;
1856
auto bh0 = chB;
1857
auto rh1 = ( rh0 >> 4 ) | ( rh0 << 2 );
1858
auto gh1 = ( gh0 >> 6 ) | ( gh0 << 1 );
1859
auto bh1 = ( bh0 >> 4 ) | ( bh0 << 2 );
1860
1861
auto rh2 = rh1 - ro1;
1862
auto gh2 = gh1 - go1;
1863
auto bh2 = bh1 - bo1;
1864
1865
auto rv0 = cvR;
1866
auto gv0 = cvG;
1867
auto bv0 = cvB;
1868
auto rv1 = ( rv0 >> 4 ) | ( rv0 << 2 );
1869
auto gv1 = ( gv0 >> 6 ) | ( gv0 << 1 );
1870
auto bv1 = ( bv0 >> 4 ) | ( bv0 << 2 );
1871
1872
auto rv2 = rv1 - ro1;
1873
auto gv2 = gv1 - go1;
1874
auto bv2 = bv1 - bo1;
1875
for( int i = 0; i < 16; ++i )
1876
{
1877
int32_t cR = clampu8( ( rh2 * ( i / 4 ) + rv2 * ( i % 4 ) + ro2 ) >> 2 );
1878
int32_t cG = clampu8( ( gh2 * ( i / 4 ) + gv2 * ( i % 4 ) + go2 ) >> 2 );
1879
int32_t cB = clampu8( ( bh2 * ( i / 4 ) + bv2 * ( i % 4 ) + bo2 ) >> 2 );
1880
1881
int32_t difB = static_cast<int>( src[i * 4 + 0] ) - cB;
1882
int32_t difG = static_cast<int>( src[i * 4 + 1] ) - cG;
1883
int32_t difR = static_cast<int>( src[i * 4 + 2] ) - cR;
1884
1885
int32_t dif = difR * 38 + difG * 76 + difB * 14;
1886
1887
error += dif * dif;
1888
}
1889
}
1890
1891
/**/
1892
uint32_t rgbv = cvB | ( cvG << 6 ) | ( cvR << 13 );
1893
uint32_t rgbh = chB | ( chG << 6 ) | ( chR << 13 );
1894
uint32_t hi = rgbv | ( ( rgbh & 0x1FFF ) << 19 );
1895
uint32_t lo = ( chR & 0x1 ) | 0x2 | ( ( chR << 1 ) & 0x7C );
1896
lo |= ( ( coB & 0x07 ) << 7 ) | ( ( coB & 0x18 ) << 8 ) | ( ( coB & 0x20 ) << 11 );
1897
lo |= ( ( coG & 0x3F ) << 17 ) | ( ( coG & 0x40 ) << 18 );
1898
lo |= coR << 25;
1899
1900
const auto idx = ( coR & 0x20 ) | ( ( coG & 0x20 ) >> 1 ) | ( ( coB & 0x1E ) >> 1 );
1901
1902
lo |= g_flags[idx];
1903
1904
uint64_t result = static_cast<uint32_t>( _bswap( lo ) );
1905
result |= static_cast<uint64_t>( static_cast<uint32_t>( _bswap( hi ) ) ) << 32;
1906
1907
return std::make_pair( result, error );
1908
}
1909
1910
#ifdef __ARM_NEON
1911
1912
static etcpak_force_inline int32x2_t Planar_NEON_DifXZ( int16x8_t dif_lo, int16x8_t dif_hi )
1913
{
1914
int32x4_t dif0 = vmull_n_s16( vget_low_s16( dif_lo ), -255 );
1915
int32x4_t dif1 = vmull_n_s16( vget_high_s16( dif_lo ), -85 );
1916
int32x4_t dif2 = vmull_n_s16( vget_low_s16( dif_hi ), 85 );
1917
int32x4_t dif3 = vmull_n_s16( vget_high_s16( dif_hi ), 255 );
1918
int32x4_t dif4 = vaddq_s32( vaddq_s32( dif0, dif1 ), vaddq_s32( dif2, dif3 ) );
1919
1920
#ifndef __aarch64__
1921
int32x2_t dif5 = vpadd_s32( vget_low_s32( dif4 ), vget_high_s32( dif4 ) );
1922
return vpadd_s32( dif5, dif5 );
1923
#else
1924
return vdup_n_s32( vaddvq_s32( dif4 ) );
1925
#endif
1926
}
1927
1928
static etcpak_force_inline int32x2_t Planar_NEON_DifYZ( int16x8_t dif_lo, int16x8_t dif_hi )
1929
{
1930
int16x4_t scaling = { -255, -85, 85, 255 };
1931
int32x4_t dif0 = vmull_s16( vget_low_s16( dif_lo ), scaling );
1932
int32x4_t dif1 = vmull_s16( vget_high_s16( dif_lo ), scaling );
1933
int32x4_t dif2 = vmull_s16( vget_low_s16( dif_hi ), scaling );
1934
int32x4_t dif3 = vmull_s16( vget_high_s16( dif_hi ), scaling );
1935
int32x4_t dif4 = vaddq_s32( vaddq_s32( dif0, dif1 ), vaddq_s32( dif2, dif3 ) );
1936
1937
#ifndef __aarch64__
1938
int32x2_t dif5 = vpadd_s32( vget_low_s32( dif4 ), vget_high_s32( dif4 ) );
1939
return vpadd_s32( dif5, dif5 );
1940
#else
1941
return vdup_n_s32( vaddvq_s32( dif4 ) );
1942
#endif
1943
}
1944
1945
static etcpak_force_inline int16x8_t Planar_NEON_SumWide( uint8x16_t src )
1946
{
1947
uint16x8_t accu8 = vpaddlq_u8( src );
1948
#ifndef __aarch64__
1949
uint16x4_t accu4 = vpadd_u16( vget_low_u16( accu8 ), vget_high_u16( accu8 ) );
1950
uint16x4_t accu2 = vpadd_u16( accu4, accu4 );
1951
uint16x4_t accu1 = vpadd_u16( accu2, accu2 );
1952
return vreinterpretq_s16_u16( vcombine_u16( accu1, accu1 ) );
1953
#else
1954
return vdupq_n_s16( vaddvq_u16( accu8 ) );
1955
#endif
1956
}
1957
1958
static etcpak_force_inline int16x8_t convert6_NEON( int32x4_t lo, int32x4_t hi )
1959
{
1960
uint16x8_t x = vcombine_u16( vqmovun_s32( lo ), vqmovun_s32( hi ) );
1961
int16x8_t i = vreinterpretq_s16_u16( vshrq_n_u16( vqshlq_n_u16( x, 6 ), 6) ); // clamp 0-1023
1962
i = vhsubq_s16( i, vdupq_n_s16( 15 ) );
1963
1964
int16x8_t ip11 = vaddq_s16( i, vdupq_n_s16( 11 ) );
1965
int16x8_t ip4 = vaddq_s16( i, vdupq_n_s16( 4 ) );
1966
1967
return vshrq_n_s16( vsubq_s16( vsubq_s16( ip11, vshrq_n_s16( ip11, 7 ) ), vshrq_n_s16( ip4, 7) ), 3 );
1968
}
1969
1970
static etcpak_force_inline int16x4_t convert7_NEON( int32x4_t x )
1971
{
1972
int16x4_t i = vreinterpret_s16_u16( vshr_n_u16( vqshl_n_u16( vqmovun_s32( x ), 6 ), 6 ) ); // clamp 0-1023
1973
i = vhsub_s16( i, vdup_n_s16( 15 ) );
1974
1975
int16x4_t p9 = vadd_s16( i, vdup_n_s16( 9 ) );
1976
int16x4_t p6 = vadd_s16( i, vdup_n_s16( 6 ) );
1977
return vshr_n_s16( vsub_s16( vsub_s16( p9, vshr_n_s16( p9, 8 ) ), vshr_n_s16( p6, 8 ) ), 2 );
1978
}
1979
1980
static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar_NEON( const uint8_t* src, const uint8_t mode, bool useHeuristics )
1981
{
1982
uint8x16x4_t srcBlock = vld4q_u8( src );
1983
1984
int16x8_t bSumWide = Planar_NEON_SumWide( srcBlock.val[0] );
1985
int16x8_t gSumWide = Planar_NEON_SumWide( srcBlock.val[1] );
1986
int16x8_t rSumWide = Planar_NEON_SumWide( srcBlock.val[2] );
1987
1988
int16x8_t dif_R_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[2] ), 4) ), rSumWide );
1989
int16x8_t dif_R_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[2] ), 4) ), rSumWide );
1990
1991
int16x8_t dif_G_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[1] ), 4 ) ), gSumWide );
1992
int16x8_t dif_G_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[1] ), 4 ) ), gSumWide );
1993
1994
int16x8_t dif_B_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[0] ), 4) ), bSumWide );
1995
int16x8_t dif_B_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[0] ), 4) ), bSumWide );
1996
1997
int32x2x2_t dif_xz_z = vzip_s32( vzip_s32( Planar_NEON_DifXZ( dif_B_lo, dif_B_hi ), Planar_NEON_DifXZ( dif_R_lo, dif_R_hi ) ).val[0], Planar_NEON_DifXZ( dif_G_lo, dif_G_hi ) );
1998
int32x4_t dif_xz = vcombine_s32( dif_xz_z.val[0], dif_xz_z.val[1] );
1999
int32x2x2_t dif_yz_z = vzip_s32( vzip_s32( Planar_NEON_DifYZ( dif_B_lo, dif_B_hi ), Planar_NEON_DifYZ( dif_R_lo, dif_R_hi ) ).val[0], Planar_NEON_DifYZ( dif_G_lo, dif_G_hi ) );
2000
int32x4_t dif_yz = vcombine_s32( dif_yz_z.val[0], dif_yz_z.val[1] );
2001
2002
const float fscale = -4.0f / ( (255 * 255 * 8.0f + 85 * 85 * 8.0f ) * 16.0f );
2003
float32x4_t fa = vmulq_n_f32( vcvtq_f32_s32( dif_xz ), fscale );
2004
float32x4_t fb = vmulq_n_f32( vcvtq_f32_s32( dif_yz ), fscale );
2005
int16x4_t bgrgSum = vzip_s16( vzip_s16( vget_low_s16( bSumWide ), vget_low_s16( rSumWide ) ).val[0], vget_low_s16( gSumWide ) ).val[0];
2006
float32x4_t fd = vmulq_n_f32( vcvtq_f32_s32( vmovl_s16( bgrgSum ) ), 4.0f / 16.0f);
2007
2008
float32x4_t cof = vmlaq_n_f32( vmlaq_n_f32( fd, fb, 255.0f ), fa, 255.0f );
2009
float32x4_t chf = vmlaq_n_f32( vmlaq_n_f32( fd, fb, 255.0f ), fa, -425.0f );
2010
float32x4_t cvf = vmlaq_n_f32( vmlaq_n_f32( fd, fb, -425.0f ), fa, 255.0f );
2011
2012
int32x4_t coi = vcvtq_s32_f32( cof );
2013
int32x4_t chi = vcvtq_s32_f32( chf );
2014
int32x4_t cvi = vcvtq_s32_f32( cvf );
2015
2016
int32x4x2_t tr_hv = vtrnq_s32( chi, cvi );
2017
int32x4x2_t tr_o = vtrnq_s32( coi, coi );
2018
2019
int16x8_t c_hvoo_br_6 = convert6_NEON( tr_hv.val[0], tr_o.val[0] );
2020
int16x4_t c_hvox_g_7 = convert7_NEON( vcombine_s32( vget_low_s32( tr_hv.val[1] ), vget_low_s32( tr_o.val[1] ) ) );
2021
int16x8_t c_hvoo_br_8 = vorrq_s16( vshrq_n_s16( c_hvoo_br_6, 4 ), vshlq_n_s16( c_hvoo_br_6, 2 ) );
2022
int16x4_t c_hvox_g_8 = vorr_s16( vshr_n_s16( c_hvox_g_7, 6 ), vshl_n_s16( c_hvox_g_7, 1 ) );
2023
2024
uint64_t error = 0;
2025
if( mode != ModePlanar && useHeuristics )
2026
{
2027
int16x4_t rec_gxbr_o = vext_s16( c_hvox_g_8, vget_high_s16( c_hvoo_br_8 ), 3 );
2028
2029
rec_gxbr_o = vadd_s16( vshl_n_s16( rec_gxbr_o, 2 ), vdup_n_s16( 2 ) );
2030
int16x8_t rec_ro_wide = vdupq_lane_s16( rec_gxbr_o, 3 );
2031
int16x8_t rec_go_wide = vdupq_lane_s16( rec_gxbr_o, 0 );
2032
int16x8_t rec_bo_wide = vdupq_lane_s16( rec_gxbr_o, 1 );
2033
2034
int16x4_t br_hv2 = vsub_s16( vget_low_s16( c_hvoo_br_8 ), vget_high_s16( c_hvoo_br_8 ) );
2035
int16x4_t gg_hv2 = vsub_s16( c_hvox_g_8, vdup_lane_s16( c_hvox_g_8, 2 ) );
2036
2037
int16x8_t scaleh_lo = { 0, 0, 0, 0, 1, 1, 1, 1 };
2038
int16x8_t scaleh_hi = { 2, 2, 2, 2, 3, 3, 3, 3 };
2039
int16x8_t scalev = { 0, 1, 2, 3, 0, 1, 2, 3 };
2040
2041
int16x8_t rec_r_1 = vmlaq_lane_s16( rec_ro_wide, scalev, br_hv2, 3 );
2042
int16x8_t rec_r_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_r_1, scaleh_lo, br_hv2, 2 ), 2 ) ) );
2043
int16x8_t rec_r_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_r_1, scaleh_hi, br_hv2, 2 ), 2 ) ) );
2044
2045
int16x8_t rec_b_1 = vmlaq_lane_s16( rec_bo_wide, scalev, br_hv2, 1 );
2046
int16x8_t rec_b_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_b_1, scaleh_lo, br_hv2, 0 ), 2 ) ) );
2047
int16x8_t rec_b_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_b_1, scaleh_hi, br_hv2, 0 ), 2 ) ) );
2048
2049
int16x8_t rec_g_1 = vmlaq_lane_s16( rec_go_wide, scalev, gg_hv2, 1 );
2050
int16x8_t rec_g_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_g_1, scaleh_lo, gg_hv2, 0 ), 2 ) ) );
2051
int16x8_t rec_g_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_g_1, scaleh_hi, gg_hv2, 0 ), 2 ) ) );
2052
2053
int16x8_t dif_r_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[2] ) ) ), rec_r_lo );
2054
int16x8_t dif_r_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[2] ) ) ), rec_r_hi );
2055
2056
int16x8_t dif_g_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[1] ) ) ), rec_g_lo );
2057
int16x8_t dif_g_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[1] ) ) ), rec_g_hi );
2058
2059
int16x8_t dif_b_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[0] ) ) ), rec_b_lo );
2060
int16x8_t dif_b_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[0] ) ) ), rec_b_hi );
2061
2062
int16x8_t dif_lo = vmlaq_n_s16( vmlaq_n_s16( vmulq_n_s16( dif_r_lo, 38 ), dif_g_lo, 76 ), dif_b_lo, 14 );
2063
int16x8_t dif_hi = vmlaq_n_s16( vmlaq_n_s16( vmulq_n_s16( dif_r_hi, 38 ), dif_g_hi, 76 ), dif_b_hi, 14 );
2064
2065
int16x4_t tmpDif = vget_low_s16( dif_lo );
2066
int32x4_t difsq_0 = vmull_s16( tmpDif, tmpDif );
2067
tmpDif = vget_high_s16( dif_lo );
2068
int32x4_t difsq_1 = vmull_s16( tmpDif, tmpDif );
2069
tmpDif = vget_low_s16( dif_hi );
2070
int32x4_t difsq_2 = vmull_s16( tmpDif, tmpDif );
2071
tmpDif = vget_high_s16( dif_hi );
2072
int32x4_t difsq_3 = vmull_s16( tmpDif, tmpDif );
2073
2074
uint32x4_t difsq_5 = vaddq_u32( vreinterpretq_u32_s32( difsq_0 ), vreinterpretq_u32_s32( difsq_1 ) );
2075
uint32x4_t difsq_6 = vaddq_u32( vreinterpretq_u32_s32( difsq_2 ), vreinterpretq_u32_s32( difsq_3 ) );
2076
2077
uint64x2_t difsq_7 = vaddl_u32( vget_low_u32( difsq_5 ), vget_high_u32( difsq_5 ) );
2078
uint64x2_t difsq_8 = vaddl_u32( vget_low_u32( difsq_6 ), vget_high_u32( difsq_6 ) );
2079
2080
uint64x2_t difsq_9 = vaddq_u64( difsq_7, difsq_8 );
2081
2082
#ifdef __aarch64__
2083
error = vaddvq_u64( difsq_9 );
2084
#else
2085
error = vgetq_lane_u64( difsq_9, 0 ) + vgetq_lane_u64( difsq_9, 1 );
2086
#endif
2087
}
2088
2089
int32_t coR = c_hvoo_br_6[6];
2090
int32_t coG = c_hvox_g_7[2];
2091
int32_t coB = c_hvoo_br_6[4];
2092
2093
int32_t chR = c_hvoo_br_6[2];
2094
int32_t chG = c_hvox_g_7[0];
2095
int32_t chB = c_hvoo_br_6[0];
2096
2097
int32_t cvR = c_hvoo_br_6[3];
2098
int32_t cvG = c_hvox_g_7[1];
2099
int32_t cvB = c_hvoo_br_6[1];
2100
2101
uint32_t rgbv = cvB | ( cvG << 6 ) | ( cvR << 13 );
2102
uint32_t rgbh = chB | ( chG << 6 ) | ( chR << 13 );
2103
uint32_t hi = rgbv | ( ( rgbh & 0x1FFF ) << 19 );
2104
uint32_t lo = ( chR & 0x1 ) | 0x2 | ( ( chR << 1 ) & 0x7C );
2105
lo |= ( ( coB & 0x07 ) << 7 ) | ( ( coB & 0x18 ) << 8 ) | ( ( coB & 0x20 ) << 11 );
2106
lo |= ( ( coG & 0x3F) << 17) | ( (coG & 0x40 ) << 18 );
2107
lo |= coR << 25;
2108
2109
const auto idx = ( coR & 0x20 ) | ( ( coG & 0x20 ) >> 1 ) | ( ( coB & 0x1E ) >> 1 );
2110
2111
lo |= g_flags[idx];
2112
2113
uint64_t result = static_cast<uint32_t>( _bswap(lo) );
2114
result |= static_cast<uint64_t>( static_cast<uint32_t>( _bswap( hi ) ) ) << 32;
2115
2116
return std::make_pair( result, error );
2117
}
2118
2119
#endif
2120
2121
#ifdef __AVX2__
2122
uint32_t calculateErrorTH( bool tMode, uint8_t( colorsRGB444 )[2][3], uint8_t& dist, uint32_t& pixIndices, uint8_t startDist, __m128i r8, __m128i g8, __m128i b8 )
2123
#else
2124
uint32_t calculateErrorTH( bool tMode, uint8_t* src, uint8_t( colorsRGB444 )[2][3], uint8_t& dist, uint32_t& pixIndices, uint8_t startDist )
2125
#endif
2126
{
2127
uint32_t blockErr = 0, bestBlockErr = MaxError;
2128
2129
uint32_t pixColors;
2130
uint8_t possibleColors[4][3];
2131
uint8_t colors[2][3];
2132
2133
decompressColor( colorsRGB444, colors );
2134
2135
#ifdef __AVX2__
2136
__m128i reverseMask = _mm_set_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 );
2137
#endif
2138
2139
// test distances
2140
for( uint8_t d = startDist; d < 8; ++d )
2141
{
2142
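// Stop early once the distance picked two candidates ago is still the best one, i.e. the previous
// candidate distance brought no improvement; larger distances are then unlikely to help.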
if( d >= 2 && dist == d - 2 ) break;
2143
2144
blockErr = 0;
2145
pixColors = 0;
2146
2147
if( tMode )
2148
{
2149
calculatePaintColors59T( d, colors, possibleColors );
2150
}
2151
else
2152
{
2153
calculatePaintColors58H( d, colors, possibleColors );
2154
}
2155
2156
#ifdef __AVX2__
2157
// RGB ordering
2158
__m128i b8Rev = _mm_shuffle_epi8( b8, reverseMask );
2159
__m128i g8Rev = _mm_shuffle_epi8( g8, reverseMask );
2160
__m128i r8Rev = _mm_shuffle_epi8( r8, reverseMask );
2161
2162
// extends 3x128-bit RGB into 3x256-bit RGB for error comparisons
2163
static const __m128i zero = _mm_setzero_si128();
2164
__m128i b8Lo = _mm_unpacklo_epi8( b8Rev, zero );
2165
__m128i g8Lo = _mm_unpacklo_epi8( g8Rev, zero );
2166
__m128i r8Lo = _mm_unpacklo_epi8( r8Rev, zero );
2167
__m128i b8Hi = _mm_unpackhi_epi8( b8Rev, zero );
2168
__m128i g8Hi = _mm_unpackhi_epi8( g8Rev, zero );
2169
__m128i r8Hi = _mm_unpackhi_epi8( r8Rev, zero );
2170
2171
__m256i b8 = _mm256_set_m128i( b8Hi, b8Lo );
2172
__m256i g8 = _mm256_set_m128i( g8Hi, g8Lo );
2173
__m256i r8 = _mm256_set_m128i( r8Hi, r8Lo );
2174
2175
// calculates differences between the pixel colors and the palette colors
2176
__m256i diffb = _mm256_abs_epi16( _mm256_sub_epi16( b8, _mm256_set1_epi16( possibleColors[0][B] ) ) );
2177
__m256i diffg = _mm256_abs_epi16( _mm256_sub_epi16( g8, _mm256_set1_epi16( possibleColors[0][G] ) ) );
2178
__m256i diffr = _mm256_abs_epi16( _mm256_sub_epi16( r8, _mm256_set1_epi16( possibleColors[0][R] ) ) );
2179
2180
// luma-based error calculations
2181
static const __m256i bWeight = _mm256_set1_epi16( 14 );
2182
static const __m256i gWeight = _mm256_set1_epi16( 76 );
2183
static const __m256i rWeight = _mm256_set1_epi16( 38 );
2184
2185
diffb = _mm256_mullo_epi16( diffb, bWeight );
2186
diffg = _mm256_mullo_epi16( diffg, gWeight );
2187
diffr = _mm256_mullo_epi16( diffr, rWeight );
2188
2189
// obtains the error with the current palette color
2190
__m256i lowestPixErr = _mm256_add_epi16( _mm256_add_epi16( diffb, diffg ), diffr );
2191
2192
// error calculations with the remaining three palette colors
2193
static const uint32_t masks[4] = { 0, 0x55555555, 0xAAAAAAAA, 0xFFFFFFFF };
2194
for( uint8_t c = 1; c < 4; c++ )
2195
{
2196
__m256i diffb = _mm256_abs_epi16( _mm256_sub_epi16( b8, _mm256_set1_epi16( possibleColors[c][B] ) ) );
2197
__m256i diffg = _mm256_abs_epi16( _mm256_sub_epi16( g8, _mm256_set1_epi16( possibleColors[c][G] ) ) );
2198
__m256i diffr = _mm256_abs_epi16( _mm256_sub_epi16( r8, _mm256_set1_epi16( possibleColors[c][R] ) ) );
2199
2200
diffb = _mm256_mullo_epi16( diffb, bWeight );
2201
diffg = _mm256_mullo_epi16( diffg, gWeight );
2202
diffr = _mm256_mullo_epi16( diffr, rWeight );
2203
2204
// error comparison with the previous best color
2205
__m256i pixErrors = _mm256_add_epi16( _mm256_add_epi16( diffb, diffg ), diffr );
2206
__m256i minErr = _mm256_min_epu16( lowestPixErr, pixErrors );
2207
__m256i cmpRes = _mm256_cmpeq_epi16( pixErrors, minErr );
2208
lowestPixErr = minErr;
2209
2210
// update pixel colors
2211
uint32_t updPixColors = _mm256_movemask_epi8( cmpRes );
2212
uint32_t prevPixColors = pixColors & ~updPixColors;
2213
uint32_t mskPixColors = masks[c] & updPixColors;
2214
pixColors = prevPixColors | mskPixColors;
2215
}
2216
2217
// accumulate the block error
2218
alignas( 32 ) uint16_t pixErr16[16] = { 0, };
2219
_mm256_storeu_si256( (__m256i*)pixErr16, lowestPixErr );
2220
for( uint8_t p = 0; p < 16; p++ )
2221
{
2222
blockErr += (int)( pixErr16[p] ) * pixErr16[p];
2223
}
2224
#else
2225
for( size_t y = 0; y < 4; ++y )
2226
{
2227
for( size_t x = 0; x < 4; ++x )
2228
{
2229
uint32_t bestPixErr = MaxError;
2230
pixColors <<= 2; // Make room for next value
2231
2232
// Loop possible block colors
2233
for( uint8_t c = 0; c < 4; ++c )
2234
{
2235
int diff[3];
2236
diff[R] = src[4 * ( x * 4 + y ) + R] - possibleColors[c][R];
2237
diff[G] = src[4 * ( x * 4 + y ) + G] - possibleColors[c][G];
2238
diff[B] = src[4 * ( x * 4 + y ) + B] - possibleColors[c][B];
2239
2240
const uint32_t err = 38 * abs( diff[R] ) + 76 * abs( diff[G] ) + 14 * abs( diff[B] );
2241
uint32_t pixErr = err * err;
2242
2243
// Choose best error
2244
if( pixErr < bestPixErr )
2245
{
2246
bestPixErr = pixErr;
2247
pixColors ^= ( pixColors & 3 ); // Reset the two first bits
2248
pixColors |= c;
2249
}
2250
}
2251
blockErr += bestPixErr;
2252
}
2253
}
2254
#endif
2255
2256
if( blockErr < bestBlockErr )
2257
{
2258
bestBlockErr = blockErr;
2259
dist = d;
2260
pixIndices = pixColors;
2261
}
2262
}
2263
2264
return bestBlockErr;
2265
}
2266
2267
2268
// main T-/H-mode compression function
2269
#ifdef __AVX2__
2270
uint32_t compressBlockTH( uint8_t* src, Luma& l, uint32_t& compressed1, uint32_t& compressed2, bool& tMode, __m128i r8, __m128i g8, __m128i b8 )
2271
#else
2272
uint32_t compressBlockTH( uint8_t *src, Luma& l, uint32_t& compressed1, uint32_t& compressed2, bool &tMode )
2273
#endif
2274
{
2275
#ifdef __AVX2__
2276
alignas( 8 ) uint8_t luma[16] = { 0, };
2277
_mm_storeu_si128 ( (__m128i* )luma, l.luma8 );
2278
#elif defined __ARM_NEON && defined __aarch64__
2279
alignas( 8 ) uint8_t luma[16] = { 0 };
2280
vst1q_u8( luma, l.luma8 );
2281
#else
2282
uint8_t* luma = l.val;
2283
#endif
2284
2285
uint8_t pixIdx[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
2286
2287
// 1) sorts the pairs of (luma, pix_idx)
2288
insertionSort( luma, pixIdx );
2289
2290
// 2) finds the split that minimizes the sum of the left and right luma ranges
2291
uint8_t minSumRangeIdx = 0;
2292
uint16_t minSumRangeValue;
2293
uint16_t sum;
2294
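// diffBonus is added to the cost of a split, so it mildly penalizes splits that leave only one, two
// or three pixels on either side of the sorted luma sequence.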
static const uint8_t diffBonus[15] = {8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 8};
2295
const int16_t temp = luma[15] - luma[0];
2296
2297
minSumRangeValue = luma[15] - luma[1] + diffBonus[0];
2298
for( uint8_t i = 1; i < 14; i++ )
2299
{
2300
sum = temp - luma[i+1] + luma[i] + diffBonus[i];
2301
if( minSumRangeValue > sum )
2302
{
2303
minSumRangeValue = sum;
2304
minSumRangeIdx = i;
2305
}
2306
}
2307
2308
sum = luma[14] - luma[0] + diffBonus[14];
2309
if( minSumRangeValue > sum )
2310
{
2311
minSumRangeValue = sum;
2312
minSumRangeIdx = 14;
2313
}
2314
uint8_t lRange, rRange;
2315
2316
lRange = luma[minSumRangeIdx] - luma[0];
2317
rRange = luma[15] - luma[minSumRangeIdx + 1];
2318
2319
// 3) sets a proper mode
2320
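// T-mode is preferred when one side of the split spans at least twice the luma range of the other
// (one tight cluster plus a spread-out group); otherwise tMode is left as passed in (H-mode).
// `swap` records the case where the low-luma side is the wide one, so the roles of the two groups
// are exchanged when the base colors are computed in step 4.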
bool swap = false;
2321
if( lRange >= rRange )
2322
{
2323
if( lRange >= rRange * 2 )
2324
{
2325
swap = true;
2326
tMode = true;
2327
}
2328
}
2329
else
2330
{
2331
if( lRange * 2 <= rRange ) tMode = true;
2332
}
2333
// 4) calculates the two base colors
2334
uint8_t rangeIdx[4] = { pixIdx[0], pixIdx[minSumRangeIdx], pixIdx[minSumRangeIdx + 1], pixIdx[15] };
2335
2336
uint16_t r[4], g[4], b[4];
2337
for( uint8_t i = 0; i < 4; ++i )
2338
{
2339
uint8_t idx = rangeIdx[i] * 4;
2340
b[i] = src[idx];
2341
g[i] = src[idx + 1];
2342
r[i] = src[idx + 2];
2343
}
2344
2345
uint8_t mid_rgb[2][3];
2346
if( swap )
2347
{
2348
mid_rgb[1][B] = ( b[0] + b[1] ) / 2;
2349
mid_rgb[1][G] = ( g[0] + g[1] ) / 2;
2350
mid_rgb[1][R] = ( r[0] + r[1] ) / 2;
2351
2352
uint16_t sum_rgb[3] = { 0, 0, 0 };
2353
for( uint8_t i = minSumRangeIdx + 1; i < 16; i++ )
2354
{
2355
uint8_t idx = pixIdx[i] * 4;
2356
sum_rgb[B] += src[idx];
2357
sum_rgb[G] += src[idx + 1];
2358
sum_rgb[R] += src[idx + 2];
2359
}
2360
const uint8_t temp = 15 - minSumRangeIdx;
2361
mid_rgb[0][B] = sum_rgb[B] / temp;
2362
mid_rgb[0][G] = sum_rgb[G] / temp;
2363
mid_rgb[0][R] = sum_rgb[R] / temp;
2364
}
2365
else
2366
{
2367
mid_rgb[0][B] = (b[0] + b[1]) / 2;
2368
mid_rgb[0][G] = (g[0] + g[1]) / 2;
2369
mid_rgb[0][R] = (r[0] + r[1]) / 2;
2370
if( tMode )
2371
{
2372
uint16_t sum_rgb[3] = { 0, 0, 0 };
2373
for( uint8_t i = minSumRangeIdx + 1; i < 16; i++ )
2374
{
2375
uint8_t idx = pixIdx[i] * 4;
2376
sum_rgb[B] += src[idx];
2377
sum_rgb[G] += src[idx + 1];
2378
sum_rgb[R] += src[idx + 2];
2379
}
2380
const uint8_t temp = 15 - minSumRangeIdx;
2381
mid_rgb[1][B] = sum_rgb[B] / temp;
2382
mid_rgb[1][G] = sum_rgb[G] / temp;
2383
mid_rgb[1][R] = sum_rgb[R] / temp;
2384
}
2385
else
2386
{
2387
mid_rgb[1][B] = (b[2] + b[3]) / 2;
2388
mid_rgb[1][G] = (g[2] + g[3]) / 2;
2389
mid_rgb[1][R] = (r[2] + r[3]) / 2;
2390
}
2391
}
2392
2393
// 5) sets the start distance index
2394
uint32_t startDistCandidate;
2395
uint32_t avgDist;
2396
if( tMode )
2397
{
2398
if( swap )
2399
{
2400
avgDist = ( b[1] - b[0] + g[1] - g[0] + r[1] - r[0] ) / 6;
2401
}
2402
else
2403
{
2404
avgDist = ( b[3] - b[2] + g[3] - g[2] + r[3] - r[2] ) / 6;
2405
}
2406
}
2407
else
2408
{
2409
avgDist = ( b[1] - b[0] + g[1] - g[0] + r[1] - r[0] + b[3] - b[2] + g[3] - g[2] + r[3] - r[2] ) / 12;
2410
}
2411
2412
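// The average distance between the paired extremes is mapped onto a starting index for the distance
// search; the thresholds below (16, 23, 32, 41) appear to be taken from tableTH, so calculateErrorTH
// starts near the most plausible table distance instead of always at zero.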
if( avgDist <= 16)
2413
{
2414
startDistCandidate = 0;
2415
}
2416
else if( avgDist <= 23 )
2417
{
2418
startDistCandidate = 1;
2419
}
2420
else if( avgDist <= 32 )
2421
{
2422
startDistCandidate = 2;
2423
}
2424
else if( avgDist <= 41 )
2425
{
2426
startDistCandidate = 3;
2427
}
2428
else
2429
{
2430
startDistCandidate = 4;
2431
}
2432
2433
uint32_t bestErr = MaxError;
2434
uint32_t bestPixIndices;
2435
uint8_t bestDist = 10;
2436
uint8_t colorsRGB444[2][3];
2437
compressColor( mid_rgb, colorsRGB444, tMode );
2438
compressed1 = 0;
2439
2440
// 6) finds the best candidate with the lowest error
2441
#ifdef __AVX2__
2442
// Vectorized version
2443
bestErr = calculateErrorTH( tMode, colorsRGB444, bestDist, bestPixIndices, startDistCandidate, r8, g8, b8 );
2444
#else
2445
// Scalar version
2446
bestErr = calculateErrorTH( tMode, src, colorsRGB444, bestDist, bestPixIndices, startDistCandidate );
2447
#endif
2448
2449
// 7) outputs the final T or H block
2450
if( tMode )
2451
{
2452
// Put the compression params into the compression block
2453
compressed1 |= ( colorsRGB444[0][R] & 0xf ) << 23;
2454
compressed1 |= ( colorsRGB444[0][G] & 0xf ) << 19;
2455
compressed1 |= ( colorsRGB444[0][B] ) << 15;
2456
compressed1 |= ( colorsRGB444[1][R] ) << 11;
2457
compressed1 |= ( colorsRGB444[1][G] ) << 7;
2458
compressed1 |= ( colorsRGB444[1][B] ) << 3;
2459
compressed1 |= bestDist & 0x7;
2460
}
2461
else
2462
{
2463
int bestRGB444ColPacked[2];
2464
bestRGB444ColPacked[0] = (colorsRGB444[0][R] << 8) + (colorsRGB444[0][G] << 4) + colorsRGB444[0][B];
2465
bestRGB444ColPacked[1] = (colorsRGB444[1][R] << 8) + (colorsRGB444[1][G] << 4) + colorsRGB444[1][B];
2466
if( ( bestRGB444ColPacked[0] >= bestRGB444ColPacked[1] ) ^ ( ( bestDist & 1 ) == 1 ) )
2467
{
2468
swapColors( colorsRGB444 );
2469
// Reshuffle pixel indices to exchange C1 with C3, and C2 with C4
2470
bestPixIndices = ( 0x55555555 & bestPixIndices ) | ( 0xaaaaaaaa & ( ~bestPixIndices ) );
2471
}
2472
2473
// Put the compression params into the compression block
2474
compressed1 |= ( colorsRGB444[0][R] & 0xf ) << 22;
2475
compressed1 |= ( colorsRGB444[0][G] & 0xf ) << 18;
2476
compressed1 |= ( colorsRGB444[0][B] & 0xf ) << 14;
2477
compressed1 |= ( colorsRGB444[1][R] & 0xf ) << 10;
2478
compressed1 |= ( colorsRGB444[1][G] & 0xf ) << 6;
2479
compressed1 |= ( colorsRGB444[1][B] & 0xf ) << 2;
2480
compressed1 |= ( bestDist >> 1 ) & 0x3;
2481
}
2482
2483
bestPixIndices = indexConversion( bestPixIndices );
2484
compressed2 = 0;
2485
compressed2 = ( compressed2 & ~( ( 0x2 << 31 ) - 1 ) ) | ( bestPixIndices & ( ( 2 << 31 ) - 1 ) );
2486
2487
return bestErr;
2488
}
2489
//#endif
2490
2491
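// Note: `value`/`error` are presumably the already-encoded alternative block (planar or T/H) and its
// error; if the best ETC1 selector tables cannot beat it, that block is returned unchanged. Otherwise
// the two winning table indices go into bits 26-28 and 29-31, and each pixel's 2-bit selector is
// split into its low bit (bits 32-47) and high bit (bits 48-63) before the final byte swap
// (e.g. pixel 3 with selector 0b10 sets only bit 51).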
template<class T, class S>
2492
static etcpak_force_inline uint64_t EncodeSelectors( uint64_t d, const T terr[2][8], const S tsel[16][8], const uint32_t* id, const uint64_t value, const uint64_t error)
2493
{
2494
size_t tidx[2];
2495
tidx[0] = GetLeastError( terr[0], 8 );
2496
tidx[1] = GetLeastError( terr[1], 8 );
2497
2498
if ((terr[0][tidx[0]] + terr[1][tidx[1]]) >= error)
2499
{
2500
return value;
2501
}
2502
2503
d |= tidx[0] << 26;
2504
d |= tidx[1] << 29;
2505
for( int i=0; i<16; i++ )
2506
{
2507
uint64_t t = tsel[i][tidx[id[i]%2]];
2508
d |= ( t & 0x1 ) << ( i + 32 );
2509
d |= ( t & 0x2 ) << ( i + 47 );
2510
}
2511
2512
return FixByteOrder(d);
2513
}
2514
2515
}
2516
2517
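// ETC1 entry point for one 4x4 block (BGRA input). Roughly: 1) emit solid-color blocks immediately,
// 2) compute the four average configurations (two split orientations, individual and differential
// quantization), 3) keep the one with the smallest approximate error and encode its base colors,
// 4) pick the per-pixel modifiers with FindBestFit, 5) pack the selectors and fix the byte order.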
static etcpak_force_inline uint64_t ProcessRGB( const uint8_t* src )
2518
{
2519
#ifdef __AVX2__
2520
uint64_t d = CheckSolid_AVX2( src );
2521
if( d != 0 ) return d;
2522
2523
alignas(32) v4i a[8];
2524
2525
__m128i err0 = PrepareAverages_AVX2( a, src );
2526
2527
// Get index of minimum error (err0)
2528
__m128i err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(2, 3, 0, 1));
2529
__m128i errMin0 = _mm_min_epu32(err0, err1);
2530
2531
__m128i errMin1 = _mm_shuffle_epi32(errMin0, _MM_SHUFFLE(1, 0, 3, 2));
2532
__m128i errMin2 = _mm_min_epu32(errMin1, errMin0);
2533
2534
__m128i errMask = _mm_cmpeq_epi32(errMin2, err0);
2535
2536
uint32_t mask = _mm_movemask_epi8(errMask);
2537
2538
uint32_t idx = _bit_scan_forward(mask) >> 2;
2539
2540
d |= EncodeAverages_AVX2( a, idx );
2541
2542
alignas(32) uint32_t terr[2][8] = {};
2543
alignas(32) uint32_t tsel[8];
2544
2545
if ((idx == 0) || (idx == 2))
2546
{
2547
FindBestFit_4x2_AVX2( terr, tsel, a, idx * 2, src );
2548
}
2549
else
2550
{
2551
FindBestFit_2x4_AVX2( terr, tsel, a, idx * 2, src );
2552
}
2553
2554
return EncodeSelectors_AVX2( d, terr, tsel, (idx % 2) == 1 );
2555
#else
2556
uint64_t d = CheckSolid( src );
2557
if( d != 0 ) return d;
2558
2559
v4i a[8];
2560
unsigned int err[4] = {};
2561
PrepareAverages( a, src, err );
2562
size_t idx = GetLeastError( err, 4 );
2563
EncodeAverages( d, a, idx );
2564
2565
#if ( defined __SSE4_1__ || defined __ARM_NEON ) && !defined REFERENCE_IMPLEMENTATION
2566
uint32_t terr[2][8] = {};
2567
#else
2568
uint64_t terr[2][8] = {};
2569
#endif
2570
uint16_t tsel[16][8];
2571
auto id = g_id[idx];
2572
FindBestFit( terr, tsel, a, id, src );
2573
2574
return FixByteOrder( EncodeSelectors( d, terr, tsel, id ) );
2575
#endif
2576
}
2577
2578
#ifdef __AVX2__
2579
// horizontal min/max functions. https://stackoverflow.com/questions/22256525/horizontal-minimum-and-maximum-using-sse
2580
// if GCC reports an error here, change the -march value in CFLAGS to one specific to your CPU (e.g., skylake).
2581
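// The trick in hMax: _mm_minpos_epu16 can only locate a minimum, so the bytes are first inverted
// (255 - x); min_epu8 with the same lanes shifted right by 8 leaves min(low byte, high byte) in the
// low byte of every 16-bit lane and zero in the high byte, so minpos then yields 255 - max over all
// 16 bytes. The position of the maximum is recovered separately with a byte compare and tzcnt.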
static inline int16_t hMax( __m128i buffer, uint8_t& idx )
2582
{
2583
__m128i tmp1 = _mm_sub_epi8( _mm_set1_epi8( (char)( 255 ) ), buffer );
2584
__m128i tmp2 = _mm_min_epu8( tmp1, _mm_srli_epi16( tmp1, 8 ) );
2585
__m128i tmp3 = _mm_minpos_epu16( tmp2 );
2586
uint8_t result = 255 - (uint8_t)_mm_cvtsi128_si32( tmp3 );
2587
__m128i mask = _mm_cmpeq_epi8( buffer, _mm_set1_epi8( result ) );
2588
idx = _tzcnt_u32( _mm_movemask_epi8( mask ) );
2589
2590
return result;
2591
}
2592
#elif defined __ARM_NEON && defined __aarch64__
2593
static inline int16_t hMax( uint8x16_t buffer, uint8_t& idx )
2594
{
2595
const uint8_t max = vmaxvq_u8( buffer );
2596
const uint16x8_t vmax = vdupq_n_u16( max );
2597
uint8x16x2_t buff_wide = vzipq_u8( buffer, uint8x16_t() );
2598
uint16x8_t lowbuf16 = vreinterpretq_u16_u8( buff_wide.val[0] );
2599
uint16x8_t hibuf16 = vreinterpretq_u16_u8( buff_wide.val[1] );
2600
uint16x8_t low_eqmask = vceqq_u16( lowbuf16, vmax );
2601
uint16x8_t hi_eqmask = vceqq_u16( hibuf16, vmax );
2602
2603
static const uint16_t mask_lsb[] = {
2604
0x1, 0x2, 0x4, 0x8,
2605
0x10, 0x20, 0x40, 0x80 };
2606
2607
static const uint16_t mask_msb[] = {
2608
0x100, 0x200, 0x400, 0x800,
2609
0x1000, 0x2000, 0x4000, 0x8000 };
2610
2611
uint16x8_t vmask_lsb = vld1q_u16( mask_lsb );
2612
uint16x8_t vmask_msb = vld1q_u16( mask_msb );
2613
uint16x8_t pos_lsb = vandq_u16( vmask_lsb, low_eqmask );
2614
uint16x8_t pos_msb = vandq_u16( vmask_msb, hi_eqmask );
2615
pos_lsb = vpaddq_u16( pos_lsb, pos_lsb );
2616
pos_lsb = vpaddq_u16( pos_lsb, pos_lsb );
2617
pos_lsb = vpaddq_u16( pos_lsb, pos_lsb );
2618
uint64_t idx_lane1 = vgetq_lane_u64( vreinterpretq_u64_u16( pos_lsb ), 0 );
2619
pos_msb = vpaddq_u16( pos_msb, pos_msb );
2620
pos_msb = vpaddq_u16( pos_msb, pos_msb );
2621
pos_msb = vpaddq_u16( pos_msb, pos_msb );
2622
uint32_t idx_lane2 = vgetq_lane_u32( vreinterpretq_u32_u16( pos_msb ), 0 );
2623
idx = idx_lane1 != 0 ? __builtin_ctz( idx_lane1 ) : __builtin_ctz( idx_lane2 );
2624
2625
return max;
2626
}
2627
#endif
2628
2629
#ifdef __AVX2__
2630
static inline int16_t hMin( __m128i buffer, uint8_t& idx )
2631
{
2632
__m128i tmp2 = _mm_min_epu8( buffer, _mm_srli_epi16( buffer, 8 ) );
2633
__m128i tmp3 = _mm_minpos_epu16( tmp2 );
2634
uint8_t result = (uint8_t)_mm_cvtsi128_si32( tmp3 );
2635
__m128i mask = _mm_cmpeq_epi8( buffer, _mm_set1_epi8( result ) );
2636
idx = _tzcnt_u32( _mm_movemask_epi8( mask ) );
2637
return result;
2638
}
2639
#elif defined __ARM_NEON && defined __aarch64__
2640
static inline int16_t hMin( uint8x16_t buffer, uint8_t& idx )
2641
{
2642
const uint8_t min = vminvq_u8( buffer );
2643
const uint16x8_t vmin = vdupq_n_u16( min );
2644
uint8x16x2_t buff_wide = vzipq_u8( buffer, uint8x16_t() );
2645
uint16x8_t lowbuf16 = vreinterpretq_u16_u8( buff_wide.val[0] );
2646
uint16x8_t hibuf16 = vreinterpretq_u16_u8( buff_wide.val[1] );
2647
uint16x8_t low_eqmask = vceqq_u16( lowbuf16, vmin );
2648
uint16x8_t hi_eqmask = vceqq_u16( hibuf16, vmin );
2649
2650
static const uint16_t mask_lsb[] = {
2651
0x1, 0x2, 0x4, 0x8,
2652
0x10, 0x20, 0x40, 0x80 };
2653
2654
static const uint16_t mask_msb[] = {
2655
0x100, 0x200, 0x400, 0x800,
2656
0x1000, 0x2000, 0x4000, 0x8000 };
2657
2658
uint16x8_t vmask_lsb = vld1q_u16( mask_lsb );
2659
uint16x8_t vmask_msb = vld1q_u16( mask_msb );
2660
uint16x8_t pos_lsb = vandq_u16( vmask_lsb, low_eqmask );
2661
uint16x8_t pos_msb = vandq_u16( vmask_msb, hi_eqmask );
2662
pos_lsb = vpaddq_u16( pos_lsb, pos_lsb );
2663
pos_lsb = vpaddq_u16( pos_lsb, pos_lsb );
2664
pos_lsb = vpaddq_u16( pos_lsb, pos_lsb );
2665
uint64_t idx_lane1 = vgetq_lane_u64( vreinterpretq_u64_u16( pos_lsb ), 0 );
2666
pos_msb = vpaddq_u16( pos_msb, pos_msb );
2667
pos_msb = vpaddq_u16( pos_msb, pos_msb );
2668
pos_msb = vpaddq_u16( pos_msb, pos_msb );
2669
uint32_t idx_lane2 = vgetq_lane_u32( vreinterpretq_u32_u16( pos_msb ), 0 );
2670
idx = idx_lane1 != 0 ? __builtin_ctz( idx_lane1 ) : __builtin_ctz( idx_lane2 );
2671
2672
return min;
2673
}
2674
#endif
2675
2676
// During search it is not convenient to store the bits the way they are stored in the
2677
// file format. Hence, after search, it is converted to this format.
2678
// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
2679
static inline void stuff59bits( unsigned int thumbT59W1, unsigned int thumbT59W2, unsigned int& thumbTW1, unsigned int& thumbTW2 )
2680
{
2681
// Put bits in twotimer configuration for 59 (red overflows)
2682
//
2683
// Go from this bit layout:
2684
//
2685
// |63 62 61 60 59|58 57 56 55|54 53 52 51|50 49 48 47|46 45 44 43|42 41 40 39|38 37 36 35|34 33 32|
2686
// |----empty-----|---red 0---|--green 0--|--blue 0---|---red 1---|--green 1--|--blue 1---|--dist--|
2687
//
2688
// |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
2689
// |----------------------------------------index bits---------------------------------------------|
2690
//
2691
//
2692
// To this:
2693
//
2694
// 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
2695
// -----------------------------------------------------------------------------------------------
2696
// |// // //|R0a |//|R0b |G0 |B0 |R1 |G1 |B1 |da |df|db|
2697
// -----------------------------------------------------------------------------------------------
2698
//
2699
// |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
2700
// |----------------------------------------index bits---------------------------------------------|
2701
//
2702
// 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
2703
// -----------------------------------------------------------------------------------------------
2704
// | base col1 | dcol 2 | base col1 | dcol 2 | base col 1 | dcol 2 | table | table |df|fp|
2705
// | R1' (5 bits) | dR2 | G1' (5 bits) | dG2 | B1' (5 bits) | dB2 | cw 1 | cw 2 |bt|bt|
2706
// ------------------------------------------------------------------------------------------------
2707
2708
uint8_t R0a;
2709
uint8_t bit, a, b, c, d, bits;
2710
2711
R0a = ( thumbT59W1 >> 25 ) & 0x3;
2712
2713
// Fix middle part
2714
thumbTW1 = thumbT59W1 << 1;
2715
// Fix R0a (top two bits of R0)
2716
thumbTW1 = ( thumbTW1 & ~( 0x3 << 27 ) ) | ( ( R0a & 0x3 ) << 27 );
2717
// Fix db (lowest bit of d)
2718
thumbTW1 = ( thumbTW1 & ~0x1 ) | ( thumbT59W1 & 0x1 );
2719
2720
// Make sure that red overflows:
2721
a = ( thumbTW1 >> 28 ) & 0x1;
2722
b = ( thumbTW1 >> 27 ) & 0x1;
2723
c = ( thumbTW1 >> 25 ) & 0x1;
2724
d = ( thumbTW1 >> 24 ) & 0x1;
2725
2726
// The following abcd bit sequences should be padded with ones: 0111, 1010, 1011, 1101, 1110, 1111
2727
// The following logical expression checks for the presence of any of those:
2728
bit = ( a & c ) | ( !a & b & c & d ) | ( a & b & !c & d );
2729
bits = 0xf * bit;
2730
thumbTW1 = ( thumbTW1 & ~( 0x7 << 29 ) ) | ( bits & 0x7 ) << 29;
2731
thumbTW1 = ( thumbTW1 & ~( 0x1 << 26 ) ) | ( !bit & 0x1 ) << 26;
2732
2733
// Set diffbit
2734
thumbTW1 = ( thumbTW1 & ~0x2 ) | 0x2;
2735
thumbTW2 = thumbT59W2;
2736
}
2737
2738
// During search it is not convenient to store the bits the way they are stored in the
2739
// file format. Hence, after search, it is converted to this format.
2740
// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
2741
static inline void stuff58bits( unsigned int thumbH58W1, unsigned int thumbH58W2, unsigned int& thumbHW1, unsigned int& thumbHW2 )
2742
{
2743
// Put bits in twotimer configuration for 58 (red doesn't overflow, green does)
2744
//
2745
// Go from this bit layout:
2746
//
2747
//
2748
// |63 62 61 60 59 58|57 56 55 54|53 52 51 50|49 48 47 46|45 44 43 42|41 40 39 38|37 36 35 34|33 32|
2749
// |-------empty-----|---red 0---|--green 0--|--blue 0---|---red 1---|--green 1--|--blue 1---|d2 d1|
2750
//
2751
// |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
2752
// |---------------------------------------index bits----------------------------------------------|
2753
//
2754
// To this:
2755
//
2756
// 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
2757
// -----------------------------------------------------------------------------------------------
2758
// |//|R0 |G0 |// // //|G0|B0|//|B0b |R1 |G1 |B0 |d2|df|d1|
2759
// -----------------------------------------------------------------------------------------------
2760
//
2761
// |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
2762
// |---------------------------------------index bits----------------------------------------------|
2763
//
2764
// 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
2765
// -----------------------------------------------------------------------------------------------
2766
// | base col1 | dcol 2 | base col1 | dcol 2 | base col 1 | dcol 2 | table | table |df|fp|
2767
// | R1' (5 bits) | dR2 | G1' (5 bits) | dG2 | B1' (5 bits) | dB2 | cw 1 | cw 2 |bt|bt|
2768
// -----------------------------------------------------------------------------------------------
2769
//
2770
//
2771
// Thus, what we are really doing is going from this bit layout:
2772
//
2773
//
2774
// |63 62 61 60 59 58|57 56 55 54 53 52 51|50 49|48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33|32 |
2775
// |-------empty-----|part0---------------|part1|part2------------------------------------------|part3|
2776
//
2777
// To this:
2778
//
2779
// 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
2780
// --------------------------------------------------------------------------------------------------|
2781
// |//|part0 |// // //|part1|//|part2 |df|part3|
2782
// --------------------------------------------------------------------------------------------------|
2783
2784
    unsigned int part0, part1, part2, part3;
    uint8_t bit, a, b, c, d, bits;

    // move parts
    part0 = ( thumbH58W1 >> 19 ) & 0x7f;
    part1 = ( thumbH58W1 >> 17 ) & 0x3;
    part2 = ( thumbH58W1 >> 1 ) & 0xffff;
    part3 = thumbH58W1 & 0x1;
    thumbHW1 = 0;
    thumbHW1 = ( thumbHW1 & ~( 0x7f << 24 ) ) | ( ( part0 & 0x7f ) << 24 );
    thumbHW1 = ( thumbHW1 & ~( 0x3 << 19 ) ) | ( ( part1 & 0x3 ) << 19 );
    thumbHW1 = ( thumbHW1 & ~( 0xffff << 2 ) ) | ( ( part2 & 0xffff ) << 2 );
    thumbHW1 = ( thumbHW1 & ~0x1 ) | ( part3 & 0x1 );

    // Make sure that red does not overflow:
    bit = ( thumbHW1 >> 30 ) & 0x1;
    thumbHW1 = ( thumbHW1 & ~( 0x1 << 31 ) ) | ( ( !bit & 0x1 ) << 31 );

    // Make sure that green overflows:
    a = ( thumbHW1 >> 20 ) & 0x1;
    b = ( thumbHW1 >> 19 ) & 0x1;
    c = ( thumbHW1 >> 17 ) & 0x1;
    d = ( thumbHW1 >> 16 ) & 0x1;
    // The following abcd bit sequences should be padded with ones: 0111, 1010, 1011, 1101, 1110, 1111
    // The following logical expression checks for the presence of any of those:
    bit = ( a & c ) | ( !a & b & c & d ) | ( a & b & !c & d );
    bits = 0xf * bit;
    thumbHW1 = ( thumbHW1 & ~( 0x7 << 21 ) ) | ( ( bits & 0x7 ) << 21 );
    thumbHW1 = ( thumbHW1 & ~( 0x1 << 18 ) ) | ( ( !bit & 0x1 ) << 18 );

    // Set diffbit
    thumbHW1 = ( thumbHW1 & ~0x2 ) | 0x2;
    thumbHW2 = thumbH58W2;
}
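// The repacking above (and in stuff59bits) is built entirely from the same
// mask-and-insert idiom. A minimal, compiled-out sketch of that idiom, kept
// here only as an editor's illustration (insertBits is not an etcpak function):
#if 0
// Clear `width` bits of `word` starting at `pos` and insert `value` there,
// mirroring statements such as
//   thumbHW1 = ( thumbHW1 & ~( 0x7f << 24 ) ) | ( ( part0 & 0x7f ) << 24 );
static inline unsigned int insertBits( unsigned int word, unsigned int value, unsigned int pos, unsigned int width )
{
    const unsigned int mask = ( 1u << width ) - 1u;
    return ( word & ~( mask << pos ) ) | ( ( value & mask ) << pos );
}
// usage: thumbHW1 = insertBits( thumbHW1, part0, 24, 7 );
#endif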
2819
#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__)
2820
static etcpak_force_inline Channels GetChannels( const uint8_t* src )
2821
{
2822
Channels ch;
2823
#ifdef __AVX2__
2824
__m128i d0 = _mm_loadu_si128( ( (__m128i*)src ) + 0 );
2825
__m128i d1 = _mm_loadu_si128( ( (__m128i*)src ) + 1 );
2826
__m128i d2 = _mm_loadu_si128( ( (__m128i*)src ) + 2 );
2827
__m128i d3 = _mm_loadu_si128( ( (__m128i*)src ) + 3 );
__m128i rgb0 = _mm_shuffle_epi8( d0, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) );
2830
__m128i rgb1 = _mm_shuffle_epi8( d1, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) );
2831
__m128i rgb2 = _mm_shuffle_epi8( d2, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) );
2832
__m128i rgb3 = _mm_shuffle_epi8( d3, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) );
2833
2834
__m128i rg0 = _mm_unpacklo_epi32( rgb0, rgb1 );
2835
__m128i rg1 = _mm_unpacklo_epi32( rgb2, rgb3 );
2836
__m128i b0 = _mm_unpackhi_epi32( rgb0, rgb1 );
2837
__m128i b1 = _mm_unpackhi_epi32( rgb2, rgb3 );
2838
2839
// swap channels
2840
ch.b8 = _mm_unpacklo_epi64( rg0, rg1 );
2841
ch.g8 = _mm_unpackhi_epi64( rg0, rg1 );
2842
ch.r8 = _mm_unpacklo_epi64( b0, b1 );
2843
#elif defined __ARM_NEON && defined __aarch64__
2844
//load pixel data into 4 rows
2845
uint8x16_t px0 = vld1q_u8( src + 0 );
2846
uint8x16_t px1 = vld1q_u8( src + 16 );
2847
uint8x16_t px2 = vld1q_u8( src + 32 );
2848
uint8x16_t px3 = vld1q_u8( src + 48 );
2849
2850
uint8x16x2_t px0z1 = vzipq_u8( px0, px1 );
2851
uint8x16x2_t px2z3 = vzipq_u8( px2, px3 );
2852
uint8x16x2_t px01 = vzipq_u8( px0z1.val[0], px0z1.val[1] );
2853
uint8x16x2_t rgb01 = vzipq_u8( px01.val[0], px01.val[1] );
2854
uint8x16x2_t px23 = vzipq_u8( px2z3.val[0], px2z3.val[1] );
2855
uint8x16x2_t rgb23 = vzipq_u8( px23.val[0], px23.val[1] );
2856
2857
uint8x16_t rr = vreinterpretq_u8_u64( vzip1q_u64( vreinterpretq_u64_u8( rgb01.val[0] ), vreinterpretq_u64_u8( rgb23.val[0] ) ) );
2858
uint8x16_t gg = vreinterpretq_u8_u64( vzip2q_u64( vreinterpretq_u64_u8( rgb01.val[0] ), vreinterpretq_u64_u8( rgb23.val[0] ) ) );
2859
uint8x16_t bb = vreinterpretq_u8_u64( vzip1q_u64( vreinterpretq_u64_u8( rgb01.val[1] ), vreinterpretq_u64_u8( rgb23.val[1] ) ) );
2860
2861
uint8x16x2_t red = vzipq_u8( rr, uint8x16_t() );
2862
uint8x16x2_t grn = vzipq_u8( gg, uint8x16_t() );
2863
uint8x16x2_t blu = vzipq_u8( bb, uint8x16_t() );
2864
ch.r = red;
2865
ch.b = blu;
2866
ch.g = grn;
2867
#endif
2868
return ch;
2869
}
2870
#endif
2871
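// GetChannels() deinterleaves a 4x4 block of 4-byte pixels into per-channel
// vectors with SIMD shuffles. A compiled-out scalar equivalent, assuming the
// R/G/B byte offsets defined at the top of this file (GetChannelsScalar is an
// illustration only, not part of etcpak):
#if 0
static void GetChannelsScalar( const uint8_t* src, uint8_t r[16], uint8_t g[16], uint8_t b[16] )
{
    for( int i = 0; i < 16; i++ )
    {
        r[i] = src[i * 4 + R];
        g[i] = src[i * 4 + G];
        b[i] = src[i * 4 + B];
    }
}
#endif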
2872
#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__)
2873
static etcpak_force_inline void CalculateLuma( Channels& ch, Luma& luma )
2874
#else
2875
static etcpak_force_inline void CalculateLuma( const uint8_t* src, Luma& luma )
2876
#endif
2877
{
2878
#ifdef __AVX2__
2879
__m256i b16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.b8 ), _mm256_set1_epi16( 14 ) );
2880
__m256i g16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.g8 ), _mm256_set1_epi16( 76 ) );
2881
__m256i r16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.r8 ), _mm256_set1_epi16( 38 ) );
2882
2883
__m256i luma_16bit = _mm256_add_epi16( _mm256_add_epi16( g16_luma, r16_luma ), b16_luma );
2884
__m256i luma_8bit_m256i = _mm256_srli_epi16( luma_16bit, 7 );
2885
__m128i luma_8bit_lo = _mm256_extractf128_si256( luma_8bit_m256i, 0 );
2886
__m128i luma_8bit_hi = _mm256_extractf128_si256( luma_8bit_m256i, 1 );
2887
2888
static const __m128i interleaving_mask_lo = _mm_set_epi8( 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0 );
2889
static const __m128i interleaving_mask_hi = _mm_set_epi8( 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1 );
2890
__m128i luma_8bit_lo_moved = _mm_shuffle_epi8( luma_8bit_lo, interleaving_mask_lo );
2891
__m128i luma_8bit_hi_moved = _mm_shuffle_epi8( luma_8bit_hi, interleaving_mask_hi );
2892
__m128i luma_8bit = _mm_or_si128( luma_8bit_hi_moved, luma_8bit_lo_moved );
2893
luma.luma8 = luma_8bit;
2894
2895
// min/max calculation
2896
luma.min = hMin( luma_8bit, luma.minIdx ) * 0.00392156f;
2897
luma.max = hMax( luma_8bit, luma.maxIdx ) * 0.00392156f;
2898
#elif defined __ARM_NEON && defined __aarch64__
// weight each channel with its fixed-point luma coefficient (the weights sum to 128; see the >>7 narrowing below)
uint16x8_t red0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.r.val[0] ), 14 );
2901
uint16x8_t red1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.r.val[1] ), 14 );
2902
uint16x8_t grn0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.g.val[0] ), 76 );
2903
uint16x8_t grn1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.g.val[1] ), 76 );
2904
uint16x8_t blu0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.b.val[0] ), 38 );
2905
uint16x8_t blu1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.b.val[1] ), 38 );
2906
2907
//calculate luma for rows 0,1 and 2,3
2908
uint16x8_t lum_r01 = vaddq_u16( vaddq_u16( red0, grn0 ), blu0 );
2909
uint16x8_t lum_r23 = vaddq_u16( vaddq_u16( red1, grn1 ), blu1 );
2910
2911
//divide luma values with right shift and narrow results to 8bit
2912
uint8x8_t lum_r01_d = vshrn_n_u16( lum_r01, 7 );
2913
uint8x8_t lum_r02_d = vshrn_n_u16( lum_r23, 7 );
2914
2915
luma.luma8 = vcombine_u8( lum_r01_d, lum_r02_d );
2916
//find min and max luma value
2917
luma.min = hMin( luma.luma8, luma.minIdx ) * 0.00392156f;
2918
luma.max = hMax( luma.luma8, luma.maxIdx ) * 0.00392156f;
2919
#else
2920
for( int i = 0; i < 16; ++i )
2921
{
luma.val[i] = ( src[i * 4 + 2] * 76 + src[i * 4 + 1] * 150 + src[i * 4] * 28 ) / 254; // fixed-point luma, roughly 0.30*R + 0.59*G + 0.11*B
if( luma.min > luma.val[i] )
2924
{
2925
luma.min = luma.val[i];
2926
luma.minIdx = i;
2927
}
2928
if( luma.max < luma.val[i] )
2929
{
2930
luma.max = luma.val[i];
2931
luma.maxIdx = i;
2932
}
2933
}
2934
#endif
2935
}
2936
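// Both SIMD paths above compute luma as a /128 fixed-point blend,
// (38*R + 76*G + 14*B) >> 7, which is also where the MaxError constant at the
// top of the file comes from; the scalar fallback uses the /254 variant above.
// A compiled-out scalar sketch of the SIMD weighting (illustration only):
#if 0
static inline uint8_t LumaFixed128( uint8_t r, uint8_t g, uint8_t b )
{
    // 38 + 76 + 14 == 128, so the shifted result always fits in 8 bits.
    return (uint8_t)( ( 38 * r + 76 * g + 14 * b ) >> 7 );
}
#endif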
2937
static etcpak_force_inline uint8_t SelectModeETC2( const Luma& luma )
2938
{
2939
#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__) // luma.min/max are pre-normalized to [0..1] only on these paths (see CalculateLuma)
2940
const float lumaRange = ( luma.max - luma.min );
2941
#else
2942
const float lumaRange = ( luma.max - luma.min ) * ( 1.f / 255.f );
2943
#endif
2944
// filters a very-low-contrast block
2945
if( lumaRange <= ecmd_threshold[0] )
2946
{
2947
return ModePlanar;
2948
}
2949
// checks whether a pair of the corner pixels in a block has the min/max luma values;
2950
// if so, the ETC2 planar mode is enabled, and otherwise, the ETC1 mode is enabled
2951
else if( lumaRange <= ecmd_threshold[1] )
2952
{
2953
#ifdef __AVX2__
2954
static const __m128i corner_pair = _mm_set_epi8( 1, 1, 1, 1, 1, 1, 1, 1, 0, 15, 3, 12, 12, 3, 15, 0 );
2955
__m128i current_max_min = _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, luma.minIdx, luma.maxIdx, luma.minIdx, luma.maxIdx, luma.minIdx, luma.maxIdx, luma.minIdx, luma.maxIdx );
2956
2957
__m128i max_min_result = _mm_cmpeq_epi16( corner_pair, current_max_min );
2958
2959
int mask = _mm_movemask_epi8( max_min_result );
2960
if( mask )
2961
{
2962
return ModePlanar;
2963
}
2964
#else
2965
// check whether a pair of the corner pixels in a block has the min/max luma values;
2966
// if so, the ETC2 planar mode is enabled.
2967
if( ( luma.minIdx == 0 && luma.maxIdx == 15 ) ||
2968
( luma.minIdx == 15 && luma.maxIdx == 0 ) ||
2969
( luma.minIdx == 3 && luma.maxIdx == 12 ) ||
2970
( luma.minIdx == 12 && luma.maxIdx == 3 ) )
2971
{
2972
return ModePlanar;
2973
}
2974
#endif
2975
}
2976
// filters a high-contrast block for checking both ETC1 mode and the ETC2 T/H mode
2977
else if( lumaRange >= ecmd_threshold[2] )
2978
{
2979
return ModeTH;
2980
}
2981
return ModeUndecided;
2982
}
2983
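// The early mode decision above boils down to: very low contrast -> planar,
// low contrast with min/max luma on opposite block corners -> planar,
// high contrast -> T/H, otherwise undecided. A compiled-out restatement of that
// decision, assuming a luma range already normalized to [0..1]
// (SelectModeSketch is an illustration only, not part of etcpak):
#if 0
static uint8_t SelectModeSketch( float lumaRange, int minIdx, int maxIdx )
{
    if( lumaRange <= ecmd_threshold[0] ) return ModePlanar;
    if( lumaRange <= ecmd_threshold[1] )
    {
        const bool oppositeCorners =
            ( minIdx == 0 && maxIdx == 15 ) || ( minIdx == 15 && maxIdx == 0 ) ||
            ( minIdx == 3 && maxIdx == 12 ) || ( minIdx == 12 && maxIdx == 3 );
        return oppositeCorners ? ModePlanar : ModeUndecided;
    }
    if( lumaRange >= ecmd_threshold[2] ) return ModeTH;
    return ModeUndecided;
}
#endif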
2984
static etcpak_force_inline uint64_t ProcessRGB_ETC2( const uint8_t* src, bool useHeuristics )
2985
{
2986
#ifdef __AVX2__
2987
uint64_t d = CheckSolid_AVX2( src );
2988
if( d != 0 ) return d;
2989
#else
2990
uint64_t d = CheckSolid( src );
2991
if (d != 0) return d;
2992
#endif
uint8_t mode = ModeUndecided;
2995
Luma luma;
2996
#ifdef __AVX2__
2997
Channels ch = GetChannels( src );
2998
if( useHeuristics )
2999
{
3000
CalculateLuma( ch, luma );
3001
mode = SelectModeETC2( luma );
3002
}
3003
3004
auto plane = Planar_AVX2( ch, mode, useHeuristics );
3005
if( useHeuristics && mode == ModePlanar ) return plane.plane;
3006
3007
alignas( 32 ) v4i a[8];
3008
__m128i err0 = PrepareAverages_AVX2( a, plane.sum4 );
3009
3010
// Get index of minimum error (err0)
3011
__m128i err1 = _mm_shuffle_epi32( err0, _MM_SHUFFLE( 2, 3, 0, 1 ) );
3012
__m128i errMin0 = _mm_min_epu32(err0, err1);
3013
3014
__m128i errMin1 = _mm_shuffle_epi32( errMin0, _MM_SHUFFLE( 1, 0, 3, 2 ) );
3015
__m128i errMin2 = _mm_min_epu32( errMin1, errMin0 );
3016
3017
__m128i errMask = _mm_cmpeq_epi32( errMin2, err0 );
3018
3019
uint32_t mask = _mm_movemask_epi8( errMask );
3020
3021
size_t idx = _bit_scan_forward( mask ) >> 2;
3022
3023
d = EncodeAverages_AVX2( a, idx );
3024
3025
alignas(32) uint32_t terr[2][8] = {};
3026
alignas(32) uint32_t tsel[8];
3027
3028
if ((idx == 0) || (idx == 2))
3029
{
3030
FindBestFit_4x2_AVX2( terr, tsel, a, idx * 2, src );
3031
}
3032
else
3033
{
3034
FindBestFit_2x4_AVX2( terr, tsel, a, idx * 2, src );
3035
}
3036
3037
if( useHeuristics )
3038
{
3039
if( mode == ModeTH )
3040
{
3041
uint64_t result = 0;
3042
uint64_t error = 0;
3043
uint32_t compressed[4] = { 0, 0, 0, 0 };
3044
bool tMode = false;
3045
3046
error = compressBlockTH( (uint8_t*)src, luma, compressed[0], compressed[1], tMode, ch.r8, ch.g8, ch.b8 );
3047
if( tMode )
3048
{
3049
stuff59bits( compressed[0], compressed[1], compressed[2], compressed[3] );
3050
}
3051
else
3052
{
3053
stuff58bits( compressed[0], compressed[1], compressed[2], compressed[3] );
3054
}
3055
3056
result = (uint32_t)_bswap( compressed[2] );
3057
result |= static_cast<uint64_t>( _bswap( compressed[3] ) ) << 32;
3058
3059
plane.plane = result;
3060
plane.error = error;
3061
}
3062
else
3063
{
3064
plane.plane = 0;
3065
plane.error = MaxError;
3066
}
3067
}
3068
3069
return EncodeSelectors_AVX2( d, terr, tsel, ( idx % 2 ) == 1, plane.plane, plane.error );
3070
#else
3071
if( useHeuristics )
3072
{
3073
#if defined __ARM_NEON && defined __aarch64__
3074
Channels ch = GetChannels( src );
3075
CalculateLuma( ch, luma );
3076
#else
3077
CalculateLuma( src, luma );
3078
#endif
3079
mode = SelectModeETC2( luma );
3080
}
3081
#ifdef __ARM_NEON
3082
auto result = Planar_NEON( src, mode, useHeuristics );
3083
#else
3084
auto result = Planar( src, mode, useHeuristics );
3085
#endif
3086
if( result.second == 0 ) return result.first;
3087
3088
v4i a[8];
3089
unsigned int err[4] = {};
3090
PrepareAverages( a, src, err );
3091
size_t idx = GetLeastError( err, 4 );
3092
EncodeAverages( d, a, idx );
3093
3094
#if ( defined __SSE4_1__ || defined __ARM_NEON ) && !defined REFERENCE_IMPLEMENTATION
3095
uint32_t terr[2][8] = {};
3096
#else
3097
uint64_t terr[2][8] = {};
3098
#endif
3099
uint16_t tsel[16][8];
3100
auto id = g_id[idx];
3101
FindBestFit( terr, tsel, a, id, src );
3102
3103
if( useHeuristics )
3104
{
3105
if( mode == ModeTH )
3106
{
3107
uint32_t compressed[4] = { 0, 0, 0, 0 };
3108
bool tMode = false;
3109
3110
result.second = compressBlockTH( (uint8_t*)src, luma, compressed[0], compressed[1], tMode );
3111
if( tMode )
3112
{
3113
stuff59bits( compressed[0], compressed[1], compressed[2], compressed[3] );
3114
}
3115
else
3116
{
3117
stuff58bits( compressed[0], compressed[1], compressed[2], compressed[3] );
3118
}
3119
3120
result.first = (uint32_t)_bswap( compressed[2] );
3121
result.first |= static_cast<uint64_t>( _bswap( compressed[3] ) ) << 32;
3122
}
3123
else
3124
{
3125
result.first = 0;
3126
result.second = MaxError;
3127
}
3128
}
3129
3130
return EncodeSelectors( d, terr, tsel, id, result.first, result.second );
3131
#endif
3132
}
3133
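// In both branches above, a successful T/H encoding ends with the two words
// from stuff58bits/stuff59bits being byte-swapped into the final 64-bit block.
// A compiled-out restatement of that packing (illustration only):
#if 0
static inline uint64_t PackThWords( uint32_t w1, uint32_t w2 )
{
    uint64_t block = (uint32_t)_bswap( w1 );
    block |= (uint64_t)_bswap( w2 ) << 32;
    return block;
}
// equivalent to: result = _bswap( compressed[2] ); result |= uint64_t( _bswap( compressed[3] ) ) << 32;
#endif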
3134
#ifdef __SSE4_1__
3135
template<int K>
3136
static etcpak_force_inline __m128i Widen( const __m128i src )
3137
{
3138
static_assert( K >= 0 && K <= 7, "Index out of range" );
3139
3140
__m128i tmp;
3141
switch( K )
3142
{
3143
case 0:
3144
tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 0, 0, 0, 0 ) );
3145
return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) );
3146
case 1:
3147
tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 1, 1, 1, 1 ) );
3148
return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) );
3149
case 2:
3150
tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 2, 2, 2, 2 ) );
3151
return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) );
3152
case 3:
3153
tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 3, 3, 3, 3 ) );
3154
return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) );
3155
case 4:
3156
tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 0, 0, 0, 0 ) );
3157
return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) );
3158
case 5:
3159
tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 1, 1, 1, 1 ) );
3160
return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) );
3161
case 6:
3162
tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 2, 2, 2, 2 ) );
3163
return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) );
3164
case 7:
3165
tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 3, 3, 3, 3 ) );
3166
return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) );
3167
}
3168
}
3169
3170
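// Widen<K> broadcasts the K-th 16-bit lane of an SSE register to all eight
// lanes (shufflelo/shufflehi pick the lane, shuffle_epi32 spreads it). A
// compiled-out scalar picture of the same operation (illustration only):
#if 0
static void WidenScalar( const uint16_t in[8], int k, uint16_t out[8] )
{
    for( int i = 0; i < 8; i++ ) out[i] = in[k];
}
#endif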
static etcpak_force_inline int GetMulSel( int sel )
3171
{
3172
switch( sel )
3173
{
3174
case 0:
3175
return 0;
3176
case 1:
3177
case 2:
3178
case 3:
3179
return 1;
3180
case 4:
3181
return 2;
3182
case 5:
3183
case 6:
3184
case 7:
3185
return 3;
3186
case 8:
3187
case 9:
3188
case 10:
3189
case 11:
3190
case 12:
3191
case 13:
3192
return 4;
3193
case 14:
3194
case 15:
3195
return 5;
3196
}
3197
}
3198
3199
#endif
3200
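// Both GetMulSel() implementations (the switch above and the constexpr NEON
// version below) encode the same selector-to-multiplier-lane mapping. The
// mapping, written out as data purely for reference (compiled out, not used
// anywhere in etcpak):
#if 0
// sel:   0 | 1 2 3 | 4 | 5 6 7 | 8 9 10 11 12 13 | 14 15
// lane:  0 |   1   | 2 |   3   |        4        |   5
static const int kMulSelTable[16] = { 0, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5 };
#endif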
3201
#ifdef __ARM_NEON
3202
3203
static constexpr etcpak_force_inline int GetMulSel(int sel)
3204
{
3205
return ( sel < 1 ) ? 0 : ( sel < 4 ) ? 1 : ( sel < 5 ) ? 2 : ( sel < 8 ) ? 3 : ( sel < 14 ) ? 4 : 5;
3206
}
3207
3208
static constexpr int ClampConstant( int x, int min, int max )
3209
{
3210
return x < min ? min : x > max ? max : x;
3211
}
3212
3213
template <int Index>
3214
etcpak_force_inline static uint16x8_t ErrorProbe_EAC_NEON( uint8x8_t recVal, uint8x16_t alphaBlock )
3215
{
3216
uint8x8_t srcValWide;
3217
#ifndef __aarch64__
3218
if( Index < 8 )
3219
srcValWide = vdup_lane_u8( vget_low_u8( alphaBlock ), ClampConstant( Index, 0, 7 ) );
3220
else
3221
srcValWide = vdup_lane_u8( vget_high_u8( alphaBlock ), ClampConstant( Index - 8, 0, 7 ) );
3222
#else
3223
srcValWide = vdup_laneq_u8( alphaBlock, Index );
3224
#endif
3225
3226
uint8x8_t deltaVal = vabd_u8( srcValWide, recVal );
3227
return vmull_u8( deltaVal, deltaVal );
3228
}
3229
3230
etcpak_force_inline static uint16_t MinError_EAC_NEON( uint16x8_t errProbe )
3231
{
3232
#ifndef __aarch64__
3233
uint16x4_t tmpErr = vpmin_u16( vget_low_u16( errProbe ), vget_high_u16( errProbe ) );
3234
tmpErr = vpmin_u16( tmpErr, tmpErr );
3235
return vpmin_u16( tmpErr, tmpErr )[0];
3236
#else
3237
return vminvq_u16( errProbe );
3238
#endif
3239
}
3240
3241
template <int Index>
3242
etcpak_force_inline static uint64_t MinErrorIndex_EAC_NEON( uint8x8_t recVal, uint8x16_t alphaBlock )
3243
{
3244
uint16x8_t errProbe = ErrorProbe_EAC_NEON<Index>( recVal, alphaBlock );
3245
uint16x8_t minErrMask = vceqq_u16( errProbe, vdupq_n_u16( MinError_EAC_NEON( errProbe ) ) );
3246
uint64_t idx = __builtin_ctzll( vget_lane_u64( vreinterpret_u64_u8( vqmovn_u16( minErrMask ) ), 0 ) );
3247
idx >>= 3;
3248
idx <<= 45 - Index * 3;
3249
3250
return idx;
3251
}
3252
3253
template <int Index>
3254
etcpak_force_inline static int16x8_t WidenMultiplier_EAC_NEON( int16x8_t multipliers )
3255
{
3256
constexpr int Lane = GetMulSel( Index );
3257
#ifndef __aarch64__
3258
if( Lane < 4 )
3259
return vdupq_lane_s16( vget_low_s16( multipliers ), ClampConstant( Lane, 0, 3 ) );
3260
else
3261
return vdupq_lane_s16( vget_high_s16( multipliers ), ClampConstant( Lane - 4, 0, 3 ) );
3262
#else
3263
return vdupq_laneq_s16( multipliers, Lane );
3264
#endif
3265
}
3266
3267
#endif
3268
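// MinErrorIndex_EAC_NEON() shifts each pixel's 3-bit selector to
// bit 45 - 3*pixel, the same layout the scalar path builds further down with
// its `offset = 45; offset -= 3` loop. A compiled-out scalar restatement of
// the index packing (PackEacIndices is an illustration only):
#if 0
static uint64_t PackEacIndices( const uint8_t idx[16] )
{
    uint64_t d = 0;
    for( int i = 0; i < 16; i++ )
    {
        d |= uint64_t( idx[i] & 0x7 ) << ( 45 - i * 3 );
    }
    return d;
}
#endif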
3269
template<bool checkSolid = true>
3270
static etcpak_force_inline uint64_t ProcessAlpha_ETC2( const uint8_t* src )
3271
{
3272
#if defined __SSE4_1__
3273
__m128i s = _mm_loadu_si128( (__m128i*)src );
3274
3275
if( checkSolid )
3276
{
3277
// Check solid
3278
__m128i solidCmp = _mm_set1_epi8( src[0] );
3279
__m128i cmpRes = _mm_cmpeq_epi8( s, solidCmp );
3280
if( _mm_testc_si128( cmpRes, _mm_set1_epi32( -1 ) ) )
3281
{
3282
return src[0];
3283
}
3284
}
3285
3286
// Calculate min, max
3287
__m128i s1 = _mm_shuffle_epi32( s, _MM_SHUFFLE( 2, 3, 0, 1 ) );
3288
__m128i max1 = _mm_max_epu8( s, s1 );
3289
__m128i min1 = _mm_min_epu8( s, s1 );
3290
__m128i smax2 = _mm_shuffle_epi32( max1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
3291
__m128i smin2 = _mm_shuffle_epi32( min1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
3292
__m128i max2 = _mm_max_epu8( max1, smax2 );
3293
__m128i min2 = _mm_min_epu8( min1, smin2 );
3294
__m128i smax3 = _mm_alignr_epi8( max2, max2, 2 );
3295
__m128i smin3 = _mm_alignr_epi8( min2, min2, 2 );
3296
__m128i max3 = _mm_max_epu8( max2, smax3 );
3297
__m128i min3 = _mm_min_epu8( min2, smin3 );
3298
__m128i smax4 = _mm_alignr_epi8( max3, max3, 1 );
3299
__m128i smin4 = _mm_alignr_epi8( min3, min3, 1 );
3300
__m128i max = _mm_max_epu8( max3, smax4 );
3301
__m128i min = _mm_min_epu8( min3, smin4 );
3302
__m128i max16 = _mm_unpacklo_epi8( max, _mm_setzero_si128() );
3303
__m128i min16 = _mm_unpacklo_epi8( min, _mm_setzero_si128() );
3304
3305
// src range, mid
3306
__m128i srcRange = _mm_sub_epi16( max16, min16 );
3307
__m128i srcRangeHalf = _mm_srli_epi16( srcRange, 1 );
3308
__m128i srcMid = _mm_add_epi16( min16, srcRangeHalf );
3309
3310
// multiplier
3311
__m128i mul1 = _mm_mulhi_epi16( srcRange, g_alphaRange_SIMD );
3312
__m128i mul = _mm_add_epi16( mul1, _mm_set1_epi16( 1 ) );
3313
3314
// wide source
3315
__m128i s16_1 = _mm_shuffle_epi32( s, _MM_SHUFFLE( 3, 2, 3, 2 ) );
3316
__m128i s16[2] = { _mm_unpacklo_epi8( s, _mm_setzero_si128() ), _mm_unpacklo_epi8( s16_1, _mm_setzero_si128() ) };
3317
3318
__m128i sr[16] = {
3319
Widen<0>( s16[0] ),
3320
Widen<1>( s16[0] ),
3321
Widen<2>( s16[0] ),
3322
Widen<3>( s16[0] ),
3323
Widen<4>( s16[0] ),
3324
Widen<5>( s16[0] ),
3325
Widen<6>( s16[0] ),
3326
Widen<7>( s16[0] ),
3327
Widen<0>( s16[1] ),
3328
Widen<1>( s16[1] ),
3329
Widen<2>( s16[1] ),
3330
Widen<3>( s16[1] ),
3331
Widen<4>( s16[1] ),
3332
Widen<5>( s16[1] ),
3333
Widen<6>( s16[1] ),
3334
Widen<7>( s16[1] )
3335
};
#ifdef __AVX2__
3338
__m256i srcRangeWide = _mm256_broadcastsi128_si256( srcRange );
3339
__m256i srcMidWide = _mm256_broadcastsi128_si256( srcMid );
3340
3341
__m256i mulWide1 = _mm256_mulhi_epi16( srcRangeWide, g_alphaRange_AVX );
3342
__m256i mulWide = _mm256_add_epi16( mulWide1, _mm256_set1_epi16( 1 ) );
3343
3344
__m256i modMul[8] = {
3345
_mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[0] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[0] ) ) ), _mm256_setzero_si256() ),
3346
_mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[1] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[1] ) ) ), _mm256_setzero_si256() ),
3347
_mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[2] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[2] ) ) ), _mm256_setzero_si256() ),
3348
_mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[3] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[3] ) ) ), _mm256_setzero_si256() ),
3349
_mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[4] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[4] ) ) ), _mm256_setzero_si256() ),
3350
_mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[5] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[5] ) ) ), _mm256_setzero_si256() ),
3351
_mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[6] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[6] ) ) ), _mm256_setzero_si256() ),
3352
_mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[7] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[7] ) ) ), _mm256_setzero_si256() ),
3353
};
3354
3355
// find selector
3356
__m256i mulErr = _mm256_setzero_si256();
3357
for( int j=0; j<16; j++ )
3358
{
3359
__m256i s16Wide = _mm256_broadcastsi128_si256( sr[j] );
3360
__m256i err1, err2;
3361
3362
err1 = _mm256_sub_epi16( s16Wide, modMul[0] );
3363
__m256i localErr = _mm256_mullo_epi16( err1, err1 );
3364
3365
err1 = _mm256_sub_epi16( s16Wide, modMul[1] );
3366
err2 = _mm256_mullo_epi16( err1, err1 );
3367
localErr = _mm256_min_epu16( localErr, err2 );
3368
3369
err1 = _mm256_sub_epi16( s16Wide, modMul[2] );
3370
err2 = _mm256_mullo_epi16( err1, err1 );
3371
localErr = _mm256_min_epu16( localErr, err2 );
3372
3373
err1 = _mm256_sub_epi16( s16Wide, modMul[3] );
3374
err2 = _mm256_mullo_epi16( err1, err1 );
3375
localErr = _mm256_min_epu16( localErr, err2 );
3376
3377
err1 = _mm256_sub_epi16( s16Wide, modMul[4] );
3378
err2 = _mm256_mullo_epi16( err1, err1 );
3379
localErr = _mm256_min_epu16( localErr, err2 );
3380
3381
err1 = _mm256_sub_epi16( s16Wide, modMul[5] );
3382
err2 = _mm256_mullo_epi16( err1, err1 );
3383
localErr = _mm256_min_epu16( localErr, err2 );
3384
3385
err1 = _mm256_sub_epi16( s16Wide, modMul[6] );
3386
err2 = _mm256_mullo_epi16( err1, err1 );
3387
localErr = _mm256_min_epu16( localErr, err2 );
3388
3389
err1 = _mm256_sub_epi16( s16Wide, modMul[7] );
3390
err2 = _mm256_mullo_epi16( err1, err1 );
3391
localErr = _mm256_min_epu16( localErr, err2 );
3392
// the saturating add can clamp the accumulated error at 0xffff, but since we only compare errors to find the minimum, that is acceptable
mulErr = _mm256_adds_epu16( mulErr, localErr );
3395
}
3396
uint64_t minPos1 = _mm_cvtsi128_si64( _mm_minpos_epu16( _mm256_castsi256_si128( mulErr ) ) );
3397
uint64_t minPos2 = _mm_cvtsi128_si64( _mm_minpos_epu16( _mm256_extracti128_si256( mulErr, 1 ) ) );
3398
int sel = ( ( minPos1 & 0xFFFF ) < ( minPos2 & 0xFFFF ) ) ? ( minPos1 >> 16 ) : ( 8 + ( minPos2 >> 16 ) );
3399
3400
__m128i recVal16;
3401
switch( sel )
3402
{
3403
case 0:
3404
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ) ), _mm_setzero_si128() );
3405
break;
3406
case 1:
3407
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ) ), _mm_setzero_si128() );
3408
break;
3409
case 2:
3410
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ) ), _mm_setzero_si128() );
3411
break;
3412
case 3:
3413
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ) ), _mm_setzero_si128() );
3414
break;
3415
case 4:
3416
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ) ), _mm_setzero_si128() );
3417
break;
3418
case 5:
3419
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ) ), _mm_setzero_si128() );
3420
break;
3421
case 6:
3422
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ) ), _mm_setzero_si128() );
3423
break;
3424
case 7:
3425
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ) ), _mm_setzero_si128() );
3426
break;
3427
case 8:
3428
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ) ), _mm_setzero_si128() );
3429
break;
3430
case 9:
3431
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ) ), _mm_setzero_si128() );
3432
break;
3433
case 10:
3434
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ) ), _mm_setzero_si128() );
3435
break;
3436
case 11:
3437
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ) ), _mm_setzero_si128() );
3438
break;
3439
case 12:
3440
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ) ), _mm_setzero_si128() );
3441
break;
3442
case 13:
3443
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ) ), _mm_setzero_si128() );
3444
break;
3445
case 14:
3446
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ) ), _mm_setzero_si128() );
3447
break;
3448
case 15:
3449
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ) ), _mm_setzero_si128() );
3450
break;
3451
default:
3452
assert( false );
3453
break;
3454
}
3455
#else
3456
// wide multiplier
3457
__m128i rangeMul[16] = {
3458
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ) ), _mm_setzero_si128() ),
3459
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ) ), _mm_setzero_si128() ),
3460
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ) ), _mm_setzero_si128() ),
3461
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ) ), _mm_setzero_si128() ),
3462
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ) ), _mm_setzero_si128() ),
3463
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ) ), _mm_setzero_si128() ),
3464
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ) ), _mm_setzero_si128() ),
3465
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ) ), _mm_setzero_si128() ),
3466
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ) ), _mm_setzero_si128() ),
3467
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ) ), _mm_setzero_si128() ),
3468
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ) ), _mm_setzero_si128() ),
3469
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ) ), _mm_setzero_si128() ),
3470
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ) ), _mm_setzero_si128() ),
3471
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ) ), _mm_setzero_si128() ),
3472
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ) ), _mm_setzero_si128() ),
3473
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ) ), _mm_setzero_si128() )
3474
};
3475
3476
// find selector
3477
int err = std::numeric_limits<int>::max();
3478
int sel;
3479
for( int r=0; r<16; r++ )
3480
{
3481
__m128i err1, err2, minerr;
3482
__m128i recVal16 = rangeMul[r];
3483
int rangeErr;
3484
3485
err1 = _mm_sub_epi16( sr[0], recVal16 );
3486
err2 = _mm_mullo_epi16( err1, err1 );
3487
minerr = _mm_minpos_epu16( err2 );
3488
rangeErr = _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3489
3490
err1 = _mm_sub_epi16( sr[1], recVal16 );
3491
err2 = _mm_mullo_epi16( err1, err1 );
3492
minerr = _mm_minpos_epu16( err2 );
3493
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3494
3495
err1 = _mm_sub_epi16( sr[2], recVal16 );
3496
err2 = _mm_mullo_epi16( err1, err1 );
3497
minerr = _mm_minpos_epu16( err2 );
3498
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3499
3500
err1 = _mm_sub_epi16( sr[3], recVal16 );
3501
err2 = _mm_mullo_epi16( err1, err1 );
3502
minerr = _mm_minpos_epu16( err2 );
3503
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3504
3505
err1 = _mm_sub_epi16( sr[4], recVal16 );
3506
err2 = _mm_mullo_epi16( err1, err1 );
3507
minerr = _mm_minpos_epu16( err2 );
3508
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3509
3510
err1 = _mm_sub_epi16( sr[5], recVal16 );
3511
err2 = _mm_mullo_epi16( err1, err1 );
3512
minerr = _mm_minpos_epu16( err2 );
3513
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3514
3515
err1 = _mm_sub_epi16( sr[6], recVal16 );
3516
err2 = _mm_mullo_epi16( err1, err1 );
3517
minerr = _mm_minpos_epu16( err2 );
3518
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3519
3520
err1 = _mm_sub_epi16( sr[7], recVal16 );
3521
err2 = _mm_mullo_epi16( err1, err1 );
3522
minerr = _mm_minpos_epu16( err2 );
3523
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3524
3525
err1 = _mm_sub_epi16( sr[8], recVal16 );
3526
err2 = _mm_mullo_epi16( err1, err1 );
3527
minerr = _mm_minpos_epu16( err2 );
3528
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3529
3530
err1 = _mm_sub_epi16( sr[9], recVal16 );
3531
err2 = _mm_mullo_epi16( err1, err1 );
3532
minerr = _mm_minpos_epu16( err2 );
3533
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3534
3535
err1 = _mm_sub_epi16( sr[10], recVal16 );
3536
err2 = _mm_mullo_epi16( err1, err1 );
3537
minerr = _mm_minpos_epu16( err2 );
3538
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3539
3540
err1 = _mm_sub_epi16( sr[11], recVal16 );
3541
err2 = _mm_mullo_epi16( err1, err1 );
3542
minerr = _mm_minpos_epu16( err2 );
3543
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3544
3545
err1 = _mm_sub_epi16( sr[12], recVal16 );
3546
err2 = _mm_mullo_epi16( err1, err1 );
3547
minerr = _mm_minpos_epu16( err2 );
3548
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3549
3550
err1 = _mm_sub_epi16( sr[13], recVal16 );
3551
err2 = _mm_mullo_epi16( err1, err1 );
3552
minerr = _mm_minpos_epu16( err2 );
3553
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3554
3555
err1 = _mm_sub_epi16( sr[14], recVal16 );
3556
err2 = _mm_mullo_epi16( err1, err1 );
3557
minerr = _mm_minpos_epu16( err2 );
3558
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3559
3560
err1 = _mm_sub_epi16( sr[15], recVal16 );
3561
err2 = _mm_mullo_epi16( err1, err1 );
3562
minerr = _mm_minpos_epu16( err2 );
3563
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3564
3565
if( rangeErr < err )
3566
{
3567
err = rangeErr;
3568
sel = r;
3569
if( err == 0 ) break;
3570
}
3571
}
3572
3573
__m128i recVal16 = rangeMul[sel];
3574
#endif
3575
3576
// find indices
3577
__m128i err1, err2, minerr;
3578
uint64_t idx = 0, tmp;
3579
3580
err1 = _mm_sub_epi16( sr[0], recVal16 );
3581
err2 = _mm_mullo_epi16( err1, err1 );
3582
minerr = _mm_minpos_epu16( err2 );
3583
tmp = _mm_cvtsi128_si64( minerr );
3584
idx |= ( tmp >> 16 ) << 15*3;
3585
3586
err1 = _mm_sub_epi16( sr[1], recVal16 );
3587
err2 = _mm_mullo_epi16( err1, err1 );
3588
minerr = _mm_minpos_epu16( err2 );
3589
tmp = _mm_cvtsi128_si64( minerr );
3590
idx |= ( tmp >> 16 ) << 14*3;
3591
3592
err1 = _mm_sub_epi16( sr[2], recVal16 );
3593
err2 = _mm_mullo_epi16( err1, err1 );
3594
minerr = _mm_minpos_epu16( err2 );
3595
tmp = _mm_cvtsi128_si64( minerr );
3596
idx |= ( tmp >> 16 ) << 13*3;
3597
3598
err1 = _mm_sub_epi16( sr[3], recVal16 );
3599
err2 = _mm_mullo_epi16( err1, err1 );
3600
minerr = _mm_minpos_epu16( err2 );
3601
tmp = _mm_cvtsi128_si64( minerr );
3602
idx |= ( tmp >> 16 ) << 12*3;
3603
3604
err1 = _mm_sub_epi16( sr[4], recVal16 );
3605
err2 = _mm_mullo_epi16( err1, err1 );
3606
minerr = _mm_minpos_epu16( err2 );
3607
tmp = _mm_cvtsi128_si64( minerr );
3608
idx |= ( tmp >> 16 ) << 11*3;
3609
3610
err1 = _mm_sub_epi16( sr[5], recVal16 );
3611
err2 = _mm_mullo_epi16( err1, err1 );
3612
minerr = _mm_minpos_epu16( err2 );
3613
tmp = _mm_cvtsi128_si64( minerr );
3614
idx |= ( tmp >> 16 ) << 10*3;
3615
3616
err1 = _mm_sub_epi16( sr[6], recVal16 );
3617
err2 = _mm_mullo_epi16( err1, err1 );
3618
minerr = _mm_minpos_epu16( err2 );
3619
tmp = _mm_cvtsi128_si64( minerr );
3620
idx |= ( tmp >> 16 ) << 9*3;
3621
3622
err1 = _mm_sub_epi16( sr[7], recVal16 );
3623
err2 = _mm_mullo_epi16( err1, err1 );
3624
minerr = _mm_minpos_epu16( err2 );
3625
tmp = _mm_cvtsi128_si64( minerr );
3626
idx |= ( tmp >> 16 ) << 8*3;
3627
3628
err1 = _mm_sub_epi16( sr[8], recVal16 );
3629
err2 = _mm_mullo_epi16( err1, err1 );
3630
minerr = _mm_minpos_epu16( err2 );
3631
tmp = _mm_cvtsi128_si64( minerr );
3632
idx |= ( tmp >> 16 ) << 7*3;
3633
3634
err1 = _mm_sub_epi16( sr[9], recVal16 );
3635
err2 = _mm_mullo_epi16( err1, err1 );
3636
minerr = _mm_minpos_epu16( err2 );
3637
tmp = _mm_cvtsi128_si64( minerr );
3638
idx |= ( tmp >> 16 ) << 6*3;
3639
3640
err1 = _mm_sub_epi16( sr[10], recVal16 );
3641
err2 = _mm_mullo_epi16( err1, err1 );
3642
minerr = _mm_minpos_epu16( err2 );
3643
tmp = _mm_cvtsi128_si64( minerr );
3644
idx |= ( tmp >> 16 ) << 5*3;
3645
3646
err1 = _mm_sub_epi16( sr[11], recVal16 );
3647
err2 = _mm_mullo_epi16( err1, err1 );
3648
minerr = _mm_minpos_epu16( err2 );
3649
tmp = _mm_cvtsi128_si64( minerr );
3650
idx |= ( tmp >> 16 ) << 4*3;
3651
3652
err1 = _mm_sub_epi16( sr[12], recVal16 );
3653
err2 = _mm_mullo_epi16( err1, err1 );
3654
minerr = _mm_minpos_epu16( err2 );
3655
tmp = _mm_cvtsi128_si64( minerr );
3656
idx |= ( tmp >> 16 ) << 3*3;
3657
3658
err1 = _mm_sub_epi16( sr[13], recVal16 );
3659
err2 = _mm_mullo_epi16( err1, err1 );
3660
minerr = _mm_minpos_epu16( err2 );
3661
tmp = _mm_cvtsi128_si64( minerr );
3662
idx |= ( tmp >> 16 ) << 2*3;
3663
3664
err1 = _mm_sub_epi16( sr[14], recVal16 );
3665
err2 = _mm_mullo_epi16( err1, err1 );
3666
minerr = _mm_minpos_epu16( err2 );
3667
tmp = _mm_cvtsi128_si64( minerr );
3668
idx |= ( tmp >> 16 ) << 1*3;
3669
3670
err1 = _mm_sub_epi16( sr[15], recVal16 );
3671
err2 = _mm_mullo_epi16( err1, err1 );
3672
minerr = _mm_minpos_epu16( err2 );
3673
tmp = _mm_cvtsi128_si64( minerr );
3674
idx |= ( tmp >> 16 ) << 0*3;
3675
3676
uint16_t rm[8];
3677
_mm_storeu_si128( (__m128i*)rm, mul );
3678
uint16_t sm = _mm_cvtsi128_si64( srcMid );
3679
3680
uint64_t d = ( uint64_t( sm ) << 56 ) |
3681
( uint64_t( rm[GetMulSel( sel )] ) << 52 ) |
3682
( uint64_t( sel ) << 48 ) |
3683
idx;
3684
3685
return _bswap64( d );
3686
#elif defined __ARM_NEON
3687
3688
int16x8_t srcMidWide, multipliers;
3689
int srcMid;
3690
uint8x16_t srcAlphaBlock = vld1q_u8( src );
3691
{
3692
if( checkSolid )
3693
{
3694
uint8_t ref = src[0];
3695
uint8x16_t a0 = vdupq_n_u8( ref );
3696
uint8x16_t r = vceqq_u8( srcAlphaBlock, a0 );
3697
int64x2_t m = vreinterpretq_s64_u8( r );
3698
if( m[0] == -1 && m[1] == -1 )
3699
return ref;
3700
}
3701
3702
// srcRange
3703
#ifdef __aarch64__
3704
uint8_t min = vminvq_u8( srcAlphaBlock );
3705
uint8_t max = vmaxvq_u8( srcAlphaBlock );
3706
uint8_t srcRange = max - min;
3707
multipliers = vqaddq_s16( vshrq_n_s16( vqdmulhq_n_s16( g_alphaRange_NEON, srcRange ), 1 ), vdupq_n_s16( 1 ) );
3708
srcMid = min + srcRange / 2;
3709
srcMidWide = vdupq_n_s16( srcMid );
3710
#else
3711
uint8x8_t vmin = vpmin_u8( vget_low_u8( srcAlphaBlock ), vget_high_u8( srcAlphaBlock ) );
3712
vmin = vpmin_u8( vmin, vmin );
3713
vmin = vpmin_u8( vmin, vmin );
3714
vmin = vpmin_u8( vmin, vmin );
3715
uint8x8_t vmax = vpmax_u8( vget_low_u8( srcAlphaBlock ), vget_high_u8( srcAlphaBlock ) );
3716
vmax = vpmax_u8( vmax, vmax );
3717
vmax = vpmax_u8( vmax, vmax );
3718
vmax = vpmax_u8( vmax, vmax );
3719
3720
int16x8_t srcRangeWide = vreinterpretq_s16_u16( vsubl_u8( vmax, vmin ) );
3721
multipliers = vqaddq_s16( vshrq_n_s16( vqdmulhq_s16( g_alphaRange_NEON, srcRangeWide ), 1 ), vdupq_n_s16( 1 ) );
3722
srcMidWide = vsraq_n_s16( vreinterpretq_s16_u16(vmovl_u8(vmin)), srcRangeWide, 1);
3723
srcMid = vgetq_lane_s16( srcMidWide, 0 );
3724
#endif
3725
}
3726
3727
// calculate reconstructed values
3728
#define EAC_APPLY_16X( m ) m( 0 ) m( 1 ) m( 2 ) m( 3 ) m( 4 ) m( 5 ) m( 6 ) m( 7 ) m( 8 ) m( 9 ) m( 10 ) m( 11 ) m( 12 ) m( 13 ) m( 14 ) m( 15 )
3729
3730
#define EAC_RECONSTRUCT_VALUE( n ) vqmovun_s16( vmlaq_s16( srcMidWide, g_alpha_NEON[n], WidenMultiplier_EAC_NEON<n>( multipliers ) ) ),
3731
uint8x8_t recVals[16] = { EAC_APPLY_16X( EAC_RECONSTRUCT_VALUE ) };
3732
3733
// find selector
3734
int err = std::numeric_limits<int>::max();
3735
int sel = 0;
3736
for( int r = 0; r < 16; r++ )
3737
{
3738
uint8x8_t recVal = recVals[r];
3739
3740
int rangeErr = 0;
3741
#define EAC_ACCUMULATE_ERROR( n ) rangeErr += MinError_EAC_NEON( ErrorProbe_EAC_NEON<n>( recVal, srcAlphaBlock ) );
3742
EAC_APPLY_16X( EAC_ACCUMULATE_ERROR )
3743
3744
if( rangeErr < err )
3745
{
3746
err = rangeErr;
3747
sel = r;
3748
if ( err == 0 ) break;
3749
}
3750
}
3751
3752
// combine results
3753
uint64_t d = ( uint64_t( srcMid ) << 56 ) |
3754
( uint64_t( multipliers[GetMulSel( sel )] ) << 52 ) |
3755
( uint64_t( sel ) << 48);
3756
3757
// generate indices
3758
uint8x8_t recVal = recVals[sel];
3759
#define EAC_INSERT_INDEX(n) d |= MinErrorIndex_EAC_NEON<n>( recVal, srcAlphaBlock );
3760
EAC_APPLY_16X( EAC_INSERT_INDEX )
3761
3762
return _bswap64( d );
3763
3764
#undef EAC_APPLY_16X
3765
#undef EAC_INSERT_INDEX
3766
#undef EAC_ACCUMULATE_ERROR
3767
#undef EAC_RECONSTRUCT_VALUE
3768
3769
#else
3770
if( checkSolid )
3771
{
3772
bool solid = true;
3773
const uint8_t* ptr = src + 1;
3774
const uint8_t ref = *src;
3775
for( int i=1; i<16; i++ )
3776
{
3777
if( ref != *ptr++ )
3778
{
3779
solid = false;
3780
break;
3781
}
3782
}
3783
if( solid )
3784
{
3785
return ref;
3786
}
3787
}
3788
3789
uint8_t min = src[0];
3790
uint8_t max = src[0];
3791
for( int i=1; i<16; i++ )
3792
{
3793
if( min > src[i] ) min = src[i];
3794
else if( max < src[i] ) max = src[i];
3795
}
3796
int srcRange = max - min;
3797
int srcMid = min + srcRange / 2;
3798
3799
uint8_t buf[16][16];
3800
int err = std::numeric_limits<int>::max();
3801
int sel;
3802
int selmul;
3803
for( int r=0; r<16; r++ )
3804
{
3805
int mul = ( ( srcRange * g_alphaRange[r] ) >> 16 ) + 1;
3806
3807
int rangeErr = 0;
3808
for( int i=0; i<16; i++ )
3809
{
3810
const auto srcVal = src[i];
3811
3812
int idx = 0;
3813
const auto modVal = g_alpha[r][0] * mul;
3814
const auto recVal = clampu8( srcMid + modVal );
3815
int localErr = sq( srcVal - recVal );
3816
3817
if( localErr != 0 )
3818
{
3819
for( int j=1; j<8; j++ )
3820
{
3821
const auto modVal = g_alpha[r][j] * mul;
3822
const auto recVal = clampu8( srcMid + modVal );
3823
const auto errProbe = sq( srcVal - recVal );
3824
if( errProbe < localErr )
3825
{
3826
localErr = errProbe;
3827
idx = j;
3828
}
3829
}
3830
}
3831
3832
buf[r][i] = idx;
3833
rangeErr += localErr;
3834
}
3835
3836
if( rangeErr < err )
3837
{
3838
err = rangeErr;
3839
sel = r;
3840
selmul = mul;
3841
if( err == 0 ) break;
3842
}
3843
}
3844
3845
uint64_t d = ( uint64_t( srcMid ) << 56 ) |
3846
( uint64_t( selmul ) << 52 ) |
3847
( uint64_t( sel ) << 48 );
3848
3849
int offset = 45;
3850
auto ptr = buf[sel];
3851
for( int i=0; i<16; i++ )
3852
{
3853
d |= uint64_t( *ptr++ ) << offset;
3854
offset -= 3;
3855
}
3856
3857
return _bswap64( d );
3858
#endif
3859
}
3860
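// Every path of ProcessAlpha_ETC2() assembles the same EAC block layout:
// bits 63..56 base value, 55..52 multiplier, 51..48 table selector,
// 47..0 sixteen 3-bit pixel indices, with the whole word then byte-swapped.
// A compiled-out restatement (AssembleEacBlock is an illustration only):
#if 0
static inline uint64_t AssembleEacBlock( uint8_t base, uint8_t mul, uint8_t table, uint64_t indices48 )
{
    const uint64_t d = ( uint64_t( base ) << 56 ) | ( uint64_t( mul ) << 52 ) | ( uint64_t( table ) << 48 ) | indices48;
    return _bswap64( d );
}
#endif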
3861
void CompressEtc1Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
3862
{
3863
int w = 0;
3864
uint32_t buf[4*4];
3865
do
3866
{
3867
#ifdef __SSE4_1__
3868
__m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
3869
__m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
3870
__m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
3871
__m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
_MM_TRANSPOSE4_PS( px0, px1, px2, px3 );
3874
3875
_mm_store_si128( (__m128i*)(buf + 0), _mm_castps_si128( px0 ) );
3876
_mm_store_si128( (__m128i*)(buf + 4), _mm_castps_si128( px1 ) );
3877
_mm_store_si128( (__m128i*)(buf + 8), _mm_castps_si128( px2 ) );
3878
_mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) );
3879
3880
src += 4;
3881
#else
3882
auto ptr = buf;
3883
for( int x=0; x<4; x++ )
3884
{
3885
*ptr++ = *src;
3886
src += width;
3887
*ptr++ = *src;
3888
src += width;
3889
*ptr++ = *src;
3890
src += width;
3891
*ptr++ = *src;
3892
src -= width * 3 - 1;
3893
}
3894
#endif
3895
if( ++w == width/4 )
3896
{
3897
src += width * 3;
3898
w = 0;
3899
}
3900
*dst++ = ProcessRGB( (uint8_t*)buf );
3901
}
3902
while( --blocks );
3903
}
3904
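// CompressEtc1Rgb() (and every Compress* entry point below) walks the image
// one 4x4 block at a time: the scalar path gathers a column with `src += width`
// and steps back with `src -= width * 3 - 1`, and `src += width * 3` skips to
// the next block row once `width / 4` blocks have been emitted. A hypothetical
// compiled-out caller, assuming a tightly packed image whose dimensions are
// multiples of 4 (illustration only):
#if 0
static void CompressImageEtc1( const uint32_t* rgba, uint64_t* out, size_t width, size_t height )
{
    const uint32_t blocks = uint32_t( ( width / 4 ) * ( height / 4 ) ); // one 8-byte block per 4x4 tile
    CompressEtc1Rgb( rgba, out, blocks, width );
}
#endif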
3905
void CompressEtc1RgbDither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
3906
{
3907
int w = 0;
3908
uint32_t buf[4*4];
3909
do
3910
{
3911
#ifdef __SSE4_1__
3912
__m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
3913
__m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
3914
__m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
3915
__m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
3916
3917
_MM_TRANSPOSE4_PS( px0, px1, px2, px3 );
3918
3919
# ifdef __AVX2__
3920
DitherAvx2( (uint8_t*)buf, _mm_castps_si128( px0 ), _mm_castps_si128( px1 ), _mm_castps_si128( px2 ), _mm_castps_si128( px3 ) );
3921
# else
3922
_mm_store_si128( (__m128i*)(buf + 0), _mm_castps_si128( px0 ) );
3923
_mm_store_si128( (__m128i*)(buf + 4), _mm_castps_si128( px1 ) );
3924
_mm_store_si128( (__m128i*)(buf + 8), _mm_castps_si128( px2 ) );
3925
_mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) );
3926
3927
Dither( (uint8_t*)buf );
3928
# endif
3929
3930
src += 4;
3931
#else
3932
auto ptr = buf;
3933
for( int x=0; x<4; x++ )
3934
{
3935
*ptr++ = *src;
3936
src += width;
3937
*ptr++ = *src;
3938
src += width;
3939
*ptr++ = *src;
3940
src += width;
3941
*ptr++ = *src;
3942
src -= width * 3 - 1;
3943
}
3944
#endif
3945
if( ++w == width/4 )
3946
{
3947
src += width * 3;
3948
w = 0;
3949
}
3950
*dst++ = ProcessRGB( (uint8_t*)buf );
3951
}
3952
while( --blocks );
3953
}
3954
3955
void CompressEtc2Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics )
3956
{
3957
int w = 0;
3958
uint32_t buf[4*4];
3959
do
3960
{
3961
#ifdef __SSE4_1__
3962
__m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
3963
__m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
3964
__m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
3965
__m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
3966
3967
_MM_TRANSPOSE4_PS( px0, px1, px2, px3 );
3968
3969
_mm_store_si128( (__m128i*)(buf + 0), _mm_castps_si128( px0 ) );
3970
_mm_store_si128( (__m128i*)(buf + 4), _mm_castps_si128( px1 ) );
3971
_mm_store_si128( (__m128i*)(buf + 8), _mm_castps_si128( px2 ) );
3972
_mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) );
3973
3974
src += 4;
3975
#else
3976
auto ptr = buf;
3977
for( int x=0; x<4; x++ )
3978
{
3979
*ptr++ = *src;
3980
src += width;
3981
*ptr++ = *src;
3982
src += width;
3983
*ptr++ = *src;
3984
src += width;
3985
*ptr++ = *src;
3986
src -= width * 3 - 1;
3987
}
3988
#endif
3989
if( ++w == width/4 )
3990
{
3991
src += width * 3;
3992
w = 0;
3993
}
3994
*dst++ = ProcessRGB_ETC2( (uint8_t*)buf, useHeuristics );
3995
}
3996
while( --blocks );
3997
}
3998
3999
void CompressEtc2Rgba( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics )
4000
{
4001
int w = 0;
4002
uint32_t rgba[4*4];
4003
uint8_t alpha[4*4];
4004
do
4005
{
4006
#ifdef __SSE4_1__
4007
__m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
4008
__m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
4009
__m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
4010
__m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
4011
4012
_MM_TRANSPOSE4_PS( px0, px1, px2, px3 );
4013
4014
__m128i c0 = _mm_castps_si128( px0 );
4015
__m128i c1 = _mm_castps_si128( px1 );
4016
__m128i c2 = _mm_castps_si128( px2 );
4017
__m128i c3 = _mm_castps_si128( px3 );
4018
4019
_mm_store_si128( (__m128i*)(rgba + 0), c0 );
4020
_mm_store_si128( (__m128i*)(rgba + 4), c1 );
4021
_mm_store_si128( (__m128i*)(rgba + 8), c2 );
4022
_mm_store_si128( (__m128i*)(rgba + 12), c3 );
4023
4024
__m128i mask = _mm_setr_epi32( 0x0f0b0703, -1, -1, -1 );
4025
4026
__m128i a0 = _mm_shuffle_epi8( c0, mask );
4027
__m128i a1 = _mm_shuffle_epi8( c1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
4028
__m128i a2 = _mm_shuffle_epi8( c2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
4029
__m128i a3 = _mm_shuffle_epi8( c3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
4030
4031
__m128i s0 = _mm_or_si128( a0, a1 );
4032
__m128i s1 = _mm_or_si128( a2, a3 );
4033
__m128i s2 = _mm_or_si128( s0, s1 );
4034
4035
_mm_store_si128( (__m128i*)alpha, s2 );
4036
4037
src += 4;
4038
#else
4039
auto ptr = rgba;
4040
auto ptr8 = alpha;
4041
for( int x=0; x<4; x++ )
4042
{
4043
auto v = *src;
4044
*ptr++ = v;
4045
*ptr8++ = v >> 24;
4046
src += width;
4047
v = *src;
4048
*ptr++ = v;
4049
*ptr8++ = v >> 24;
4050
src += width;
4051
v = *src;
4052
*ptr++ = v;
4053
*ptr8++ = v >> 24;
4054
src += width;
4055
v = *src;
4056
*ptr++ = v;
4057
*ptr8++ = v >> 24;
4058
src -= width * 3 - 1;
4059
}
4060
#endif
4061
if( ++w == width/4 )
4062
{
4063
src += width * 3;
4064
w = 0;
4065
}
4066
*dst++ = ProcessAlpha_ETC2<true>( alpha );
4067
*dst++ = ProcessRGB_ETC2( (uint8_t*)rgba, useHeuristics );
4068
}
4069
while( --blocks );
4070
}
4071
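// CompressEtc2Rgba() writes 16 bytes per 4x4 tile: the EAC alpha block first,
// then the ETC2 color block, so the destination needs two uint64_t per block.
// A hypothetical compiled-out caller (illustration only):
#if 0
static void CompressImageEtc2Rgba( const uint32_t* rgba, uint64_t* out, size_t width, size_t height, bool useHeuristics )
{
    const uint32_t blocks = uint32_t( ( width / 4 ) * ( height / 4 ) );
    // `out` must hold 2 * blocks entries: alpha word, then color word, per block.
    CompressEtc2Rgba( rgba, out, blocks, width, useHeuristics );
}
#endif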
4072
void CompressEacR( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
4073
{
4074
int w = 0;
4075
uint8_t r[4*4];
4076
do
4077
{
4078
#ifdef __SSE4_1__
4079
__m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
4080
__m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
4081
__m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
4082
__m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
4083
4084
_MM_TRANSPOSE4_PS( px0, px1, px2, px3 );
4085
4086
__m128i c0 = _mm_castps_si128( px0 );
4087
__m128i c1 = _mm_castps_si128( px1 );
4088
__m128i c2 = _mm_castps_si128( px2 );
4089
__m128i c3 = _mm_castps_si128( px3 );
4090
4091
__m128i mask = _mm_setr_epi32( 0x0e0a0602, -1, -1, -1 );
4092
4093
__m128i a0 = _mm_shuffle_epi8( c0, mask );
4094
__m128i a1 = _mm_shuffle_epi8( c1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
4095
__m128i a2 = _mm_shuffle_epi8( c2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
4096
__m128i a3 = _mm_shuffle_epi8( c3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
4097
4098
__m128i s0 = _mm_or_si128( a0, a1 );
4099
__m128i s1 = _mm_or_si128( a2, a3 );
4100
__m128i s2 = _mm_or_si128( s0, s1 );
4101
4102
_mm_store_si128( (__m128i*)r, s2 );
4103
4104
src += 4;
4105
#else
4106
auto ptr8 = r;
4107
for( int x=0; x<4; x++ )
4108
{
4109
auto v = *src;
4110
*ptr8++ = (v & 0xff0000) >> 16;
4111
src += width;
4112
v = *src;
4113
*ptr8++ = (v & 0xff0000) >> 16;
4114
src += width;
4115
v = *src;
4116
*ptr8++ = (v & 0xff0000) >> 16;
4117
src += width;
4118
v = *src;
4119
*ptr8++ = (v & 0xff0000) >> 16;
4120
src -= width * 3 - 1;
4121
}
4122
#endif
4123
if( ++w == width/4 )
4124
{
4125
src += width * 3;
4126
w = 0;
4127
}
4128
*dst++ = ProcessAlpha_ETC2<false>( r );
4129
}
4130
while( --blocks );
4131
}
4132
4133
void CompressEacRg( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
4134
{
4135
int w = 0;
4136
uint8_t rg[4*4*2];
4137
do
4138
{
4139
#ifdef __SSE4_1__
4140
__m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
4141
__m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
4142
__m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
4143
__m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
4144
4145
_MM_TRANSPOSE4_PS( px0, px1, px2, px3 );
4146
4147
__m128i c0 = _mm_castps_si128( px0 );
4148
__m128i c1 = _mm_castps_si128( px1 );
4149
__m128i c2 = _mm_castps_si128( px2 );
4150
__m128i c3 = _mm_castps_si128( px3 );
4151
4152
__m128i mask = _mm_setr_epi32( 0x0e0a0602, -1, -1, -1 );
4153
4154
__m128i r0 = _mm_shuffle_epi8( c0, mask );
4155
__m128i r1 = _mm_shuffle_epi8( c1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
4156
__m128i r2 = _mm_shuffle_epi8( c2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
4157
__m128i r3 = _mm_shuffle_epi8( c3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
4158
4159
__m128i s0 = _mm_or_si128( r0, r1 );
4160
__m128i s1 = _mm_or_si128( r2, r3 );
4161
__m128i s2 = _mm_or_si128( s0, s1 );
4162
4163
_mm_store_si128( (__m128i*)rg, s2 );
4164
4165
mask = _mm_setr_epi32( 0x0d090501, -1, -1, -1 );
4166
4167
r0 = _mm_shuffle_epi8( c0, mask );
4168
r1 = _mm_shuffle_epi8( c1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
4169
r2 = _mm_shuffle_epi8( c2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
4170
r3 = _mm_shuffle_epi8( c3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
4171
4172
s0 = _mm_or_si128( r0, r1 );
4173
s1 = _mm_or_si128( r2, r3 );
4174
s2 = _mm_or_si128( s0, s1 );
4175
4176
_mm_store_si128( (__m128i*)&rg[16], s2 );
4177
src += 4;
4178
#else
4179
auto ptrr = rg;
4180
auto ptrg = ptrr + 16;
4181
for( int x=0; x<4; x++ )
4182
{
4183
auto v = *src;
4184
*ptrr++ = (v & 0xff0000) >> 16;
4185
*ptrg++ = (v & 0xff00) >> 8;
4186
src += width;
4187
v = *src;
4188
*ptrr++ = (v & 0xff0000) >> 16;
4189
*ptrg++ = (v & 0xff00) >> 8;
4190
src += width;
4191
v = *src;
4192
*ptrr++ = (v & 0xff0000) >> 16;
4193
*ptrg++ = (v & 0xff00) >> 8;
4194
src += width;
4195
v = *src;
4196
*ptrr++ = (v & 0xff0000) >> 16;
4197
*ptrg++ = (v & 0xff00) >> 8;
4198
src -= width * 3 - 1;
4199
}
4200
#endif
4201
if( ++w == width/4 )
4202
{
4203
src += width * 3;
4204
w = 0;
4205
}
4206
*dst++ = ProcessAlpha_ETC2<false>( rg );
4207
*dst++ = ProcessAlpha_ETC2<false>( &rg[16] );
4208
}
4209
while( --blocks );
4210
}
4211
4212
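// CompressEacRg() splits each pixel into its R and G bytes and encodes them as
// two independent single-channel EAC blocks per 4x4 tile, the R block first and
// the G block second. A hypothetical compiled-out caller (illustration only):
#if 0
static void CompressImageEacRg( const uint32_t* rgba, uint64_t* out, size_t width, size_t height )
{
    const uint32_t blocks = uint32_t( ( width / 4 ) * ( height / 4 ) );
    CompressEacRg( rgba, out, blocks, width ); // `out` must hold 2 * blocks entries
}
#endif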