GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/etcpak/ProcessRGB.cpp
1
#include <array>
2
#include <string.h>
3
#include <limits>
4
#ifdef __ARM_NEON
5
# include <arm_neon.h>
6
#endif
7
8
#include "Dither.hpp"
9
#include "ForceInline.hpp"
10
#include "Math.hpp"
11
#include "ProcessCommon.hpp"
12
#include "ProcessRGB.hpp"
13
#include "Tables.hpp"
14
#include "Vector.hpp"
15
#if defined __SSE4_1__ || defined __AVX2__ || defined _MSC_VER
16
# ifdef _MSC_VER
17
# include <intrin.h>
18
# include <Windows.h>
19
# define _bswap(x) _byteswap_ulong(x)
20
# define _bswap64(x) _byteswap_uint64(x)
21
# else
22
# include <x86intrin.h>
23
# endif
24
#endif
25
26
#ifndef _bswap
27
# define _bswap(x) __builtin_bswap32(x)
28
# define _bswap64(x) __builtin_bswap64(x)
29
#endif
30
31
static const uint32_t MaxError = 1065369600; // ((38+76+14) * 255)^2
32
// common T-/H-mode table
33
static uint8_t tableTH[8] = { 3, 6, 11, 16, 23, 32, 41, 64 };
34
35
// thresholds for the early compression-mode decision scheme
36
// default: 0.03, 0.09, and 0.38
37
float ecmd_threshold[3] = { 0.03f, 0.09f, 0.38f };
38
39
static const uint8_t ModeUndecided = 0;
40
static const uint8_t ModePlanar = 0x1;
41
static const uint8_t ModeTH = 0x2;
42
43
const unsigned int R = 2;
44
const unsigned int G = 1;
45
const unsigned int B = 0;
46
47
struct Luma
48
{
49
#ifdef __AVX2__
50
float max, min;
51
uint8_t minIdx = 255, maxIdx = 255;
52
__m128i luma8;
53
#elif defined __ARM_NEON && defined __aarch64__
54
float max, min;
55
uint8_t minIdx = 255, maxIdx = 255;
56
uint8x16_t luma8;
57
#else
58
uint8_t max = 0, min = 255, maxIdx = 0, minIdx = 0;
59
uint8_t val[16];
60
#endif
61
};
62
63
#ifdef __AVX2__
64
struct Plane
65
{
66
uint64_t plane;
67
uint64_t error;
68
__m256i sum4;
69
};
70
#endif
71
72
#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__)
73
struct Channels
74
{
75
#ifdef __AVX2__
76
__m128i r8, g8, b8;
77
#elif defined __ARM_NEON && defined __aarch64__
78
uint8x16x2_t r, g, b;
79
#endif
80
};
81
#endif
82
83
namespace
84
{
85
static etcpak_force_inline uint8_t clamp( uint8_t min, int16_t val, uint8_t max )
86
{
87
return val < min ? min : ( val > max ? max : val );
88
}
89
90
static etcpak_force_inline uint8_t clampMin( uint8_t min, int16_t val )
91
{
92
return val < min ? min : val;
93
}
94
95
static etcpak_force_inline uint8_t clampMax( int16_t val, uint8_t max )
96
{
97
return val > max ? max : val;
98
}
99
100
// slightly faster than std::sort
101
static void insertionSort( uint8_t* arr1, uint8_t* arr2 )
102
{
103
for( uint8_t i = 1; i < 16; ++i )
104
{
105
uint8_t value = arr1[i];
106
uint8_t hole = i;
107
108
for( ; hole > 0 && value < arr1[hole - 1]; --hole )
109
{
110
arr1[hole] = arr1[hole - 1];
111
arr2[hole] = arr2[hole - 1];
112
}
113
arr1[hole] = value;
114
arr2[hole] = i;
115
}
116
}
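// Illustrative usage sketch, not part of the encoder: the second array is kept
// paired with the first while sorting, so if it starts out as the identity
// permutation it ends up holding the original position of each sorted value.
#if 0
static void sortLumaExample( const uint8_t luma[16], uint8_t sorted[16], uint8_t order[16] )
{
    for( uint8_t i = 0; i < 16; ++i )
    {
        sorted[i] = luma[i];
        order[i] = i;                        // identity permutation, stays paired with sorted[]
    }
    insertionSort( sorted, order );
    // sorted[0]/order[0]   -> smallest luma value and the pixel it came from
    // sorted[15]/order[15] -> largest luma value and the pixel it came from
}
#endif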
117
118
// Converts indices from |a0|a1|e0|e1|i0|i1|m0|m1|b0|b1|f0|f1|j0|j1|n0|n1|c0|c1|g0|g1|k0|k1|o0|o1|d0|d1|h0|h1|l0|l1|p0|p1|, previously used by the T- and H-modes,
119
// into |p0|o0|n0|m0|l0|k0|j0|i0|h0|g0|f0|e0|d0|c0|b0|a0|p1|o1|n1|m1|l1|k1|j1|i1|h1|g1|f1|e1|d1|c1|b1|a1| which should be used for all modes.
120
// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
121
static etcpak_force_inline int indexConversion( int pixelIndices )
122
{
123
int correctIndices = 0;
124
int LSB[4][4];
125
int MSB[4][4];
126
int shift = 0;
127
for( int y = 3; y >= 0; y-- )
128
{
129
for( int x = 3; x >= 0; x-- )
130
{
131
LSB[x][y] = ( pixelIndices >> shift ) & 1;
132
shift++;
133
MSB[x][y] = ( pixelIndices >> shift ) & 1;
134
shift++;
135
}
136
}
137
shift = 0;
138
for( int x = 0; x < 4; x++ )
139
{
140
for( int y = 0; y < 4; y++ )
141
{
142
correctIndices |= ( LSB[x][y] << shift );
143
correctIndices |= ( MSB[x][y] << ( 16 + shift ) );
144
shift++;
145
}
146
}
147
return correctIndices;
148
}
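// Equivalent compact sketch of the conversion above (illustrative only, not used
// by the encoder): pair p of the packed (LSB,MSB) input corresponds to pixel
// ( x, y ) = ( 3 - p % 4, 3 - p / 4 ), and output bit s corresponds to
// ( x, y ) = ( s / 4, s % 4 ), with all LSBs in bits 0-15 and all MSBs in bits 16-31.
#if 0
static int indexConversionCompact( int pixelIndices )
{
    int out = 0;
    for( int s = 0; s < 16; s++ )
    {
        const int p = 4 * ( 3 - s % 4 ) + ( 3 - s / 4 );                 // source (LSB,MSB) pair
        out |= ( ( pixelIndices >> ( 2 * p ) ) & 1 ) << s;               // LSB plane
        out |= ( ( pixelIndices >> ( 2 * p + 1 ) ) & 1 ) << ( 16 + s );  // MSB plane
    }
    return out;
}
#endif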
149
150
// Swapping two RGB-colors
151
// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
152
static etcpak_force_inline void swapColors( uint8_t( colors )[2][3] )
153
{
154
uint8_t temp = colors[0][R];
155
colors[0][R] = colors[1][R];
156
colors[1][R] = temp;
157
158
temp = colors[0][G];
159
colors[0][G] = colors[1][G];
160
colors[1][G] = temp;
161
162
temp = colors[0][B];
163
colors[0][B] = colors[1][B];
164
colors[1][B] = temp;
165
}
166
167
168
// calculates quantized colors for T or H modes
169
void compressColor( uint8_t( currColor )[2][3], uint8_t( quantColor )[2][3], bool t_mode )
170
{
171
if( t_mode )
172
{
173
quantColor[0][R] = clampMax( 15 * ( currColor[0][R] + 8 ) / 255, 15 );
174
quantColor[0][G] = clampMax( 15 * ( currColor[0][G] + 8 ) / 255, 15 );
175
quantColor[0][B] = clampMax( 15 * ( currColor[0][B] + 8 ) / 255, 15 );
176
}
177
else // clamped to [1,14] to get a wider range
178
{
179
quantColor[0][R] = clamp( 1, 15 * ( currColor[0][R] + 8 ) / 255, 14 );
180
quantColor[0][G] = clamp( 1, 15 * ( currColor[0][G] + 8 ) / 255, 14 );
181
quantColor[0][B] = clamp( 1, 15 * ( currColor[0][B] + 8 ) / 255, 14 );
182
}
183
184
// clamped to [1,14] to get a wider range
185
quantColor[1][R] = clamp( 1, 15 * ( currColor[1][R] + 8 ) / 255, 14 );
186
quantColor[1][G] = clamp( 1, 15 * ( currColor[1][G] + 8 ) / 255, 14 );
187
quantColor[1][B] = clamp( 1, 15 * ( currColor[1][B] + 8 ) / 255, 14 );
188
}
189
190
// The following three decoding functions come from ETCPACK v2.74 and are slightly changed.
191
static etcpak_force_inline void decompressColor( uint8_t( colorsRGB444 )[2][3], uint8_t( colors )[2][3] )
192
{
193
// The color should be retrieved as:
194
//
195
// c = round(255/(2^r_bits-1))*comp_color
196
//
197
// This is similar to bit replication
198
//
199
// Note -- this code only works for bit replication from 4 bits and up --- 3 bits needs
200
// two copy operations.
201
colors[0][R] = ( colorsRGB444[0][R] << 4 ) | colorsRGB444[0][R];
202
colors[0][G] = ( colorsRGB444[0][G] << 4 ) | colorsRGB444[0][G];
203
colors[0][B] = ( colorsRGB444[0][B] << 4 ) | colorsRGB444[0][B];
204
colors[1][R] = ( colorsRGB444[1][R] << 4 ) | colorsRGB444[1][R];
205
colors[1][G] = ( colorsRGB444[1][G] << 4 ) | colorsRGB444[1][G];
206
colors[1][B] = ( colorsRGB444[1][B] << 4 ) | colorsRGB444[1][B];
207
}
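// Round-trip sketch for the two helpers above (illustrative only, not used by the
// encoder): compressColor() quantizes each 8-bit channel to 4 bits with
// 15 * ( c + 8 ) / 255, which is approximately round( c * 15 / 255 ), and
// decompressColor() expands a 4-bit value q back to 8 bits by nibble replication,
// ( q << 4 ) | q == q * 17.
#if 0
static void rgb444RoundTripExample()
{
    uint8_t base[2][3] = { { 200, 100, 50 }, { 30, 60, 90 } };
    uint8_t quant[2][3];
    uint8_t expanded[2][3];

    compressColor( base, quant, true );  // T-mode: first color in [0,15], second clamped to [1,14]
    decompressColor( quant, expanded );  // e.g. 200 -> 12 -> ( 12 << 4 ) | 12 == 204
}
#endif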
208
209
// calculates the paint colors from the block colors
210
// using a distance d and one of the H- or T-patterns.
211
static void calculatePaintColors59T( uint8_t d, uint8_t( colors )[2][3], uint8_t( pColors )[4][3] )
212
{
213
//////////////////////////////////////////////
214
//
215
// C3 C1 C4----C1---C2
216
// | | |
217
// | | |
218
// |-------| |
219
// | | |
220
// | | |
221
// C4 C2 C3
222
//
223
//////////////////////////////////////////////
224
225
// C4
226
pColors[3][R] = clampMin( 0, colors[1][R] - tableTH[d] );
227
pColors[3][G] = clampMin( 0, colors[1][G] - tableTH[d] );
228
pColors[3][B] = clampMin( 0, colors[1][B] - tableTH[d] );
229
230
// C3
231
pColors[0][R] = colors[0][R];
232
pColors[0][G] = colors[0][G];
233
pColors[0][B] = colors[0][B];
234
// C2
235
pColors[1][R] = clampMax( colors[1][R] + tableTH[d], 255 );
236
pColors[1][G] = clampMax( colors[1][G] + tableTH[d], 255 );
237
pColors[1][B] = clampMax( colors[1][B] + tableTH[d], 255 );
238
// C1
239
pColors[2][R] = colors[1][R];
240
pColors[2][G] = colors[1][G];
241
pColors[2][B] = colors[1][B];
242
}
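// Worked example for the T-mode helper above (illustrative only, not used by the
// encoder): with distance index d == 2 ( tableTH[2] == 11 ), each channel of the
// second base color is spread into a +/-11 pair while the first base color is
// kept as-is.
#if 0
static void paintColors59TExample()
{
    uint8_t base[2][3] = { { 50, 100, 200 }, { 90, 60, 30 } };
    uint8_t paint[4][3];
    calculatePaintColors59T( 2, base, paint );
    // paint[0] == base[0]                      (C3)
    // paint[1] == base[1] + 11, saturated      (C2), e.g. 60 -> 71
    // paint[2] == base[1]                      (C1)
    // paint[3] == base[1] - 11, clamped at 0   (C4), e.g. 60 -> 49
}
#endif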
243
244
static void calculatePaintColors58H( uint8_t d, uint8_t( colors )[2][3], uint8_t( pColors )[4][3] )
245
{
246
pColors[3][R] = clampMin( 0, colors[1][R] - tableTH[d] );
247
pColors[3][G] = clampMin( 0, colors[1][G] - tableTH[d] );
248
pColors[3][B] = clampMin( 0, colors[1][B] - tableTH[d] );
249
250
// C1
251
pColors[0][R] = clampMax( colors[0][R] + tableTH[d], 255 );
252
pColors[0][G] = clampMax( colors[0][G] + tableTH[d], 255 );
253
pColors[0][B] = clampMax( colors[0][B] + tableTH[d], 255 );
254
// C2
255
pColors[1][R] = clampMin( 0, colors[0][R] - tableTH[d] );
256
pColors[1][G] = clampMin( 0, colors[0][G] - tableTH[d] );
257
pColors[1][B] = clampMin( 0, colors[0][B] - tableTH[d] );
258
// C3
259
pColors[2][R] = clampMax( colors[1][R] + tableTH[d], 255 );
260
pColors[2][G] = clampMax( colors[1][G] + tableTH[d], 255 );
261
pColors[2][B] = clampMax( colors[1][B] + tableTH[d], 255 );
262
}
263
264
#if defined _MSC_VER && !defined __clang__
265
static etcpak_force_inline unsigned long _bit_scan_forward( unsigned long mask )
266
{
267
unsigned long ret;
268
_BitScanForward( &ret, mask );
269
return ret;
270
}
271
#endif
272
273
typedef std::array<uint16_t, 4> v4i;
274
275
#ifdef __AVX2__
276
static etcpak_force_inline __m256i Sum4_AVX2( const uint8_t* data) noexcept
277
{
278
__m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0);
279
__m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1);
280
__m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2);
281
__m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3);
282
283
__m128i dm0 = _mm_and_si128(d0, _mm_set1_epi32(0x00FFFFFF));
284
__m128i dm1 = _mm_and_si128(d1, _mm_set1_epi32(0x00FFFFFF));
285
__m128i dm2 = _mm_and_si128(d2, _mm_set1_epi32(0x00FFFFFF));
286
__m128i dm3 = _mm_and_si128(d3, _mm_set1_epi32(0x00FFFFFF));
287
288
__m256i t0 = _mm256_cvtepu8_epi16(dm0);
289
__m256i t1 = _mm256_cvtepu8_epi16(dm1);
290
__m256i t2 = _mm256_cvtepu8_epi16(dm2);
291
__m256i t3 = _mm256_cvtepu8_epi16(dm3);
292
293
__m256i sum0 = _mm256_add_epi16(t0, t1);
294
__m256i sum1 = _mm256_add_epi16(t2, t3);
295
296
__m256i s0 = _mm256_permute2x128_si256(sum0, sum1, (0) | (3 << 4)); // 0, 0, 3, 3
297
__m256i s1 = _mm256_permute2x128_si256(sum0, sum1, (1) | (2 << 4)); // 1, 1, 2, 2
298
299
__m256i s2 = _mm256_permute4x64_epi64(s0, _MM_SHUFFLE(1, 3, 0, 2));
300
__m256i s3 = _mm256_permute4x64_epi64(s0, _MM_SHUFFLE(0, 2, 1, 3));
301
__m256i s4 = _mm256_permute4x64_epi64(s1, _MM_SHUFFLE(3, 1, 0, 2));
302
__m256i s5 = _mm256_permute4x64_epi64(s1, _MM_SHUFFLE(2, 0, 1, 3));
303
304
__m256i sum5 = _mm256_add_epi16(s2, s3); // 3, 0, 3, 0
305
__m256i sum6 = _mm256_add_epi16(s4, s5); // 2, 1, 1, 2
306
return _mm256_add_epi16(sum5, sum6); // 3+2, 0+1, 3+1, 3+2
307
}
308
309
static etcpak_force_inline __m256i Average_AVX2( const __m256i data) noexcept
310
{
311
__m256i a = _mm256_add_epi16(data, _mm256_set1_epi16(4));
312
313
return _mm256_srli_epi16(a, 3);
314
}
315
316
static etcpak_force_inline __m128i CalcErrorBlock_AVX2( const __m256i data, const v4i a[8]) noexcept
317
{
318
//
319
__m256i a0 = _mm256_load_si256((__m256i*)a[0].data());
320
__m256i a1 = _mm256_load_si256((__m256i*)a[4].data());
321
322
// err = 8 * ( sq( average[0] ) + sq( average[1] ) + sq( average[2] ) );
323
__m256i a4 = _mm256_madd_epi16(a0, a0);
324
__m256i a5 = _mm256_madd_epi16(a1, a1);
325
326
__m256i a6 = _mm256_hadd_epi32(a4, a5);
327
__m256i a7 = _mm256_slli_epi32(a6, 3);
328
329
__m256i a8 = _mm256_add_epi32(a7, _mm256_set1_epi32(0x3FFFFFFF)); // Big value to prevent negative values, but small enough to prevent overflow
330
331
// average is not swapped
332
// err -= block[0] * 2 * average[0];
333
// err -= block[1] * 2 * average[1];
334
// err -= block[2] * 2 * average[2];
335
__m256i a2 = _mm256_slli_epi16(a0, 1);
336
__m256i a3 = _mm256_slli_epi16(a1, 1);
337
__m256i b0 = _mm256_madd_epi16(a2, data);
338
__m256i b1 = _mm256_madd_epi16(a3, data);
339
340
__m256i b2 = _mm256_hadd_epi32(b0, b1);
341
__m256i b3 = _mm256_sub_epi32(a8, b2);
342
__m256i b4 = _mm256_hadd_epi32(b3, b3);
343
344
__m256i b5 = _mm256_permutevar8x32_epi32(b4, _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0));
345
346
return _mm256_castsi256_si128(b5);
347
}
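// The error terms built above come from expanding the sum of squared differences
// against a candidate average a over the n == 8 pixels of a half-block:
//
//     sum_i ( p_i - a )^2  ==  sum_i p_i^2  -  2 * a * sum_i p_i  +  n * a^2
//
// The first term is independent of the candidate, so it can be dropped when the
// errors are only compared against each other; what is accumulated per channel is
// n * a^2 - 2 * a * sum_i p_i, plus the 0x3FFFFFFF bias that keeps the packed
// intermediate values non-negative. The scalar CalcError() further down performs
// the same computation one channel at a time.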
348
349
static etcpak_force_inline void ProcessAverages_AVX2(const __m256i d, v4i a[8] ) noexcept
350
{
351
__m256i t = _mm256_add_epi16(_mm256_mullo_epi16(d, _mm256_set1_epi16(31)), _mm256_set1_epi16(128));
352
353
__m256i c = _mm256_srli_epi16(_mm256_add_epi16(t, _mm256_srli_epi16(t, 8)), 8);
354
355
__m256i c1 = _mm256_shuffle_epi32(c, _MM_SHUFFLE(3, 2, 3, 2));
356
__m256i diff = _mm256_sub_epi16(c, c1);
357
diff = _mm256_max_epi16(diff, _mm256_set1_epi16(-4));
358
diff = _mm256_min_epi16(diff, _mm256_set1_epi16(3));
359
360
__m256i co = _mm256_add_epi16(c1, diff);
361
362
c = _mm256_blend_epi16(co, c, 0xF0);
363
364
__m256i a0 = _mm256_or_si256(_mm256_slli_epi16(c, 3), _mm256_srli_epi16(c, 2));
365
366
_mm256_store_si256((__m256i*)a[4].data(), a0);
367
368
__m256i t0 = _mm256_add_epi16(_mm256_mullo_epi16(d, _mm256_set1_epi16(15)), _mm256_set1_epi16(128));
369
__m256i t1 = _mm256_srli_epi16(_mm256_add_epi16(t0, _mm256_srli_epi16(t0, 8)), 8);
370
371
__m256i t2 = _mm256_or_si256(t1, _mm256_slli_epi16(t1, 4));
372
373
_mm256_store_si256((__m256i*)a[0].data(), t2);
374
}
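// Scalar sketch of the quantization done above (illustrative only, not used by
// the encoder): with t = v * mul + 128, the expression ( t + ( t >> 8 ) ) >> 8 is
// the usual "divide by 255 with rounding" trick, i.e. round( v * mul / 255 ); the
// quantized value is then expanded back to 8 bits by bit replication. The [-4,3]
// clamp on the difference of the two averages matches the 3-bit signed delta of
// the ETC1 differential mode.
#if 0
static uint8_t quantize555ScalarExample( uint8_t v )
{
    const uint16_t t = v * 31 + 128;
    const uint8_t q = uint8_t( ( t + ( t >> 8 ) ) >> 8 );   // round( v * 31 / 255 ), 5 bits
    return uint8_t( ( q << 3 ) | ( q >> 2 ) );              // expand 5 -> 8 bits
}
#endif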
375
376
static etcpak_force_inline uint64_t EncodeAverages_AVX2( const v4i a[8], size_t idx ) noexcept
377
{
378
uint64_t d = ( idx << 24 );
379
size_t base = idx << 1;
380
381
__m128i a0 = _mm_load_si128((const __m128i*)a[base].data());
382
383
__m128i r0, r1;
384
385
if( ( idx & 0x2 ) == 0 )
386
{
387
r0 = _mm_srli_epi16(a0, 4);
388
389
__m128i a1 = _mm_unpackhi_epi64(r0, r0);
390
r1 = _mm_slli_epi16(a1, 4);
391
}
392
else
393
{
394
__m128i a1 = _mm_and_si128(a0, _mm_set1_epi16(-8));
395
396
r0 = _mm_unpackhi_epi64(a1, a1);
397
__m128i a2 = _mm_sub_epi16(a1, r0);
398
__m128i a3 = _mm_srai_epi16(a2, 3);
399
r1 = _mm_and_si128(a3, _mm_set1_epi16(0x07));
400
}
401
402
__m128i r2 = _mm_or_si128(r0, r1);
403
// do missing swap for average values
404
__m128i r3 = _mm_shufflelo_epi16(r2, _MM_SHUFFLE(3, 0, 1, 2));
405
__m128i r4 = _mm_packus_epi16(r3, _mm_setzero_si128());
406
d |= _mm_cvtsi128_si32(r4);
407
408
return d;
409
}
410
411
static etcpak_force_inline uint64_t CheckSolid_AVX2( const uint8_t* src ) noexcept
412
{
413
__m256i d0 = _mm256_loadu_si256(((__m256i*)src) + 0);
414
__m256i d1 = _mm256_loadu_si256(((__m256i*)src) + 1);
415
416
__m256i c = _mm256_broadcastd_epi32(_mm256_castsi256_si128(d0));
417
418
__m256i c0 = _mm256_cmpeq_epi8(d0, c);
419
__m256i c1 = _mm256_cmpeq_epi8(d1, c);
420
421
__m256i m = _mm256_and_si256(c0, c1);
422
423
if (!_mm256_testc_si256(m, _mm256_set1_epi32(-1)))
424
{
425
return 0;
426
}
427
428
return 0x02000000 |
429
( (unsigned int)( src[0] & 0xF8 ) << 16 ) |
430
( (unsigned int)( src[1] & 0xF8 ) << 8 ) |
431
( (unsigned int)( src[2] & 0xF8 ) );
432
}
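// Reading of the early-out above: when all 16 pixels are identical the block is
// emitted directly. In the packing used by this file, 0x02000000 appears to set
// the differential-mode bit (with the flip bit left at 0), the 0xF8 masks keep
// the top five bits of each channel of the first pixel as the shared base color,
// and the deltas, table codewords and selector bits are all left at zero.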
433
434
static etcpak_force_inline __m128i PrepareAverages_AVX2( v4i a[8], const uint8_t* src) noexcept
435
{
436
__m256i sum4 = Sum4_AVX2( src );
437
438
ProcessAverages_AVX2(Average_AVX2( sum4 ), a );
439
440
return CalcErrorBlock_AVX2( sum4, a);
441
}
442
443
static etcpak_force_inline __m128i PrepareAverages_AVX2( v4i a[8], const __m256i sum4) noexcept
444
{
445
ProcessAverages_AVX2(Average_AVX2( sum4 ), a );
446
447
return CalcErrorBlock_AVX2( sum4, a);
448
}
449
450
static etcpak_force_inline void FindBestFit_4x2_AVX2( uint32_t terr[2][8], uint32_t tsel[8], v4i a[8], const uint32_t offset, const uint8_t* data) noexcept
451
{
452
__m256i sel0 = _mm256_setzero_si256();
453
__m256i sel1 = _mm256_setzero_si256();
454
455
for (unsigned int j = 0; j < 2; ++j)
456
{
457
unsigned int bid = offset + 1 - j;
458
459
__m256i squareErrorSum = _mm256_setzero_si256();
460
461
__m128i a0 = _mm_loadl_epi64((const __m128i*)a[bid].data());
462
__m256i a1 = _mm256_broadcastq_epi64(a0);
463
464
// Processing one full row each iteration
465
for (size_t i = 0; i < 8; i += 4)
466
{
467
__m128i rgb = _mm_loadu_si128((const __m128i*)(data + i * 4));
468
469
__m256i rgb16 = _mm256_cvtepu8_epi16(rgb);
470
__m256i d = _mm256_sub_epi16(a1, rgb16);
471
472
// The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16
473
// This produces slightly different results, but is significantly faster
474
__m256i pixel0 = _mm256_madd_epi16(d, _mm256_set_epi16(0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14));
475
__m256i pixel1 = _mm256_packs_epi32(pixel0, pixel0);
476
__m256i pixel2 = _mm256_hadd_epi16(pixel1, pixel1);
477
__m128i pixel3 = _mm256_castsi256_si128(pixel2);
478
479
__m128i pix0 = _mm_broadcastw_epi16(pixel3);
480
__m128i pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16));
481
__m256i pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
482
483
// Processing first two pixels of the row
484
{
485
__m256i pix = _mm256_abs_epi16(pixel);
486
487
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
488
// Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
489
__m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0])));
490
__m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1])));
491
492
__m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1));
493
__m256i minError = _mm256_min_epi16(error0, error1);
494
495
// Exploiting the symmetry of the selector table and using the sign bit
496
// This produces slightly different results, but is significantly faster
497
__m256i minIndex1 = _mm256_srli_epi16(pixel, 15);
498
499
// Interleaving values so madd instruction can be used
500
__m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0));
501
__m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2));
502
503
__m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi);
504
// Squaring the minimum error to produce correct values when adding
505
__m256i squareError = _mm256_madd_epi16(minError2, minError2);
506
507
squareErrorSum = _mm256_add_epi32(squareErrorSum, squareError);
508
509
// Packing selector bits
510
__m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i + j * 8));
511
__m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i + j * 8));
512
513
sel0 = _mm256_or_si256(sel0, minIndexLo2);
514
sel1 = _mm256_or_si256(sel1, minIndexHi2);
515
}
516
517
pixel3 = _mm256_extracti128_si256(pixel2, 1);
518
pix0 = _mm_broadcastw_epi16(pixel3);
519
pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16));
520
pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
521
522
// Processing second two pixels of the row
523
{
524
__m256i pix = _mm256_abs_epi16(pixel);
525
526
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
527
// Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
528
__m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0])));
529
__m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1])));
530
531
__m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1));
532
__m256i minError = _mm256_min_epi16(error0, error1);
533
534
// Exploiting the symmetry of the selector table and using the sign bit
535
__m256i minIndex1 = _mm256_srli_epi16(pixel, 15);
536
537
// Interleaving values so madd instruction can be used
538
__m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0));
539
__m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2));
540
541
__m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi);
542
// Squaring the minimum error to produce correct values when adding
543
__m256i squareError = _mm256_madd_epi16(minError2, minError2);
544
545
squareErrorSum = _mm256_add_epi32(squareErrorSum, squareError);
546
547
// Packing selector bits
548
__m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i + j * 8));
549
__m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i + j * 8));
550
__m256i minIndexLo3 = _mm256_slli_epi16(minIndexLo2, 2);
551
__m256i minIndexHi3 = _mm256_slli_epi16(minIndexHi2, 2);
552
553
sel0 = _mm256_or_si256(sel0, minIndexLo3);
554
sel1 = _mm256_or_si256(sel1, minIndexHi3);
555
}
556
}
557
558
data += 8 * 4;
559
560
_mm256_store_si256((__m256i*)terr[1 - j], squareErrorSum);
561
}
562
563
// Interleave selector bits
564
__m256i minIndexLo0 = _mm256_unpacklo_epi16(sel0, sel1);
565
__m256i minIndexHi0 = _mm256_unpackhi_epi16(sel0, sel1);
566
567
__m256i minIndexLo1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (0) | (2 << 4));
568
__m256i minIndexHi1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (1) | (3 << 4));
569
570
__m256i minIndexHi2 = _mm256_slli_epi32(minIndexHi1, 1);
571
572
__m256i sel = _mm256_or_si256(minIndexLo1, minIndexHi2);
573
574
_mm256_store_si256((__m256i*)tsel, sel);
575
}
576
577
static etcpak_force_inline void FindBestFit_2x4_AVX2( uint32_t terr[2][8], uint32_t tsel[8], v4i a[8], const uint32_t offset, const uint8_t* data) noexcept
578
{
579
__m256i sel0 = _mm256_setzero_si256();
580
__m256i sel1 = _mm256_setzero_si256();
581
582
__m256i squareErrorSum0 = _mm256_setzero_si256();
583
__m256i squareErrorSum1 = _mm256_setzero_si256();
584
585
__m128i a0 = _mm_loadl_epi64((const __m128i*)a[offset + 1].data());
586
__m128i a1 = _mm_loadl_epi64((const __m128i*)a[offset + 0].data());
587
588
__m128i a2 = _mm_broadcastq_epi64(a0);
589
__m128i a3 = _mm_broadcastq_epi64(a1);
590
__m256i a4 = _mm256_insertf128_si256(_mm256_castsi128_si256(a2), a3, 1);
591
592
// Processing one full row each iteration
593
for (size_t i = 0; i < 16; i += 4)
594
{
595
__m128i rgb = _mm_loadu_si128((const __m128i*)(data + i * 4));
596
597
__m256i rgb16 = _mm256_cvtepu8_epi16(rgb);
598
__m256i d = _mm256_sub_epi16(a4, rgb16);
599
600
// The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16
601
// This produces slightly different results, but is significantly faster
602
__m256i pixel0 = _mm256_madd_epi16(d, _mm256_set_epi16(0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14));
603
__m256i pixel1 = _mm256_packs_epi32(pixel0, pixel0);
604
__m256i pixel2 = _mm256_hadd_epi16(pixel1, pixel1);
605
__m128i pixel3 = _mm256_castsi256_si128(pixel2);
606
607
__m128i pix0 = _mm_broadcastw_epi16(pixel3);
608
__m128i pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16));
609
__m256i pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
610
611
// Processing first two pixels of the row
612
{
613
__m256i pix = _mm256_abs_epi16(pixel);
614
615
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
616
// Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
617
__m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0])));
618
__m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1])));
619
620
__m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1));
621
__m256i minError = _mm256_min_epi16(error0, error1);
622
623
// Exploiting the symmetry of the selector table and using the sign bit
624
__m256i minIndex1 = _mm256_srli_epi16(pixel, 15);
625
626
// Interleaving values so madd instruction can be used
627
__m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0));
628
__m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2));
629
630
__m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi);
631
// Squaring the minimum error to produce correct values when adding
632
__m256i squareError = _mm256_madd_epi16(minError2, minError2);
633
634
squareErrorSum0 = _mm256_add_epi32(squareErrorSum0, squareError);
635
636
// Packing selector bits
637
__m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i));
638
__m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i));
639
640
sel0 = _mm256_or_si256(sel0, minIndexLo2);
641
sel1 = _mm256_or_si256(sel1, minIndexHi2);
642
}
643
644
pixel3 = _mm256_extracti128_si256(pixel2, 1);
645
pix0 = _mm_broadcastw_epi16(pixel3);
646
pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16));
647
pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
648
649
// Processing second two pixels of the row
650
{
651
__m256i pix = _mm256_abs_epi16(pixel);
652
653
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
654
// Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
655
__m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0])));
656
__m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1])));
657
658
__m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1));
659
__m256i minError = _mm256_min_epi16(error0, error1);
660
661
// Exploiting the symmetry of the selector table and using the sign bit
662
__m256i minIndex1 = _mm256_srli_epi16(pixel, 15);
663
664
// Interleaving values so madd instruction can be used
665
__m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0));
666
__m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2));
667
668
__m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi);
669
// Squaring the minimum error to produce correct values when adding
670
__m256i squareError = _mm256_madd_epi16(minError2, minError2);
671
672
squareErrorSum1 = _mm256_add_epi32(squareErrorSum1, squareError);
673
674
// Packing selector bits
675
__m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i));
676
__m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i));
677
__m256i minIndexLo3 = _mm256_slli_epi16(minIndexLo2, 2);
678
__m256i minIndexHi3 = _mm256_slli_epi16(minIndexHi2, 2);
679
680
sel0 = _mm256_or_si256(sel0, minIndexLo3);
681
sel1 = _mm256_or_si256(sel1, minIndexHi3);
682
}
683
}
684
685
_mm256_store_si256((__m256i*)terr[1], squareErrorSum0);
686
_mm256_store_si256((__m256i*)terr[0], squareErrorSum1);
687
688
// Interleave selector bits
689
__m256i minIndexLo0 = _mm256_unpacklo_epi16(sel0, sel1);
690
__m256i minIndexHi0 = _mm256_unpackhi_epi16(sel0, sel1);
691
692
__m256i minIndexLo1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (0) | (2 << 4));
693
__m256i minIndexHi1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (1) | (3 << 4));
694
695
__m256i minIndexHi2 = _mm256_slli_epi32(minIndexHi1, 1);
696
697
__m256i sel = _mm256_or_si256(minIndexLo1, minIndexHi2);
698
699
_mm256_store_si256((__m256i*)tsel, sel);
700
}
701
702
static etcpak_force_inline uint64_t EncodeSelectors_AVX2( uint64_t d, const uint32_t terr[2][8], const uint32_t tsel[8], const bool rotate) noexcept
703
{
704
size_t tidx[2];
705
706
// Get index of minimum error (terr[0] and terr[1])
707
__m256i err0 = _mm256_load_si256((const __m256i*)terr[0]);
708
__m256i err1 = _mm256_load_si256((const __m256i*)terr[1]);
709
710
__m256i errLo = _mm256_permute2x128_si256(err0, err1, (0) | (2 << 4));
711
__m256i errHi = _mm256_permute2x128_si256(err0, err1, (1) | (3 << 4));
712
713
__m256i errMin0 = _mm256_min_epu32(errLo, errHi);
714
715
__m256i errMin1 = _mm256_shuffle_epi32(errMin0, _MM_SHUFFLE(2, 3, 0, 1));
716
__m256i errMin2 = _mm256_min_epu32(errMin0, errMin1);
717
718
__m256i errMin3 = _mm256_shuffle_epi32(errMin2, _MM_SHUFFLE(1, 0, 3, 2));
719
__m256i errMin4 = _mm256_min_epu32(errMin3, errMin2);
720
721
__m256i errMin5 = _mm256_permute2x128_si256(errMin4, errMin4, (0) | (0 << 4));
722
__m256i errMin6 = _mm256_permute2x128_si256(errMin4, errMin4, (1) | (1 << 4));
723
724
__m256i errMask0 = _mm256_cmpeq_epi32(errMin5, err0);
725
__m256i errMask1 = _mm256_cmpeq_epi32(errMin6, err1);
726
727
uint32_t mask0 = _mm256_movemask_epi8(errMask0);
728
uint32_t mask1 = _mm256_movemask_epi8(errMask1);
729
730
tidx[0] = _bit_scan_forward(mask0) >> 2;
731
tidx[1] = _bit_scan_forward(mask1) >> 2;
732
733
d |= tidx[0] << 26;
734
d |= tidx[1] << 29;
735
736
unsigned int t0 = tsel[tidx[0]];
737
unsigned int t1 = tsel[tidx[1]];
738
739
if (!rotate)
740
{
741
t0 &= 0xFF00FF00;
742
t1 &= 0x00FF00FF;
743
}
744
else
745
{
746
t0 &= 0xCCCCCCCC;
747
t1 &= 0x33333333;
748
}
749
750
// Flip selectors from sign bit
751
unsigned int t2 = (t0 | t1) ^ 0xFFFF0000;
752
753
return d | static_cast<uint64_t>(_bswap(t2)) << 32;
754
}
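// Scalar equivalent of the min / compare / movemask / bit-scan sequence above
// (illustrative only, not used by the encoder): it finds the index of the smallest
// of the eight accumulated table errors for one half of the block, preferring the
// lowest index on ties, exactly like the forward bit scan.
#if 0
static size_t argMinErrorExample( const uint32_t err[8] )
{
    size_t best = 0;
    for( size_t i = 1; i < 8; i++ )
    {
        if( err[i] < err[best] ) best = i;
    }
    return best;
}
#endif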
755
756
static etcpak_force_inline __m128i r6g7b6_AVX2(__m128 cof, __m128 chf, __m128 cvf) noexcept
757
{
758
__m128i co = _mm_cvttps_epi32(cof);
759
__m128i ch = _mm_cvttps_epi32(chf);
760
__m128i cv = _mm_cvttps_epi32(cvf);
761
762
__m128i coh = _mm_packus_epi32(co, ch);
763
__m128i cv0 = _mm_packus_epi32(cv, _mm_setzero_si128());
764
765
__m256i cohv0 = _mm256_inserti128_si256(_mm256_castsi128_si256(coh), cv0, 1);
766
__m256i cohv1 = _mm256_min_epu16(cohv0, _mm256_set1_epi16(1023));
767
768
__m256i cohv2 = _mm256_sub_epi16(cohv1, _mm256_set1_epi16(15));
769
__m256i cohv3 = _mm256_srai_epi16(cohv2, 1);
770
771
__m256i cohvrb0 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(11));
772
__m256i cohvrb1 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(4));
773
__m256i cohvg0 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(9));
774
__m256i cohvg1 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(6));
775
776
__m256i cohvrb2 = _mm256_srai_epi16(cohvrb0, 7);
777
__m256i cohvrb3 = _mm256_srai_epi16(cohvrb1, 7);
778
__m256i cohvg2 = _mm256_srai_epi16(cohvg0, 8);
779
__m256i cohvg3 = _mm256_srai_epi16(cohvg1, 8);
780
781
__m256i cohvrb4 = _mm256_sub_epi16(cohvrb0, cohvrb2);
782
__m256i cohvrb5 = _mm256_sub_epi16(cohvrb4, cohvrb3);
783
__m256i cohvg4 = _mm256_sub_epi16(cohvg0, cohvg2);
784
__m256i cohvg5 = _mm256_sub_epi16(cohvg4, cohvg3);
785
786
__m256i cohvrb6 = _mm256_srai_epi16(cohvrb5, 3);
787
__m256i cohvg6 = _mm256_srai_epi16(cohvg5, 2);
788
789
__m256i cohv4 = _mm256_blend_epi16(cohvg6, cohvrb6, 0x55);
790
791
__m128i cohv5 = _mm_packus_epi16(_mm256_castsi256_si128(cohv4), _mm256_extracti128_si256(cohv4, 1));
792
return _mm_shuffle_epi8(cohv5, _mm_setr_epi8(6, 5, 4, -1, 2, 1, 0, -1, 10, 9, 8, -1, -1, -1, -1, -1));
793
}
794
795
static etcpak_force_inline Plane Planar_AVX2( const Channels& ch, uint8_t& mode, bool useHeuristics )
796
{
797
__m128i t0 = _mm_sad_epu8( ch.r8, _mm_setzero_si128() );
798
__m128i t1 = _mm_sad_epu8( ch.g8, _mm_setzero_si128() );
799
__m128i t2 = _mm_sad_epu8( ch.b8, _mm_setzero_si128() );
800
801
__m128i r8s = _mm_shuffle_epi8( ch.r8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) );
802
__m128i g8s = _mm_shuffle_epi8( ch.g8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) );
803
__m128i b8s = _mm_shuffle_epi8( ch.b8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) );
804
805
__m128i s0 = _mm_sad_epu8( r8s, _mm_setzero_si128() );
806
__m128i s1 = _mm_sad_epu8( g8s, _mm_setzero_si128() );
807
__m128i s2 = _mm_sad_epu8( b8s, _mm_setzero_si128() );
808
809
__m256i sr0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t0 ), s0, 1 );
810
__m256i sg0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t1 ), s1, 1 );
811
__m256i sb0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t2 ), s2, 1 );
812
813
__m256i sr1 = _mm256_slli_epi64( sr0, 32 );
814
__m256i sg1 = _mm256_slli_epi64( sg0, 16 );
815
816
__m256i srb = _mm256_or_si256( sr1, sb0 );
817
__m256i srgb = _mm256_or_si256( srb, sg1 );
818
819
if( mode != ModePlanar && useHeuristics )
820
{
821
Plane plane;
822
plane.sum4 = _mm256_permute4x64_epi64( srgb, _MM_SHUFFLE( 2, 3, 0, 1 ) );
823
return plane;
824
}
825
826
__m128i t3 = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( t0 ), _mm_castsi128_ps( t1 ), _MM_SHUFFLE( 2, 0, 2, 0 ) ) );
827
__m128i t4 = _mm_shuffle_epi32( t2, _MM_SHUFFLE( 3, 1, 2, 0 ) );
828
__m128i t5 = _mm_hadd_epi32( t3, t4 );
829
__m128i t6 = _mm_shuffle_epi32( t5, _MM_SHUFFLE( 1, 1, 1, 1 ) );
830
__m128i t7 = _mm_shuffle_epi32( t5, _MM_SHUFFLE( 2, 2, 2, 2 ) );
831
832
__m256i sr = _mm256_broadcastw_epi16( t5 );
833
__m256i sg = _mm256_broadcastw_epi16( t6 );
834
__m256i sb = _mm256_broadcastw_epi16( t7 );
835
836
__m256i r08 = _mm256_cvtepu8_epi16( ch.r8 );
837
__m256i g08 = _mm256_cvtepu8_epi16( ch.g8 );
838
__m256i b08 = _mm256_cvtepu8_epi16( ch.b8 );
839
840
__m256i r16 = _mm256_slli_epi16( r08, 4 );
841
__m256i g16 = _mm256_slli_epi16( g08, 4 );
842
__m256i b16 = _mm256_slli_epi16( b08, 4 );
843
844
__m256i difR0 = _mm256_sub_epi16( r16, sr );
845
__m256i difG0 = _mm256_sub_epi16( g16, sg );
846
__m256i difB0 = _mm256_sub_epi16( b16, sb );
847
848
__m256i difRyz = _mm256_madd_epi16( difR0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) );
849
__m256i difGyz = _mm256_madd_epi16( difG0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) );
850
__m256i difByz = _mm256_madd_epi16( difB0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) );
851
852
__m256i difRxz = _mm256_madd_epi16( difR0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) );
853
__m256i difGxz = _mm256_madd_epi16( difG0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) );
854
__m256i difBxz = _mm256_madd_epi16( difB0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) );
855
856
__m256i difRGyz = _mm256_hadd_epi32( difRyz, difGyz );
857
__m256i difByzxz = _mm256_hadd_epi32( difByz, difBxz );
858
859
__m256i difRGxz = _mm256_hadd_epi32( difRxz, difGxz );
860
861
__m128i sumRGyz = _mm_add_epi32( _mm256_castsi256_si128( difRGyz ), _mm256_extracti128_si256( difRGyz, 1 ) );
862
__m128i sumByzxz = _mm_add_epi32( _mm256_castsi256_si128( difByzxz ), _mm256_extracti128_si256( difByzxz, 1 ) );
863
__m128i sumRGxz = _mm_add_epi32( _mm256_castsi256_si128( difRGxz ), _mm256_extracti128_si256( difRGxz, 1 ) );
864
865
__m128i sumRGByz = _mm_hadd_epi32( sumRGyz, sumByzxz );
866
__m128i sumRGByzxz = _mm_hadd_epi32( sumRGxz, sumByzxz );
867
868
__m128i sumRGBxz = _mm_shuffle_epi32( sumRGByzxz, _MM_SHUFFLE( 2, 3, 1, 0 ) );
869
870
__m128 sumRGByzf = _mm_cvtepi32_ps( sumRGByz );
871
__m128 sumRGBxzf = _mm_cvtepi32_ps( sumRGBxz );
872
873
const float value = ( 255 * 255 * 8.0f + 85 * 85 * 8.0f ) * 16.0f;
874
875
__m128 scale = _mm_set1_ps( -4.0f / value );
876
877
__m128 af = _mm_mul_ps( sumRGBxzf, scale );
878
__m128 bf = _mm_mul_ps( sumRGByzf, scale );
879
880
__m128 df = _mm_mul_ps( _mm_cvtepi32_ps( t5 ), _mm_set1_ps( 4.0f / 16.0f ) );
881
882
// calculating the three colors RGBO, RGBH, and RGBV. RGB = df - af * x - bf * y;
883
__m128 cof0 = _mm_fnmadd_ps( af, _mm_set1_ps( -255.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( -255.0f ), df ) );
884
__m128 chf0 = _mm_fnmadd_ps( af, _mm_set1_ps( 425.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( -255.0f ), df ) );
885
__m128 cvf0 = _mm_fnmadd_ps( af, _mm_set1_ps( -255.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( 425.0f ), df ) );
886
887
// convert to r6g7b6
888
__m128i cohv = r6g7b6_AVX2( cof0, chf0, cvf0 );
889
890
uint64_t rgbho = _mm_extract_epi64( cohv, 0 );
891
uint32_t rgbv0 = _mm_extract_epi32( cohv, 2 );
892
893
// Error calculation
894
uint64_t error = 0;
895
if( !useHeuristics )
896
{
897
auto ro0 = ( rgbho >> 48 ) & 0x3F;
898
auto go0 = ( rgbho >> 40 ) & 0x7F;
899
auto bo0 = ( rgbho >> 32 ) & 0x3F;
900
auto ro1 = ( ro0 >> 4 ) | ( ro0 << 2 );
901
auto go1 = ( go0 >> 6 ) | ( go0 << 1 );
902
auto bo1 = ( bo0 >> 4 ) | ( bo0 << 2 );
903
auto ro2 = ( ro1 << 2 ) + 2;
904
auto go2 = ( go1 << 2 ) + 2;
905
auto bo2 = ( bo1 << 2 ) + 2;
906
907
__m256i ro3 = _mm256_set1_epi16( ro2 );
908
__m256i go3 = _mm256_set1_epi16( go2 );
909
__m256i bo3 = _mm256_set1_epi16( bo2 );
910
911
auto rh0 = ( rgbho >> 16 ) & 0x3F;
912
auto gh0 = ( rgbho >> 8 ) & 0x7F;
913
auto bh0 = ( rgbho >> 0 ) & 0x3F;
914
auto rh1 = ( rh0 >> 4 ) | ( rh0 << 2 );
915
auto gh1 = ( gh0 >> 6 ) | ( gh0 << 1 );
916
auto bh1 = ( bh0 >> 4 ) | ( bh0 << 2 );
917
918
auto rh2 = rh1 - ro1;
919
auto gh2 = gh1 - go1;
920
auto bh2 = bh1 - bo1;
921
922
__m256i rh3 = _mm256_set1_epi16( rh2 );
923
__m256i gh3 = _mm256_set1_epi16( gh2 );
924
__m256i bh3 = _mm256_set1_epi16( bh2 );
925
926
auto rv0 = ( rgbv0 >> 16 ) & 0x3F;
927
auto gv0 = ( rgbv0 >> 8 ) & 0x7F;
928
auto bv0 = ( rgbv0 >> 0 ) & 0x3F;
929
auto rv1 = ( rv0 >> 4 ) | ( rv0 << 2 );
930
auto gv1 = ( gv0 >> 6 ) | ( gv0 << 1 );
931
auto bv1 = ( bv0 >> 4 ) | ( bv0 << 2 );
932
933
auto rv2 = rv1 - ro1;
934
auto gv2 = gv1 - go1;
935
auto bv2 = bv1 - bo1;
936
937
__m256i rv3 = _mm256_set1_epi16( rv2 );
938
__m256i gv3 = _mm256_set1_epi16( gv2 );
939
__m256i bv3 = _mm256_set1_epi16( bv2 );
940
941
__m256i x = _mm256_set_epi16( 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 );
942
943
__m256i rh4 = _mm256_mullo_epi16( rh3, x );
944
__m256i gh4 = _mm256_mullo_epi16( gh3, x );
945
__m256i bh4 = _mm256_mullo_epi16( bh3, x );
946
947
__m256i y = _mm256_set_epi16( 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0 );
948
949
__m256i rv4 = _mm256_mullo_epi16( rv3, y );
950
__m256i gv4 = _mm256_mullo_epi16( gv3, y );
951
__m256i bv4 = _mm256_mullo_epi16( bv3, y );
952
953
__m256i rxy = _mm256_add_epi16( rh4, rv4 );
954
__m256i gxy = _mm256_add_epi16( gh4, gv4 );
955
__m256i bxy = _mm256_add_epi16( bh4, bv4 );
956
957
__m256i rp0 = _mm256_add_epi16( rxy, ro3 );
958
__m256i gp0 = _mm256_add_epi16( gxy, go3 );
959
__m256i bp0 = _mm256_add_epi16( bxy, bo3 );
960
961
__m256i rp1 = _mm256_srai_epi16( rp0, 2 );
962
__m256i gp1 = _mm256_srai_epi16( gp0, 2 );
963
__m256i bp1 = _mm256_srai_epi16( bp0, 2 );
964
965
__m256i rp2 = _mm256_max_epi16( _mm256_min_epi16( rp1, _mm256_set1_epi16( 255 ) ), _mm256_setzero_si256() );
966
__m256i gp2 = _mm256_max_epi16( _mm256_min_epi16( gp1, _mm256_set1_epi16( 255 ) ), _mm256_setzero_si256() );
967
__m256i bp2 = _mm256_max_epi16( _mm256_min_epi16( bp1, _mm256_set1_epi16( 255 ) ), _mm256_setzero_si256() );
968
969
__m256i rdif = _mm256_sub_epi16( r08, rp2 );
970
__m256i gdif = _mm256_sub_epi16( g08, gp2 );
971
__m256i bdif = _mm256_sub_epi16( b08, bp2 );
972
973
__m256i rerr = _mm256_mullo_epi16( rdif, _mm256_set1_epi16( 38 ) );
974
__m256i gerr = _mm256_mullo_epi16( gdif, _mm256_set1_epi16( 76 ) );
975
__m256i berr = _mm256_mullo_epi16( bdif, _mm256_set1_epi16( 14 ) );
976
977
__m256i sum0 = _mm256_add_epi16( rerr, gerr );
978
__m256i sum1 = _mm256_add_epi16( sum0, berr );
979
980
__m256i sum2 = _mm256_madd_epi16( sum1, sum1 );
981
982
__m128i sum3 = _mm_add_epi32( _mm256_castsi256_si128( sum2 ), _mm256_extracti128_si256( sum2, 1 ) );
983
984
uint32_t err0 = _mm_extract_epi32( sum3, 0 );
985
uint32_t err1 = _mm_extract_epi32( sum3, 1 );
986
uint32_t err2 = _mm_extract_epi32( sum3, 2 );
987
uint32_t err3 = _mm_extract_epi32( sum3, 3 );
988
989
error = err0 + err1 + err2 + err3;
990
}
991
/**/
992
993
uint32_t rgbv = ( rgbv0 & 0x3F ) | ( ( rgbv0 >> 2 ) & 0x1FC0 ) | ( ( rgbv0 >> 3 ) & 0x7E000 );
994
uint64_t rgbho0_ = ( rgbho & 0x3F0000003F ) | ( ( rgbho >> 2 ) & 0x1FC000001FC0 ) | ( ( rgbho >> 3 ) & 0x7E0000007E000 );
995
uint64_t rgbho0 = ( rgbho0_ & 0x7FFFF ) | ( ( rgbho0_ >> 13 ) & 0x3FFFF80000 );
996
997
uint32_t hi = rgbv | ((rgbho0 & 0x1FFF) << 19);
998
rgbho0 >>= 13;
999
uint32_t lo = ( rgbho0 & 0x1 ) | ( ( rgbho0 & 0x1FE ) << 1 ) | ( ( rgbho0 & 0x600 ) << 2 ) | ( ( rgbho0 & 0x3F800 ) << 5 ) | ( ( rgbho0 & 0x1FC0000 ) << 6 );
1000
1001
uint32_t idx = ( ( rgbho >> 33 ) & 0xF ) | ( ( rgbho >> 41 ) & 0x10 ) | ( ( rgbho >> 48 ) & 0x20 );
1002
lo |= g_flags[idx];
1003
uint64_t result = static_cast<uint32_t>(_bswap(lo));
1004
result |= static_cast<uint64_t>(static_cast<uint32_t>(_bswap(hi))) << 32;
1005
1006
Plane plane;
1007
1008
plane.plane = result;
1009
if( useHeuristics )
1010
{
1011
plane.error = 0;
1012
mode = ModePlanar;
1013
}
1014
else
1015
{
1016
plane.error = error;
1017
}
1018
plane.sum4 = _mm256_permute4x64_epi64(srgb, _MM_SHUFFLE(2, 3, 0, 1));
1019
1020
return plane;
1021
}
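// The error loop above evaluates the planar predictor in the form
//
//     P( x, y ) = clamp8( ( x * ( H - O ) + y * ( V - O ) + 4 * O + 2 ) >> 2 )   per channel,
//
// where O, H and V are the 6/7/6-bit endpoint colors expanded back to 8 bits;
// ro2 = ( ro1 << 2 ) + 2, rh2 = rh1 - ro1, rv2 = rv1 - ro1 and the final shift by
// 2 implement exactly this, and the 38/76/14-weighted sum of the per-channel
// differences is then squared and accumulated as the block error.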
1022
1023
static etcpak_force_inline uint64_t EncodeSelectors_AVX2( uint64_t d, const uint32_t terr[2][8], const uint32_t tsel[8], const bool rotate, const uint64_t value, const uint32_t error) noexcept
1024
{
1025
size_t tidx[2];
1026
1027
// Get index of minimum error (terr[0] and terr[1])
1028
__m256i err0 = _mm256_load_si256((const __m256i*)terr[0]);
1029
__m256i err1 = _mm256_load_si256((const __m256i*)terr[1]);
1030
1031
__m256i errLo = _mm256_permute2x128_si256(err0, err1, (0) | (2 << 4));
1032
__m256i errHi = _mm256_permute2x128_si256(err0, err1, (1) | (3 << 4));
1033
1034
__m256i errMin0 = _mm256_min_epu32(errLo, errHi);
1035
1036
__m256i errMin1 = _mm256_shuffle_epi32(errMin0, _MM_SHUFFLE(2, 3, 0, 1));
1037
__m256i errMin2 = _mm256_min_epu32(errMin0, errMin1);
1038
1039
__m256i errMin3 = _mm256_shuffle_epi32(errMin2, _MM_SHUFFLE(1, 0, 3, 2));
1040
__m256i errMin4 = _mm256_min_epu32(errMin3, errMin2);
1041
1042
__m256i errMin5 = _mm256_permute2x128_si256(errMin4, errMin4, (0) | (0 << 4));
1043
__m256i errMin6 = _mm256_permute2x128_si256(errMin4, errMin4, (1) | (1 << 4));
1044
1045
__m256i errMask0 = _mm256_cmpeq_epi32(errMin5, err0);
1046
__m256i errMask1 = _mm256_cmpeq_epi32(errMin6, err1);
1047
1048
uint32_t mask0 = _mm256_movemask_epi8(errMask0);
1049
uint32_t mask1 = _mm256_movemask_epi8(errMask1);
1050
1051
tidx[0] = _bit_scan_forward(mask0) >> 2;
1052
tidx[1] = _bit_scan_forward(mask1) >> 2;
1053
1054
if ((terr[0][tidx[0]] + terr[1][tidx[1]]) >= error)
1055
{
1056
return value;
1057
}
1058
1059
d |= tidx[0] << 26;
1060
d |= tidx[1] << 29;
1061
1062
unsigned int t0 = tsel[tidx[0]];
1063
unsigned int t1 = tsel[tidx[1]];
1064
1065
if (!rotate)
1066
{
1067
t0 &= 0xFF00FF00;
1068
t1 &= 0x00FF00FF;
1069
}
1070
else
1071
{
1072
t0 &= 0xCCCCCCCC;
1073
t1 &= 0x33333333;
1074
}
1075
1076
// Flip selectors from sign bit
1077
unsigned int t2 = (t0 | t1) ^ 0xFFFF0000;
1078
1079
return d | static_cast<uint64_t>(_bswap(t2)) << 32;
1080
}
1081
1082
#endif
1083
1084
static etcpak_force_inline void Average( const uint8_t* data, v4i* a )
1085
{
1086
#ifdef __SSE4_1__
1087
__m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0);
1088
__m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1);
1089
__m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2);
1090
__m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3);
1091
1092
__m128i d0l = _mm_unpacklo_epi8(d0, _mm_setzero_si128());
1093
__m128i d0h = _mm_unpackhi_epi8(d0, _mm_setzero_si128());
1094
__m128i d1l = _mm_unpacklo_epi8(d1, _mm_setzero_si128());
1095
__m128i d1h = _mm_unpackhi_epi8(d1, _mm_setzero_si128());
1096
__m128i d2l = _mm_unpacklo_epi8(d2, _mm_setzero_si128());
1097
__m128i d2h = _mm_unpackhi_epi8(d2, _mm_setzero_si128());
1098
__m128i d3l = _mm_unpacklo_epi8(d3, _mm_setzero_si128());
1099
__m128i d3h = _mm_unpackhi_epi8(d3, _mm_setzero_si128());
1100
1101
__m128i sum0 = _mm_add_epi16(d0l, d1l);
1102
__m128i sum1 = _mm_add_epi16(d0h, d1h);
1103
__m128i sum2 = _mm_add_epi16(d2l, d3l);
1104
__m128i sum3 = _mm_add_epi16(d2h, d3h);
1105
1106
__m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128());
1107
__m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128());
1108
__m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128());
1109
__m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128());
1110
__m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128());
1111
__m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128());
1112
__m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128());
1113
__m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128());
1114
1115
__m128i b0 = _mm_add_epi32(sum0l, sum0h);
1116
__m128i b1 = _mm_add_epi32(sum1l, sum1h);
1117
__m128i b2 = _mm_add_epi32(sum2l, sum2h);
1118
__m128i b3 = _mm_add_epi32(sum3l, sum3h);
1119
1120
__m128i a0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b2, b3), _mm_set1_epi32(4)), 3);
1121
__m128i a1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b1), _mm_set1_epi32(4)), 3);
1122
__m128i a2 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b1, b3), _mm_set1_epi32(4)), 3);
1123
__m128i a3 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b2), _mm_set1_epi32(4)), 3);
1124
1125
_mm_storeu_si128((__m128i*)&a[0], _mm_packus_epi32(_mm_shuffle_epi32(a0, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a1, _MM_SHUFFLE(3, 0, 1, 2))));
1126
_mm_storeu_si128((__m128i*)&a[2], _mm_packus_epi32(_mm_shuffle_epi32(a2, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a3, _MM_SHUFFLE(3, 0, 1, 2))));
1127
#elif defined __ARM_NEON
1128
uint8x16x2_t t0 = vzipq_u8(vld1q_u8(data + 0), uint8x16_t());
1129
uint8x16x2_t t1 = vzipq_u8(vld1q_u8(data + 16), uint8x16_t());
1130
uint8x16x2_t t2 = vzipq_u8(vld1q_u8(data + 32), uint8x16_t());
1131
uint8x16x2_t t3 = vzipq_u8(vld1q_u8(data + 48), uint8x16_t());
1132
1133
uint16x8x2_t d0 = { vreinterpretq_u16_u8(t0.val[0]), vreinterpretq_u16_u8(t0.val[1]) };
1134
uint16x8x2_t d1 = { vreinterpretq_u16_u8(t1.val[0]), vreinterpretq_u16_u8(t1.val[1]) };
1135
uint16x8x2_t d2 = { vreinterpretq_u16_u8(t2.val[0]), vreinterpretq_u16_u8(t2.val[1]) };
1136
uint16x8x2_t d3 = { vreinterpretq_u16_u8(t3.val[0]), vreinterpretq_u16_u8(t3.val[1]) };
1137
1138
uint16x8x2_t s0 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[0] ), vreinterpretq_s16_u16( d1.val[0] ) ) ), uint16x8_t());
1139
uint16x8x2_t s1 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[1] ), vreinterpretq_s16_u16( d1.val[1] ) ) ), uint16x8_t());
1140
uint16x8x2_t s2 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[0] ), vreinterpretq_s16_u16( d3.val[0] ) ) ), uint16x8_t());
1141
uint16x8x2_t s3 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[1] ), vreinterpretq_s16_u16( d3.val[1] ) ) ), uint16x8_t());
1142
1143
uint32x4x2_t sum0 = { vreinterpretq_u32_u16(s0.val[0]), vreinterpretq_u32_u16(s0.val[1]) };
1144
uint32x4x2_t sum1 = { vreinterpretq_u32_u16(s1.val[0]), vreinterpretq_u32_u16(s1.val[1]) };
1145
uint32x4x2_t sum2 = { vreinterpretq_u32_u16(s2.val[0]), vreinterpretq_u32_u16(s2.val[1]) };
1146
uint32x4x2_t sum3 = { vreinterpretq_u32_u16(s3.val[0]), vreinterpretq_u32_u16(s3.val[1]) };
1147
1148
uint32x4_t b0 = vaddq_u32(sum0.val[0], sum0.val[1]);
1149
uint32x4_t b1 = vaddq_u32(sum1.val[0], sum1.val[1]);
1150
uint32x4_t b2 = vaddq_u32(sum2.val[0], sum2.val[1]);
1151
uint32x4_t b3 = vaddq_u32(sum3.val[0], sum3.val[1]);
1152
1153
uint32x4_t a0 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b2, b3), vdupq_n_u32(4)), 3);
1154
uint32x4_t a1 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b0, b1), vdupq_n_u32(4)), 3);
1155
uint32x4_t a2 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b1, b3), vdupq_n_u32(4)), 3);
1156
uint32x4_t a3 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b0, b2), vdupq_n_u32(4)), 3);
1157
1158
uint16x8_t o0 = vcombine_u16(vqmovun_s32(vreinterpretq_s32_u32( a0 )), vqmovun_s32(vreinterpretq_s32_u32( a1 )));
1159
uint16x8_t o1 = vcombine_u16(vqmovun_s32(vreinterpretq_s32_u32( a2 )), vqmovun_s32(vreinterpretq_s32_u32( a3 )));
1160
1161
a[0] = v4i{o0[2], o0[1], o0[0], 0};
1162
a[1] = v4i{o0[6], o0[5], o0[4], 0};
1163
a[2] = v4i{o1[2], o1[1], o1[0], 0};
1164
a[3] = v4i{o1[6], o1[5], o1[4], 0};
1165
#else
1166
uint32_t r[4];
1167
uint32_t g[4];
1168
uint32_t b[4];
1169
1170
memset(r, 0, sizeof(r));
1171
memset(g, 0, sizeof(g));
1172
memset(b, 0, sizeof(b));
1173
1174
for( int j=0; j<4; j++ )
1175
{
1176
for( int i=0; i<4; i++ )
1177
{
1178
int index = (j & 2) + (i >> 1);
1179
b[index] += *data++;
1180
g[index] += *data++;
1181
r[index] += *data++;
1182
data++;
1183
}
1184
}
1185
1186
a[0] = v4i{ uint16_t( (r[2] + r[3] + 4) / 8 ), uint16_t( (g[2] + g[3] + 4) / 8 ), uint16_t( (b[2] + b[3] + 4) / 8 ), 0};
1187
a[1] = v4i{ uint16_t( (r[0] + r[1] + 4) / 8 ), uint16_t( (g[0] + g[1] + 4) / 8 ), uint16_t( (b[0] + b[1] + 4) / 8 ), 0};
1188
a[2] = v4i{ uint16_t( (r[1] + r[3] + 4) / 8 ), uint16_t( (g[1] + g[3] + 4) / 8 ), uint16_t( (b[1] + b[3] + 4) / 8 ), 0};
1189
a[3] = v4i{ uint16_t( (r[0] + r[2] + 4) / 8 ), uint16_t( (g[0] + g[2] + 4) / 8 ), uint16_t( (b[0] + b[2] + 4) / 8 ), 0};
1190
#endif
1191
}
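// In the scalar path above, ( j & 2 ) + ( i >> 1 ) splits the 4x4 block into four
// 2x2 quadrants (0|1 on top, 2|3 on the bottom). a[0] and a[1] are then the
// averages of the bottom and top 8-pixel halves, a[2] and a[3] those of the right
// and left halves, each divided by 8 with +4 for rounding; these are the candidate
// base colors for the two possible ETC1 sub-block orientations.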
1192
1193
static etcpak_force_inline void CalcErrorBlock( const uint8_t* data, unsigned int err[4][4] )
1194
{
1195
#ifdef __SSE4_1__
1196
__m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0);
1197
__m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1);
1198
__m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2);
1199
__m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3);
1200
1201
__m128i dm0 = _mm_and_si128(d0, _mm_set1_epi32(0x00FFFFFF));
1202
__m128i dm1 = _mm_and_si128(d1, _mm_set1_epi32(0x00FFFFFF));
1203
__m128i dm2 = _mm_and_si128(d2, _mm_set1_epi32(0x00FFFFFF));
1204
__m128i dm3 = _mm_and_si128(d3, _mm_set1_epi32(0x00FFFFFF));
1205
1206
__m128i d0l = _mm_unpacklo_epi8(dm0, _mm_setzero_si128());
1207
__m128i d0h = _mm_unpackhi_epi8(dm0, _mm_setzero_si128());
1208
__m128i d1l = _mm_unpacklo_epi8(dm1, _mm_setzero_si128());
1209
__m128i d1h = _mm_unpackhi_epi8(dm1, _mm_setzero_si128());
1210
__m128i d2l = _mm_unpacklo_epi8(dm2, _mm_setzero_si128());
1211
__m128i d2h = _mm_unpackhi_epi8(dm2, _mm_setzero_si128());
1212
__m128i d3l = _mm_unpacklo_epi8(dm3, _mm_setzero_si128());
1213
__m128i d3h = _mm_unpackhi_epi8(dm3, _mm_setzero_si128());
1214
1215
__m128i sum0 = _mm_add_epi16(d0l, d1l);
1216
__m128i sum1 = _mm_add_epi16(d0h, d1h);
1217
__m128i sum2 = _mm_add_epi16(d2l, d3l);
1218
__m128i sum3 = _mm_add_epi16(d2h, d3h);
1219
1220
__m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128());
1221
__m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128());
1222
__m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128());
1223
__m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128());
1224
__m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128());
1225
__m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128());
1226
__m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128());
1227
__m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128());
1228
1229
__m128i b0 = _mm_add_epi32(sum0l, sum0h);
1230
__m128i b1 = _mm_add_epi32(sum1l, sum1h);
1231
__m128i b2 = _mm_add_epi32(sum2l, sum2h);
1232
__m128i b3 = _mm_add_epi32(sum3l, sum3h);
1233
1234
__m128i a0 = _mm_add_epi32(b2, b3);
1235
__m128i a1 = _mm_add_epi32(b0, b1);
1236
__m128i a2 = _mm_add_epi32(b1, b3);
1237
__m128i a3 = _mm_add_epi32(b0, b2);
1238
1239
_mm_storeu_si128((__m128i*)&err[0], a0);
1240
_mm_storeu_si128((__m128i*)&err[1], a1);
1241
_mm_storeu_si128((__m128i*)&err[2], a2);
1242
_mm_storeu_si128((__m128i*)&err[3], a3);
1243
#elif defined __ARM_NEON
1244
uint8x16x2_t t0 = vzipq_u8(vld1q_u8(data + 0), uint8x16_t());
1245
uint8x16x2_t t1 = vzipq_u8(vld1q_u8(data + 16), uint8x16_t());
1246
uint8x16x2_t t2 = vzipq_u8(vld1q_u8(data + 32), uint8x16_t());
1247
uint8x16x2_t t3 = vzipq_u8(vld1q_u8(data + 48), uint8x16_t());
1248
1249
uint16x8x2_t d0 = { vreinterpretq_u16_u8(t0.val[0]), vreinterpretq_u16_u8(t0.val[1]) };
1250
uint16x8x2_t d1 = { vreinterpretq_u16_u8(t1.val[0]), vreinterpretq_u16_u8(t1.val[1]) };
1251
uint16x8x2_t d2 = { vreinterpretq_u16_u8(t2.val[0]), vreinterpretq_u16_u8(t2.val[1]) };
1252
uint16x8x2_t d3 = { vreinterpretq_u16_u8(t3.val[0]), vreinterpretq_u16_u8(t3.val[1]) };
1253
1254
uint16x8x2_t s0 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[0] ), vreinterpretq_s16_u16( d1.val[0] ))), uint16x8_t());
1255
uint16x8x2_t s1 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[1] ), vreinterpretq_s16_u16( d1.val[1] ))), uint16x8_t());
1256
uint16x8x2_t s2 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[0] ), vreinterpretq_s16_u16( d3.val[0] ))), uint16x8_t());
1257
uint16x8x2_t s3 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[1] ), vreinterpretq_s16_u16( d3.val[1] ))), uint16x8_t());
1258
1259
uint32x4x2_t sum0 = { vreinterpretq_u32_u16(s0.val[0]), vreinterpretq_u32_u16(s0.val[1]) };
1260
uint32x4x2_t sum1 = { vreinterpretq_u32_u16(s1.val[0]), vreinterpretq_u32_u16(s1.val[1]) };
1261
uint32x4x2_t sum2 = { vreinterpretq_u32_u16(s2.val[0]), vreinterpretq_u32_u16(s2.val[1]) };
1262
uint32x4x2_t sum3 = { vreinterpretq_u32_u16(s3.val[0]), vreinterpretq_u32_u16(s3.val[1]) };
1263
1264
uint32x4_t b0 = vaddq_u32(sum0.val[0], sum0.val[1]);
1265
uint32x4_t b1 = vaddq_u32(sum1.val[0], sum1.val[1]);
1266
uint32x4_t b2 = vaddq_u32(sum2.val[0], sum2.val[1]);
1267
uint32x4_t b3 = vaddq_u32(sum3.val[0], sum3.val[1]);
1268
1269
uint32x4_t a0 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b2, b3) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) );
1270
uint32x4_t a1 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b0, b1) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) );
1271
uint32x4_t a2 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b1, b3) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) );
1272
uint32x4_t a3 = vreinterpretq_u32_u8( vandq_u8(vreinterpretq_u8_u32( vqaddq_u32(b0, b2) ), vreinterpretq_u8_u32( vdupq_n_u32(0x00FFFFFF)) ) );
1273
1274
vst1q_u32(err[0], a0);
1275
vst1q_u32(err[1], a1);
1276
vst1q_u32(err[2], a2);
1277
vst1q_u32(err[3], a3);
1278
#else
1279
unsigned int terr[4][4];
1280
1281
memset(terr, 0, 16 * sizeof(unsigned int));
1282
1283
for( int j=0; j<4; j++ )
1284
{
1285
for( int i=0; i<4; i++ )
1286
{
1287
int index = (j & 2) + (i >> 1);
1288
unsigned int d = *data++;
1289
terr[index][0] += d;
1290
d = *data++;
1291
terr[index][1] += d;
1292
d = *data++;
1293
terr[index][2] += d;
1294
data++;
1295
}
1296
}
1297
1298
for( int i=0; i<3; i++ )
1299
{
1300
err[0][i] = terr[2][i] + terr[3][i];
1301
err[1][i] = terr[0][i] + terr[1][i];
1302
err[2][i] = terr[1][i] + terr[3][i];
1303
err[3][i] = terr[0][i] + terr[2][i];
1304
}
1305
for( int i=0; i<4; i++ )
1306
{
1307
err[i][3] = 0;
1308
}
1309
#endif
1310
}
1311
1312
static etcpak_force_inline unsigned int CalcError( const unsigned int block[4], const v4i& average )
1313
{
1314
unsigned int err = 0x3FFFFFFF; // Big value to prevent negative values, but small enough to prevent overflow
1315
err -= block[0] * 2 * average[2];
1316
err -= block[1] * 2 * average[1];
1317
err -= block[2] * 2 * average[0];
1318
err += 8 * ( sq( average[0] ) + sq( average[1] ) + sq( average[2] ) );
1319
return err;
1320
}
1321
1322
static etcpak_force_inline void ProcessAverages( v4i* a )
1323
{
1324
#ifdef __SSE4_1__
1325
for( int i=0; i<2; i++ )
1326
{
1327
__m128i d = _mm_loadu_si128((__m128i*)a[i*2].data());
1328
1329
__m128i t = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(31)), _mm_set1_epi16(128));
1330
1331
__m128i c = _mm_srli_epi16(_mm_add_epi16(t, _mm_srli_epi16(t, 8)), 8);
1332
1333
__m128i c1 = _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 2, 3, 2));
1334
__m128i diff = _mm_sub_epi16(c, c1);
1335
diff = _mm_max_epi16(diff, _mm_set1_epi16(-4));
1336
diff = _mm_min_epi16(diff, _mm_set1_epi16(3));
1337
1338
__m128i co = _mm_add_epi16(c1, diff);
1339
1340
c = _mm_blend_epi16(co, c, 0xF0);
1341
1342
__m128i a0 = _mm_or_si128(_mm_slli_epi16(c, 3), _mm_srli_epi16(c, 2));
1343
1344
_mm_storeu_si128((__m128i*)a[4+i*2].data(), a0);
1345
}
1346
1347
for( int i=0; i<2; i++ )
1348
{
1349
__m128i d = _mm_loadu_si128((__m128i*)a[i*2].data());
1350
1351
__m128i t0 = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(15)), _mm_set1_epi16(128));
1352
__m128i t1 = _mm_srli_epi16(_mm_add_epi16(t0, _mm_srli_epi16(t0, 8)), 8);
1353
1354
__m128i t2 = _mm_or_si128(t1, _mm_slli_epi16(t1, 4));
1355
1356
_mm_storeu_si128((__m128i*)a[i*2].data(), t2);
1357
}
1358
#elif defined __ARM_NEON
1359
for( int i=0; i<2; i++ )
1360
{
1361
int16x8_t d = vld1q_s16((int16_t*)&a[i*2]);
1362
int16x8_t t = vaddq_s16(vmulq_s16(d, vdupq_n_s16(31)), vdupq_n_s16(128));
1363
int16x8_t c = vshrq_n_s16(vaddq_s16(t, vshrq_n_s16(t, 8)), 8);
1364
1365
int16x8_t c1 = vcombine_s16(vget_high_s16(c), vget_high_s16(c));
1366
int16x8_t diff = vsubq_s16(c, c1);
1367
diff = vmaxq_s16(diff, vdupq_n_s16(-4));
1368
diff = vminq_s16(diff, vdupq_n_s16(3));
1369
1370
int16x8_t co = vaddq_s16(c1, diff);
1371
1372
c = vcombine_s16(vget_low_s16(co), vget_high_s16(c));
1373
1374
int16x8_t a0 = vorrq_s16(vshlq_n_s16(c, 3), vshrq_n_s16(c, 2));
1375
1376
vst1q_s16((int16_t*)&a[4+i*2], a0);
1377
}
1378
1379
for( int i=0; i<2; i++ )
1380
{
1381
int16x8_t d = vld1q_s16((int16_t*)&a[i*2]);
1382
1383
int16x8_t t0 = vaddq_s16(vmulq_s16(d, vdupq_n_s16(15)), vdupq_n_s16(128));
1384
int16x8_t t1 = vshrq_n_s16(vaddq_s16(t0, vshrq_n_s16(t0, 8)), 8);
1385
1386
int16x8_t t2 = vorrq_s16(t1, vshlq_n_s16(t1, 4));
1387
1388
vst1q_s16((int16_t*)&a[i*2], t2);
1389
}
1390
#else
1391
for( int i=0; i<2; i++ )
1392
{
1393
for( int j=0; j<3; j++ )
1394
{
1395
int32_t c1 = mul8bit( a[i*2+1][j], 31 );
1396
int32_t c2 = mul8bit( a[i*2][j], 31 );
1397
1398
int32_t diff = c2 - c1;
1399
if( diff > 3 ) diff = 3;
1400
else if( diff < -4 ) diff = -4;
1401
1402
int32_t co = c1 + diff;
1403
1404
a[5+i*2][j] = ( c1 << 3 ) | ( c1 >> 2 );
1405
a[4+i*2][j] = ( co << 3 ) | ( co >> 2 );
1406
}
1407
}
1408
1409
for( int i=0; i<4; i++ )
1410
{
1411
a[i][0] = g_avg2[mul8bit( a[i][0], 15 )];
1412
a[i][1] = g_avg2[mul8bit( a[i][1], 15 )];
1413
a[i][2] = g_avg2[mul8bit( a[i][2], 15 )];
1414
}
1415
#endif
1416
}
1417
1418
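// Note: idx is the index of the winning error configuration and is written straight into the block
// header via (idx << 24); bit 0 appears to be the flip bit (split orientation) and bit 1 the
// differential-mode bit. In individual mode the two 4-bit base colors share each channel byte as
// nibbles; in differential mode the 5-bit color derived from a[base+1] goes into the top five bits
// of each channel byte and a 3-bit two's-complement delta towards a[base+0] into the low three bits
// (the `c &= ~0xFFFFFFF8` keeps just those 3 bits).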
static etcpak_force_inline void EncodeAverages( uint64_t& _d, const v4i* a, size_t idx )
1419
{
1420
auto d = _d;
1421
d |= ( idx << 24 );
1422
size_t base = idx << 1;
1423
1424
if( ( idx & 0x2 ) == 0 )
1425
{
1426
for( int i=0; i<3; i++ )
1427
{
1428
d |= uint64_t( a[base+0][i] >> 4 ) << ( i*8 );
1429
d |= uint64_t( a[base+1][i] >> 4 ) << ( i*8 + 4 );
1430
}
1431
}
1432
else
1433
{
1434
for( int i=0; i<3; i++ )
1435
{
1436
d |= uint64_t( a[base+1][i] & 0xF8 ) << ( i*8 );
1437
int32_t c = ( ( a[base+0][i] & 0xF8 ) - ( a[base+1][i] & 0xF8 ) ) >> 3;
1438
c &= ~0xFFFFFFF8;
1439
d |= ((uint64_t)c) << ( i*8 );
1440
}
1441
}
1442
_d = d;
1443
}
1444
1445
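// Note: a uniform block can be emitted immediately. The value returned below sets the
// differential-mode bit (0x02000000), stores the first pixel truncated to 5 bits per channel as the
// base color, and leaves the delta and all selector bits at zero. A return value of 0 means the
// block is not uniform and the full search has to run.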
static etcpak_force_inline uint64_t CheckSolid( const uint8_t* src )
1446
{
1447
#ifdef __SSE4_1__
1448
__m128i d0 = _mm_loadu_si128(((__m128i*)src) + 0);
1449
__m128i d1 = _mm_loadu_si128(((__m128i*)src) + 1);
1450
__m128i d2 = _mm_loadu_si128(((__m128i*)src) + 2);
1451
__m128i d3 = _mm_loadu_si128(((__m128i*)src) + 3);
1452
1453
__m128i c = _mm_shuffle_epi32(d0, _MM_SHUFFLE(0, 0, 0, 0));
1454
1455
__m128i c0 = _mm_cmpeq_epi8(d0, c);
1456
__m128i c1 = _mm_cmpeq_epi8(d1, c);
1457
__m128i c2 = _mm_cmpeq_epi8(d2, c);
1458
__m128i c3 = _mm_cmpeq_epi8(d3, c);
1459
1460
__m128i m0 = _mm_and_si128(c0, c1);
1461
__m128i m1 = _mm_and_si128(c2, c3);
1462
__m128i m = _mm_and_si128(m0, m1);
1463
1464
if (!_mm_testc_si128(m, _mm_set1_epi32(-1)))
1465
{
1466
return 0;
1467
}
1468
#elif defined __ARM_NEON
1469
int32x4_t d0 = vld1q_s32((int32_t*)src + 0);
1470
int32x4_t d1 = vld1q_s32((int32_t*)src + 4);
1471
int32x4_t d2 = vld1q_s32((int32_t*)src + 8);
1472
int32x4_t d3 = vld1q_s32((int32_t*)src + 12);
1473
1474
int32x4_t c = vdupq_n_s32(d0[0]);
1475
1476
int32x4_t c0 = vreinterpretq_s32_u32(vceqq_s32(d0, c));
1477
int32x4_t c1 = vreinterpretq_s32_u32(vceqq_s32(d1, c));
1478
int32x4_t c2 = vreinterpretq_s32_u32(vceqq_s32(d2, c));
1479
int32x4_t c3 = vreinterpretq_s32_u32(vceqq_s32(d3, c));
1480
1481
int32x4_t m0 = vandq_s32(c0, c1);
1482
int32x4_t m1 = vandq_s32(c2, c3);
1483
int64x2_t m = vreinterpretq_s64_s32(vandq_s32(m0, m1));
1484
1485
if (m[0] != -1 || m[1] != -1)
1486
{
1487
return 0;
1488
}
1489
#else
1490
const uint8_t* ptr = src + 4;
1491
for( int i=1; i<16; i++ )
1492
{
1493
if( memcmp( src, ptr, 4 ) != 0 )
1494
{
1495
return 0;
1496
}
1497
ptr += 4;
1498
}
1499
#endif
1500
return 0x02000000 |
1501
( (unsigned int)( src[0] & 0xF8 ) << 16 ) |
1502
( (unsigned int)( src[1] & 0xF8 ) << 8 ) |
1503
( (unsigned int)( src[2] & 0xF8 ) );
1504
}
1505
1506
static etcpak_force_inline void PrepareAverages( v4i a[8], const uint8_t* src, unsigned int err[4] )
1507
{
1508
Average( src, a );
1509
ProcessAverages( a );
1510
1511
unsigned int errblock[4][4];
1512
CalcErrorBlock( src, errblock );
1513
1514
for( int i=0; i<4; i++ )
1515
{
1516
err[i/2] += CalcError( errblock[i], a[i] );
1517
err[2+i/2] += CalcError( errblock[i], a[i+4] );
1518
}
1519
}
1520
1521
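// Note: for every pixel the error against each candidate modifier is measured along the luma axis
// only: dr*77 + dg*151 + db*28 projects the RGB difference onto luma (the weights sum to 256). For
// each of the 8 ETC1 modifier tables the best 2-bit selector is stored in tsel[pixel][table] and the
// squared luma error is accumulated per subblock in terr[subblock][table]; EncodeSelectors later
// picks the table with the smallest total and emits the matching selectors.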
static etcpak_force_inline void FindBestFit( uint64_t terr[2][8], uint16_t tsel[16][8], v4i a[8], const uint32_t* id, const uint8_t* data )
1522
{
1523
for( size_t i=0; i<16; i++ )
1524
{
1525
uint16_t* sel = tsel[i];
1526
unsigned int bid = id[i];
1527
uint64_t* ter = terr[bid%2];
1528
1529
uint8_t b = *data++;
1530
uint8_t g = *data++;
1531
uint8_t r = *data++;
1532
data++;
1533
1534
int dr = a[bid][0] - r;
1535
int dg = a[bid][1] - g;
1536
int db = a[bid][2] - b;
1537
1538
#ifdef __SSE4_1__
1539
// Reference implementation
1540
1541
__m128i pix = _mm_set1_epi32(dr * 77 + dg * 151 + db * 28);
1542
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
1543
__m128i error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[0]));
1544
__m128i error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[1]));
1545
__m128i error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[0]));
1546
__m128i error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[1]));
1547
1548
__m128i index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1));
1549
__m128i minError0 = _mm_min_epi32(error0, error1);
1550
1551
__m128i index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2));
1552
__m128i minError1 = _mm_min_epi32(error2, error3);
1553
1554
__m128i minIndex0 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0));
1555
__m128i minError = _mm_min_epi32(minError0, minError1);
1556
1557
// Squaring the minimum error to produce correct values when adding
1558
__m128i minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0));
1559
__m128i squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow);
1560
squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0));
1561
_mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow);
1562
__m128i minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2));
1563
__m128i squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh);
1564
squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1));
1565
_mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh);
1566
1567
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
1568
error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[2]));
1569
error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[3]));
1570
error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[2]));
1571
error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[3]));
1572
1573
index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1));
1574
minError0 = _mm_min_epi32(error0, error1);
1575
1576
index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2));
1577
minError1 = _mm_min_epi32(error2, error3);
1578
1579
__m128i minIndex1 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0));
1580
minError = _mm_min_epi32(minError0, minError1);
1581
1582
// Squaring the minimum error to produce correct values when adding
1583
minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0));
1584
squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow);
1585
squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 2));
1586
_mm_storeu_si128(((__m128i*)ter) + 2, squareErrorLow);
1587
minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2));
1588
squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh);
1589
squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 3));
1590
_mm_storeu_si128(((__m128i*)ter) + 3, squareErrorHigh);
1591
__m128i minIndex = _mm_packs_epi32(minIndex0, minIndex1);
1592
_mm_storeu_si128((__m128i*)sel, minIndex);
1593
#elif defined __ARM_NEON
1594
int32x4_t pix = vdupq_n_s32(dr * 77 + dg * 151 + db * 28);
1595
1596
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
1597
uint32x4_t error0 = vreinterpretq_u32_s32(vabsq_s32(vaddq_s32(pix, g_table256_NEON[0])));
1598
uint32x4_t error1 = vreinterpretq_u32_s32(vabsq_s32(vaddq_s32(pix, g_table256_NEON[1])));
1599
uint32x4_t error2 = vreinterpretq_u32_s32(vabsq_s32(vsubq_s32(pix, g_table256_NEON[0])));
1600
uint32x4_t error3 = vreinterpretq_u32_s32(vabsq_s32(vsubq_s32(pix, g_table256_NEON[1])));
1601
1602
uint32x4_t index0 = vandq_u32(vcltq_u32(error1, error0), vdupq_n_u32(1));
1603
uint32x4_t minError0 = vminq_u32(error0, error1);
1604
1605
uint32x4_t index1 = vreinterpretq_u32_s32(vsubq_s32(vdupq_n_s32(2), vreinterpretq_s32_u32(vcltq_u32(error3, error2))));
1606
uint32x4_t minError1 = vminq_u32(error2, error3);
1607
1608
uint32x4_t blendMask = vcltq_u32(minError1, minError0);
1609
uint32x4_t minIndex0 = vorrq_u32(vbicq_u32(index0, blendMask), vandq_u32(index1, blendMask));
1610
uint32x4_t minError = vminq_u32(minError0, minError1);
1611
1612
// Squaring the minimum error to produce correct values when adding
1613
uint32x4_t squareErrorLow = vmulq_u32(minError, minError);
1614
uint32x4_t squareErrorHigh = vshrq_n_u32(vreinterpretq_u32_s32(vqdmulhq_s32(vreinterpretq_s32_u32(minError), vreinterpretq_s32_u32(minError))), 1);
1615
uint32x4x2_t squareErrorZip = vzipq_u32(squareErrorLow, squareErrorHigh);
1616
uint64x2x2_t squareError = { vreinterpretq_u64_u32(squareErrorZip.val[0]), vreinterpretq_u64_u32(squareErrorZip.val[1]) };
1617
squareError.val[0] = vaddq_u64(squareError.val[0], vld1q_u64(ter + 0));
1618
squareError.val[1] = vaddq_u64(squareError.val[1], vld1q_u64(ter + 2));
1619
vst1q_u64(ter + 0, squareError.val[0]);
1620
vst1q_u64(ter + 2, squareError.val[1]);
1621
1622
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
1623
error0 = vreinterpretq_u32_s32( vabsq_s32(vaddq_s32(pix, g_table256_NEON[2])));
1624
error1 = vreinterpretq_u32_s32( vabsq_s32(vaddq_s32(pix, g_table256_NEON[3])));
1625
error2 = vreinterpretq_u32_s32( vabsq_s32(vsubq_s32(pix, g_table256_NEON[2])));
1626
error3 = vreinterpretq_u32_s32( vabsq_s32(vsubq_s32(pix, g_table256_NEON[3])));
1627
1628
index0 = vandq_u32(vcltq_u32(error1, error0), vdupq_n_u32(1));
1629
minError0 = vminq_u32(error0, error1);
1630
1631
index1 = vreinterpretq_u32_s32( vsubq_s32(vdupq_n_s32(2), vreinterpretq_s32_u32(vcltq_u32(error3, error2))) );
1632
minError1 = vminq_u32(error2, error3);
1633
1634
blendMask = vcltq_u32(minError1, minError0);
1635
uint32x4_t minIndex1 = vorrq_u32(vbicq_u32(index0, blendMask), vandq_u32(index1, blendMask));
1636
minError = vminq_u32(minError0, minError1);
1637
1638
// Squaring the minimum error to produce correct values when adding
1639
squareErrorLow = vmulq_u32(minError, minError);
1640
squareErrorHigh = vshrq_n_u32(vreinterpretq_u32_s32( vqdmulhq_s32(vreinterpretq_s32_u32(minError), vreinterpretq_s32_u32(minError)) ), 1 );
1641
squareErrorZip = vzipq_u32(squareErrorLow, squareErrorHigh);
1642
squareError.val[0] = vaddq_u64(vreinterpretq_u64_u32( squareErrorZip.val[0] ), vld1q_u64(ter + 4));
1643
squareError.val[1] = vaddq_u64(vreinterpretq_u64_u32( squareErrorZip.val[1] ), vld1q_u64(ter + 6));
1644
vst1q_u64(ter + 4, squareError.val[0]);
1645
vst1q_u64(ter + 6, squareError.val[1]);
1646
1647
uint16x8_t minIndex = vcombine_u16(vqmovn_u32(minIndex0), vqmovn_u32(minIndex1));
1648
vst1q_u16(sel, minIndex);
1649
#else
1650
int pix = dr * 77 + dg * 151 + db * 28;
1651
1652
for( int t=0; t<8; t++ )
1653
{
1654
const int64_t* tab = g_table256[t];
1655
unsigned int idx = 0;
1656
uint64_t err = sq( tab[0] + pix );
1657
for( int j=1; j<4; j++ )
1658
{
1659
uint64_t local = sq( tab[j] + pix );
1660
if( local < err )
1661
{
1662
err = local;
1663
idx = j;
1664
}
1665
}
1666
*sel++ = idx;
1667
*ter++ += err;
1668
}
1669
#endif
1670
}
1671
}
1672
1673
#if defined __SSE4_1__ || defined __ARM_NEON
1674
// Non-reference implementation, but faster. Produces the same results as the AVX2 version
1675
static etcpak_force_inline void FindBestFit( uint32_t terr[2][8], uint16_t tsel[16][8], v4i a[8], const uint32_t* id, const uint8_t* data )
1676
{
1677
for( size_t i=0; i<16; i++ )
1678
{
1679
uint16_t* sel = tsel[i];
1680
unsigned int bid = id[i];
1681
uint32_t* ter = terr[bid%2];
1682
1683
uint8_t b = *data++;
1684
uint8_t g = *data++;
1685
uint8_t r = *data++;
1686
data++;
1687
1688
int dr = a[bid][0] - r;
1689
int dg = a[bid][1] - g;
1690
int db = a[bid][2] - b;
1691
1692
#ifdef __SSE4_1__
1693
// The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16
1694
// This produces slightly different results, but is significantly faster
1695
__m128i pixel = _mm_set1_epi16(dr * 38 + dg * 76 + db * 14);
1696
__m128i pix = _mm_abs_epi16(pixel);
1697
1698
// Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
1699
// Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
1700
__m128i error0 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[0]));
1701
__m128i error1 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[1]));
1702
1703
__m128i index = _mm_and_si128(_mm_cmplt_epi16(error1, error0), _mm_set1_epi16(1));
1704
__m128i minError = _mm_min_epi16(error0, error1);
1705
1706
// Exploit the symmetry of the selector table and use the sign bit
1707
// This produces slightly different results, but is needed to match the AVX2 implementation
1708
__m128i indexBit = _mm_andnot_si128(_mm_srli_epi16(pixel, 15), _mm_set1_epi8(-1));
1709
__m128i minIndex = _mm_or_si128(index, _mm_add_epi16(indexBit, indexBit));
1710
1711
// Squaring the minimum error to produce correct values when adding
1712
__m128i squareErrorLo = _mm_mullo_epi16(minError, minError);
1713
__m128i squareErrorHi = _mm_mulhi_epi16(minError, minError);
1714
1715
__m128i squareErrorLow = _mm_unpacklo_epi16(squareErrorLo, squareErrorHi);
1716
__m128i squareErrorHigh = _mm_unpackhi_epi16(squareErrorLo, squareErrorHi);
1717
1718
squareErrorLow = _mm_add_epi32(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0));
1719
_mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow);
1720
squareErrorHigh = _mm_add_epi32(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1));
1721
_mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh);
1722
1723
_mm_storeu_si128((__m128i*)sel, minIndex);
1724
#elif defined __ARM_NEON
1725
int16x8_t pixel = vdupq_n_s16( dr * 38 + dg * 76 + db * 14 );
1726
int16x8_t pix = vabsq_s16( pixel );
1727
1728
int16x8_t error0 = vabsq_s16( vsubq_s16( pix, g_table128_NEON[0] ) );
1729
int16x8_t error1 = vabsq_s16( vsubq_s16( pix, g_table128_NEON[1] ) );
1730
1731
int16x8_t index = vandq_s16( vreinterpretq_s16_u16( vcltq_s16( error1, error0 ) ), vdupq_n_s16( 1 ) );
1732
int16x8_t minError = vminq_s16( error0, error1 );
1733
1734
int16x8_t indexBit = vandq_s16( vmvnq_s16( vshrq_n_s16( pixel, 15 ) ), vdupq_n_s16( -1 ) );
1735
int16x8_t minIndex = vorrq_s16( index, vaddq_s16( indexBit, indexBit ) );
1736
1737
int16x4_t minErrorLow = vget_low_s16( minError );
1738
int16x4_t minErrorHigh = vget_high_s16( minError );
1739
1740
int32x4_t squareErrorLow = vmull_s16( minErrorLow, minErrorLow );
1741
int32x4_t squareErrorHigh = vmull_s16( minErrorHigh, minErrorHigh );
1742
1743
int32x4_t squareErrorSumLow = vaddq_s32( squareErrorLow, vld1q_s32( (int32_t*)ter ) );
1744
int32x4_t squareErrorSumHigh = vaddq_s32( squareErrorHigh, vld1q_s32( (int32_t*)ter + 4 ) );
1745
1746
vst1q_s32( (int32_t*)ter, squareErrorSumLow );
1747
vst1q_s32( (int32_t*)ter + 4, squareErrorSumHigh );
1748
1749
vst1q_s16( (int16_t*)sel, minIndex );
1750
#endif
1751
}
1752
}
1753
#endif
1754
1755
static etcpak_force_inline uint8_t convert6(float f)
1756
{
1757
int i = (std::min(std::max(static_cast<int>(f), 0), 1023) - 15) >> 1;
1758
return (i + 11 - ((i + 11) >> 7) - ((i + 4) >> 7)) >> 3;
1759
}
1760
1761
static etcpak_force_inline uint8_t convert7(float f)
1762
{
1763
int i = (std::min(std::max(static_cast<int>(f), 0), 1023) - 15) >> 1;
1764
return (i + 9 - ((i + 9) >> 8) - ((i + 6) >> 8)) >> 2;
1765
}
1766
1767
static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar( const uint8_t* src, const uint8_t mode, bool useHeuristics )
1768
{
1769
int32_t r = 0;
1770
int32_t g = 0;
1771
int32_t b = 0;
1772
1773
for( int i = 0; i < 16; ++i )
1774
{
1775
b += src[i * 4 + 0];
1776
g += src[i * 4 + 1];
1777
r += src[i * 4 + 2];
1778
}
1779
1780
int32_t difRyz = 0;
1781
int32_t difGyz = 0;
1782
int32_t difByz = 0;
1783
int32_t difRxz = 0;
1784
int32_t difGxz = 0;
1785
int32_t difBxz = 0;
1786
1787
const int32_t scaling[] = { -255, -85, 85, 255 };
1788
1789
for (int i = 0; i < 16; ++i)
1790
{
1791
int32_t difB = (static_cast<int>(src[i * 4 + 0]) << 4) - b;
1792
int32_t difG = (static_cast<int>(src[i * 4 + 1]) << 4) - g;
1793
int32_t difR = (static_cast<int>(src[i * 4 + 2]) << 4) - r;
1794
1795
difRyz += difR * scaling[i % 4];
1796
difGyz += difG * scaling[i % 4];
1797
difByz += difB * scaling[i % 4];
1798
1799
difRxz += difR * scaling[i / 4];
1800
difGxz += difG * scaling[i / 4];
1801
difBxz += difB * scaling[i / 4];
1802
}
1803
1804
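// Note: this is a per-channel least-squares fit of a plane to the block. scaling[] takes each of
// +-255 and +-85 four times along either axis, so the sum of squared weights over the 16 pixels is
// 8*(255^2 + 85^2); the extra /16 in `scale` compensates for difR/difG/difB above being 16 times the
// centered pixel values. The fitted colors are kept at roughly 4x 8-bit precision (hence
// dR = r * 4/16 below and the 0..1023 clamp inside convert6/convert7).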
const float scale = -4.0f / ((255 * 255 * 8.0f + 85 * 85 * 8.0f) * 16.0f);
1805
1806
float aR = difRxz * scale;
1807
float aG = difGxz * scale;
1808
float aB = difBxz * scale;
1809
1810
float bR = difRyz * scale;
1811
float bG = difGyz * scale;
1812
float bB = difByz * scale;
1813
1814
float dR = r * (4.0f / 16.0f);
1815
float dG = g * (4.0f / 16.0f);
1816
float dB = b * (4.0f / 16.0f);
1817
1818
// calculating the three colors RGBO, RGBH, and RGBV. RGB = df - af * x - bf * y;
1819
float cofR = std::fma(aR, 255.0f, std::fma(bR, 255.0f, dR));
1820
float cofG = std::fma(aG, 255.0f, std::fma(bG, 255.0f, dG));
1821
float cofB = std::fma(aB, 255.0f, std::fma(bB, 255.0f, dB));
1822
float chfR = std::fma(aR, -425.0f, std::fma(bR, 255.0f, dR));
1823
float chfG = std::fma(aG, -425.0f, std::fma(bG, 255.0f, dG));
1824
float chfB = std::fma(aB, -425.0f, std::fma(bB, 255.0f, dB));
1825
float cvfR = std::fma(aR, 255.0f, std::fma(bR, -425.0f, dR));
1826
float cvfG = std::fma(aG, 255.0f, std::fma(bG, -425.0f, dG));
1827
float cvfB = std::fma(aB, 255.0f, std::fma(bB, -425.0f, dB));
1828
1829
// convert to r6g7b6
1830
int32_t coR = convert6(cofR);
1831
int32_t coG = convert7(cofG);
1832
int32_t coB = convert6(cofB);
1833
int32_t chR = convert6(chfR);
1834
int32_t chG = convert7(chfG);
1835
int32_t chB = convert6(chfB);
1836
int32_t cvR = convert6(cvfR);
1837
int32_t cvG = convert7(cvfG);
1838
int32_t cvB = convert6(cvfB);
1839
1840
// Error calculation
1841
uint64_t error = 0;
1842
if( ModePlanar != mode && useHeuristics )
1843
{
1844
auto ro0 = coR;
1845
auto go0 = coG;
1846
auto bo0 = coB;
1847
auto ro1 = ( ro0 >> 4 ) | ( ro0 << 2 );
1848
auto go1 = ( go0 >> 6 ) | ( go0 << 1 );
1849
auto bo1 = ( bo0 >> 4 ) | ( bo0 << 2 );
1850
auto ro2 = ( ro1 << 2 ) + 2;
1851
auto go2 = ( go1 << 2 ) + 2;
1852
auto bo2 = ( bo1 << 2 ) + 2;
1853
1854
auto rh0 = chR;
1855
auto gh0 = chG;
1856
auto bh0 = chB;
1857
auto rh1 = ( rh0 >> 4 ) | ( rh0 << 2 );
1858
auto gh1 = ( gh0 >> 6 ) | ( gh0 << 1 );
1859
auto bh1 = ( bh0 >> 4 ) | ( bh0 << 2 );
1860
1861
auto rh2 = rh1 - ro1;
1862
auto gh2 = gh1 - go1;
1863
auto bh2 = bh1 - bo1;
1864
1865
auto rv0 = cvR;
1866
auto gv0 = cvG;
1867
auto bv0 = cvB;
1868
auto rv1 = ( rv0 >> 4 ) | ( rv0 << 2 );
1869
auto gv1 = ( gv0 >> 6 ) | ( gv0 << 1 );
1870
auto bv1 = ( bv0 >> 4 ) | ( bv0 << 2 );
1871
1872
auto rv2 = rv1 - ro1;
1873
auto gv2 = gv1 - go1;
1874
auto bv2 = bv1 - bo1;
1875
for( int i = 0; i < 16; ++i )
1876
{
1877
int32_t cR = clampu8( ( rh2 * ( i / 4 ) + rv2 * ( i % 4 ) + ro2 ) >> 2 );
1878
int32_t cG = clampu8( ( gh2 * ( i / 4 ) + gv2 * ( i % 4 ) + go2 ) >> 2 );
1879
int32_t cB = clampu8( ( bh2 * ( i / 4 ) + bv2 * ( i % 4 ) + bo2 ) >> 2 );
1880
1881
int32_t difB = static_cast<int>( src[i * 4 + 0] ) - cB;
1882
int32_t difG = static_cast<int>( src[i * 4 + 1] ) - cG;
1883
int32_t difR = static_cast<int>( src[i * 4 + 2] ) - cR;
1884
1885
int32_t dif = difR * 38 + difG * 76 + difB * 14;
1886
1887
error += dif * dif;
1888
}
1889
}
1890
1891
/**/
1892
uint32_t rgbv = cvB | ( cvG << 6 ) | ( cvR << 13 );
1893
uint32_t rgbh = chB | ( chG << 6 ) | ( chR << 13 );
1894
uint32_t hi = rgbv | ( ( rgbh & 0x1FFF ) << 19 );
1895
uint32_t lo = ( chR & 0x1 ) | 0x2 | ( ( chR << 1 ) & 0x7C );
1896
lo |= ( ( coB & 0x07 ) << 7 ) | ( ( coB & 0x18 ) << 8 ) | ( ( coB & 0x20 ) << 11 );
1897
lo |= ( ( coG & 0x3F ) << 17 ) | ( ( coG & 0x40 ) << 18 );
1898
lo |= coR << 25;
1899
1900
const auto idx = ( coR & 0x20 ) | ( ( coG & 0x20 ) >> 1 ) | ( ( coB & 0x1E ) >> 1 );
1901
1902
lo |= g_flags[idx];
1903
1904
uint64_t result = static_cast<uint32_t>( _bswap( lo ) );
1905
result |= static_cast<uint64_t>( static_cast<uint32_t>( _bswap( hi ) ) ) << 32;
1906
1907
return std::make_pair( result, error );
1908
}
1909
1910
#ifdef __ARM_NEON
1911
1912
static etcpak_force_inline int32x2_t Planar_NEON_DifXZ( int16x8_t dif_lo, int16x8_t dif_hi )
1913
{
1914
int32x4_t dif0 = vmull_n_s16( vget_low_s16( dif_lo ), -255 );
1915
int32x4_t dif1 = vmull_n_s16( vget_high_s16( dif_lo ), -85 );
1916
int32x4_t dif2 = vmull_n_s16( vget_low_s16( dif_hi ), 85 );
1917
int32x4_t dif3 = vmull_n_s16( vget_high_s16( dif_hi ), 255 );
1918
int32x4_t dif4 = vaddq_s32( vaddq_s32( dif0, dif1 ), vaddq_s32( dif2, dif3 ) );
1919
1920
#ifndef __aarch64__
1921
int32x2_t dif5 = vpadd_s32( vget_low_s32( dif4 ), vget_high_s32( dif4 ) );
1922
return vpadd_s32( dif5, dif5 );
1923
#else
1924
return vdup_n_s32( vaddvq_s32( dif4 ) );
1925
#endif
1926
}
1927
1928
static etcpak_force_inline int32x2_t Planar_NEON_DifYZ( int16x8_t dif_lo, int16x8_t dif_hi )
1929
{
1930
int16x4_t scaling = { -255, -85, 85, 255 };
1931
int32x4_t dif0 = vmull_s16( vget_low_s16( dif_lo ), scaling );
1932
int32x4_t dif1 = vmull_s16( vget_high_s16( dif_lo ), scaling );
1933
int32x4_t dif2 = vmull_s16( vget_low_s16( dif_hi ), scaling );
1934
int32x4_t dif3 = vmull_s16( vget_high_s16( dif_hi ), scaling );
1935
int32x4_t dif4 = vaddq_s32( vaddq_s32( dif0, dif1 ), vaddq_s32( dif2, dif3 ) );
1936
1937
#ifndef __aarch64__
1938
int32x2_t dif5 = vpadd_s32( vget_low_s32( dif4 ), vget_high_s32( dif4 ) );
1939
return vpadd_s32( dif5, dif5 );
1940
#else
1941
return vdup_n_s32( vaddvq_s32( dif4 ) );
1942
#endif
1943
}
1944
1945
static etcpak_force_inline int16x8_t Planar_NEON_SumWide( uint8x16_t src )
1946
{
1947
uint16x8_t accu8 = vpaddlq_u8( src );
1948
#ifndef __aarch64__
1949
uint16x4_t accu4 = vpadd_u16( vget_low_u16( accu8 ), vget_high_u16( accu8 ) );
1950
uint16x4_t accu2 = vpadd_u16( accu4, accu4 );
1951
uint16x4_t accu1 = vpadd_u16( accu2, accu2 );
1952
return vreinterpretq_s16_u16( vcombine_u16( accu1, accu1 ) );
1953
#else
1954
return vdupq_n_s16( vaddvq_u16( accu8 ) );
1955
#endif
1956
}
1957
1958
static etcpak_force_inline int16x8_t convert6_NEON( int32x4_t lo, int32x4_t hi )
1959
{
1960
uint16x8_t x = vcombine_u16( vqmovun_s32( lo ), vqmovun_s32( hi ) );
1961
int16x8_t i = vreinterpretq_s16_u16( vshrq_n_u16( vqshlq_n_u16( x, 6 ), 6) ); // clamp 0-1023
1962
i = vhsubq_s16( i, vdupq_n_s16( 15 ) );
1963
1964
int16x8_t ip11 = vaddq_s16( i, vdupq_n_s16( 11 ) );
1965
int16x8_t ip4 = vaddq_s16( i, vdupq_n_s16( 4 ) );
1966
1967
return vshrq_n_s16( vsubq_s16( vsubq_s16( ip11, vshrq_n_s16( ip11, 7 ) ), vshrq_n_s16( ip4, 7) ), 3 );
1968
}
1969
1970
static etcpak_force_inline int16x4_t convert7_NEON( int32x4_t x )
1971
{
1972
int16x4_t i = vreinterpret_s16_u16( vshr_n_u16( vqshl_n_u16( vqmovun_s32( x ), 6 ), 6 ) ); // clamp 0-1023
1973
i = vhsub_s16( i, vdup_n_s16( 15 ) );
1974
1975
int16x4_t p9 = vadd_s16( i, vdup_n_s16( 9 ) );
1976
int16x4_t p6 = vadd_s16( i, vdup_n_s16( 6 ) );
1977
return vshr_n_s16( vsub_s16( vsub_s16( p9, vshr_n_s16( p9, 8 ) ), vshr_n_s16( p6, 8 ) ), 2 );
1978
}
1979
1980
static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar_NEON( const uint8_t* src, const uint8_t mode, bool useHeuristics )
1981
{
1982
uint8x16x4_t srcBlock = vld4q_u8( src );
1983
1984
int16x8_t bSumWide = Planar_NEON_SumWide( srcBlock.val[0] );
1985
int16x8_t gSumWide = Planar_NEON_SumWide( srcBlock.val[1] );
1986
int16x8_t rSumWide = Planar_NEON_SumWide( srcBlock.val[2] );
1987
1988
int16x8_t dif_R_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[2] ), 4) ), rSumWide );
1989
int16x8_t dif_R_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[2] ), 4) ), rSumWide );
1990
1991
int16x8_t dif_G_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[1] ), 4 ) ), gSumWide );
1992
int16x8_t dif_G_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[1] ), 4 ) ), gSumWide );
1993
1994
int16x8_t dif_B_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[0] ), 4) ), bSumWide );
1995
int16x8_t dif_B_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[0] ), 4) ), bSumWide );
1996
1997
int32x2x2_t dif_xz_z = vzip_s32( vzip_s32( Planar_NEON_DifXZ( dif_B_lo, dif_B_hi ), Planar_NEON_DifXZ( dif_R_lo, dif_R_hi ) ).val[0], Planar_NEON_DifXZ( dif_G_lo, dif_G_hi ) );
1998
int32x4_t dif_xz = vcombine_s32( dif_xz_z.val[0], dif_xz_z.val[1] );
1999
int32x2x2_t dif_yz_z = vzip_s32( vzip_s32( Planar_NEON_DifYZ( dif_B_lo, dif_B_hi ), Planar_NEON_DifYZ( dif_R_lo, dif_R_hi ) ).val[0], Planar_NEON_DifYZ( dif_G_lo, dif_G_hi ) );
2000
int32x4_t dif_yz = vcombine_s32( dif_yz_z.val[0], dif_yz_z.val[1] );
2001
2002
const float fscale = -4.0f / ( (255 * 255 * 8.0f + 85 * 85 * 8.0f ) * 16.0f );
2003
float32x4_t fa = vmulq_n_f32( vcvtq_f32_s32( dif_xz ), fscale );
2004
float32x4_t fb = vmulq_n_f32( vcvtq_f32_s32( dif_yz ), fscale );
2005
int16x4_t bgrgSum = vzip_s16( vzip_s16( vget_low_s16( bSumWide ), vget_low_s16( rSumWide ) ).val[0], vget_low_s16( gSumWide ) ).val[0];
2006
float32x4_t fd = vmulq_n_f32( vcvtq_f32_s32( vmovl_s16( bgrgSum ) ), 4.0f / 16.0f);
2007
2008
float32x4_t cof = vmlaq_n_f32( vmlaq_n_f32( fd, fb, 255.0f ), fa, 255.0f );
2009
float32x4_t chf = vmlaq_n_f32( vmlaq_n_f32( fd, fb, 255.0f ), fa, -425.0f );
2010
float32x4_t cvf = vmlaq_n_f32( vmlaq_n_f32( fd, fb, -425.0f ), fa, 255.0f );
2011
2012
int32x4_t coi = vcvtq_s32_f32( cof );
2013
int32x4_t chi = vcvtq_s32_f32( chf );
2014
int32x4_t cvi = vcvtq_s32_f32( cvf );
2015
2016
int32x4x2_t tr_hv = vtrnq_s32( chi, cvi );
2017
int32x4x2_t tr_o = vtrnq_s32( coi, coi );
2018
2019
int16x8_t c_hvoo_br_6 = convert6_NEON( tr_hv.val[0], tr_o.val[0] );
2020
int16x4_t c_hvox_g_7 = convert7_NEON( vcombine_s32( vget_low_s32( tr_hv.val[1] ), vget_low_s32( tr_o.val[1] ) ) );
2021
int16x8_t c_hvoo_br_8 = vorrq_s16( vshrq_n_s16( c_hvoo_br_6, 4 ), vshlq_n_s16( c_hvoo_br_6, 2 ) );
2022
int16x4_t c_hvox_g_8 = vorr_s16( vshr_n_s16( c_hvox_g_7, 6 ), vshl_n_s16( c_hvox_g_7, 1 ) );
2023
2024
uint64_t error = 0;
2025
if( mode != ModePlanar && useHeuristics )
2026
{
2027
int16x4_t rec_gxbr_o = vext_s16( c_hvox_g_8, vget_high_s16( c_hvoo_br_8 ), 3 );
2028
2029
rec_gxbr_o = vadd_s16( vshl_n_s16( rec_gxbr_o, 2 ), vdup_n_s16( 2 ) );
2030
int16x8_t rec_ro_wide = vdupq_lane_s16( rec_gxbr_o, 3 );
2031
int16x8_t rec_go_wide = vdupq_lane_s16( rec_gxbr_o, 0 );
2032
int16x8_t rec_bo_wide = vdupq_lane_s16( rec_gxbr_o, 1 );
2033
2034
int16x4_t br_hv2 = vsub_s16( vget_low_s16( c_hvoo_br_8 ), vget_high_s16( c_hvoo_br_8 ) );
2035
int16x4_t gg_hv2 = vsub_s16( c_hvox_g_8, vdup_lane_s16( c_hvox_g_8, 2 ) );
2036
2037
int16x8_t scaleh_lo = { 0, 0, 0, 0, 1, 1, 1, 1 };
2038
int16x8_t scaleh_hi = { 2, 2, 2, 2, 3, 3, 3, 3 };
2039
int16x8_t scalev = { 0, 1, 2, 3, 0, 1, 2, 3 };
2040
2041
int16x8_t rec_r_1 = vmlaq_lane_s16( rec_ro_wide, scalev, br_hv2, 3 );
2042
int16x8_t rec_r_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_r_1, scaleh_lo, br_hv2, 2 ), 2 ) ) );
2043
int16x8_t rec_r_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_r_1, scaleh_hi, br_hv2, 2 ), 2 ) ) );
2044
2045
int16x8_t rec_b_1 = vmlaq_lane_s16( rec_bo_wide, scalev, br_hv2, 1 );
2046
int16x8_t rec_b_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_b_1, scaleh_lo, br_hv2, 0 ), 2 ) ) );
2047
int16x8_t rec_b_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_b_1, scaleh_hi, br_hv2, 0 ), 2 ) ) );
2048
2049
int16x8_t rec_g_1 = vmlaq_lane_s16( rec_go_wide, scalev, gg_hv2, 1 );
2050
int16x8_t rec_g_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_g_1, scaleh_lo, gg_hv2, 0 ), 2 ) ) );
2051
int16x8_t rec_g_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_g_1, scaleh_hi, gg_hv2, 0 ), 2 ) ) );
2052
2053
int16x8_t dif_r_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[2] ) ) ), rec_r_lo );
2054
int16x8_t dif_r_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[2] ) ) ), rec_r_hi );
2055
2056
int16x8_t dif_g_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[1] ) ) ), rec_g_lo );
2057
int16x8_t dif_g_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[1] ) ) ), rec_g_hi );
2058
2059
int16x8_t dif_b_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[0] ) ) ), rec_b_lo );
2060
int16x8_t dif_b_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[0] ) ) ), rec_b_hi );
2061
2062
int16x8_t dif_lo = vmlaq_n_s16( vmlaq_n_s16( vmulq_n_s16( dif_r_lo, 38 ), dif_g_lo, 76 ), dif_b_lo, 14 );
2063
int16x8_t dif_hi = vmlaq_n_s16( vmlaq_n_s16( vmulq_n_s16( dif_r_hi, 38 ), dif_g_hi, 76 ), dif_b_hi, 14 );
2064
2065
int16x4_t tmpDif = vget_low_s16( dif_lo );
2066
int32x4_t difsq_0 = vmull_s16( tmpDif, tmpDif );
2067
tmpDif = vget_high_s16( dif_lo );
2068
int32x4_t difsq_1 = vmull_s16( tmpDif, tmpDif );
2069
tmpDif = vget_low_s16( dif_hi );
2070
int32x4_t difsq_2 = vmull_s16( tmpDif, tmpDif );
2071
tmpDif = vget_high_s16( dif_hi );
2072
int32x4_t difsq_3 = vmull_s16( tmpDif, tmpDif );
2073
2074
uint32x4_t difsq_5 = vaddq_u32( vreinterpretq_u32_s32( difsq_0 ), vreinterpretq_u32_s32( difsq_1 ) );
2075
uint32x4_t difsq_6 = vaddq_u32( vreinterpretq_u32_s32( difsq_2 ), vreinterpretq_u32_s32( difsq_3 ) );
2076
2077
uint64x2_t difsq_7 = vaddl_u32( vget_low_u32( difsq_5 ), vget_high_u32( difsq_5 ) );
2078
uint64x2_t difsq_8 = vaddl_u32( vget_low_u32( difsq_6 ), vget_high_u32( difsq_6 ) );
2079
2080
uint64x2_t difsq_9 = vaddq_u64( difsq_7, difsq_8 );
2081
2082
#ifdef __aarch64__
2083
error = vaddvq_u64( difsq_9 );
2084
#else
2085
error = vgetq_lane_u64( difsq_9, 0 ) + vgetq_lane_u64( difsq_9, 1 );
2086
#endif
2087
}
2088
2089
int32_t coR = c_hvoo_br_6[6];
2090
int32_t coG = c_hvox_g_7[2];
2091
int32_t coB = c_hvoo_br_6[4];
2092
2093
int32_t chR = c_hvoo_br_6[2];
2094
int32_t chG = c_hvox_g_7[0];
2095
int32_t chB = c_hvoo_br_6[0];
2096
2097
int32_t cvR = c_hvoo_br_6[3];
2098
int32_t cvG = c_hvox_g_7[1];
2099
int32_t cvB = c_hvoo_br_6[1];
2100
2101
uint32_t rgbv = cvB | ( cvG << 6 ) | ( cvR << 13 );
2102
uint32_t rgbh = chB | ( chG << 6 ) | ( chR << 13 );
2103
uint32_t hi = rgbv | ( ( rgbh & 0x1FFF ) << 19 );
2104
uint32_t lo = ( chR & 0x1 ) | 0x2 | ( ( chR << 1 ) & 0x7C );
2105
lo |= ( ( coB & 0x07 ) << 7 ) | ( ( coB & 0x18 ) << 8 ) | ( ( coB & 0x20 ) << 11 );
2106
lo |= ( ( coG & 0x3F) << 17) | ( (coG & 0x40 ) << 18 );
2107
lo |= coR << 25;
2108
2109
const auto idx = ( coR & 0x20 ) | ( ( coG & 0x20 ) >> 1 ) | ( ( coB & 0x1E ) >> 1 );
2110
2111
lo |= g_flags[idx];
2112
2113
uint64_t result = static_cast<uint32_t>( _bswap(lo) );
2114
result |= static_cast<uint64_t>( static_cast<uint32_t>( _bswap( hi ) ) ) << 32;
2115
2116
return std::make_pair( result, error );
2117
}
2118
2119
#endif
2120
2121
#ifdef __AVX2__
2122
uint32_t calculateErrorTH( bool tMode, uint8_t( colorsRGB444 )[2][3], uint8_t& dist, uint32_t& pixIndices, uint8_t startDist, __m128i r8, __m128i g8, __m128i b8 )
2123
#else
2124
uint32_t calculateErrorTH( bool tMode, uint8_t* src, uint8_t( colorsRGB444 )[2][3], uint8_t& dist, uint32_t& pixIndices, uint8_t startDist )
2125
#endif
2126
{
2127
uint32_t blockErr = 0, bestBlockErr = MaxError;
2128
2129
uint32_t pixColors;
2130
uint8_t possibleColors[4][3];
2131
uint8_t colors[2][3];
2132
2133
decompressColor( colorsRGB444, colors );
2134
2135
#ifdef __AVX2__
2136
__m128i reverseMask = _mm_set_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 );
2137
#endif
2138
2139
// test distances
2140
for( uint8_t d = startDist; d < 8; ++d )
2141
{
2142
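// Stop early once the distance picked two candidates ago is still the best one, i.e. the previous
// candidate distance brought no improvement; larger distances are then unlikely to help.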
if( d >= 2 && dist == d - 2 ) break;
2143
2144
blockErr = 0;
2145
pixColors = 0;
2146
2147
if( tMode )
2148
{
2149
calculatePaintColors59T( d, colors, possibleColors );
2150
}
2151
else
2152
{
2153
calculatePaintColors58H( d, colors, possibleColors );
2154
}
2155
2156
#ifdef __AVX2__
2157
// RGB ordering
2158
__m128i b8Rev = _mm_shuffle_epi8( b8, reverseMask );
2159
__m128i g8Rev = _mm_shuffle_epi8( g8, reverseMask );
2160
__m128i r8Rev = _mm_shuffle_epi8( r8, reverseMask );
2161
2162
// extends 3x128-bit RGB into 3x256-bit RGB for error comparisons
2163
static const __m128i zero = _mm_setzero_si128();
2164
__m128i b8Lo = _mm_unpacklo_epi8( b8Rev, zero );
2165
__m128i g8Lo = _mm_unpacklo_epi8( g8Rev, zero );
2166
__m128i r8Lo = _mm_unpacklo_epi8( r8Rev, zero );
2167
__m128i b8Hi = _mm_unpackhi_epi8( b8Rev, zero );
2168
__m128i g8Hi = _mm_unpackhi_epi8( g8Rev, zero );
2169
__m128i r8Hi = _mm_unpackhi_epi8( r8Rev, zero );
2170
2171
__m256i b8 = _mm256_set_m128i( b8Hi, b8Lo );
2172
__m256i g8 = _mm256_set_m128i( g8Hi, g8Lo );
2173
__m256i r8 = _mm256_set_m128i( r8Hi, r8Lo );
2174
2175
// calculates differences between the pixel colors and the palette colors
2176
__m256i diffb = _mm256_abs_epi16( _mm256_sub_epi16( b8, _mm256_set1_epi16( possibleColors[0][B] ) ) );
2177
__m256i diffg = _mm256_abs_epi16( _mm256_sub_epi16( g8, _mm256_set1_epi16( possibleColors[0][G] ) ) );
2178
__m256i diffr = _mm256_abs_epi16( _mm256_sub_epi16( r8, _mm256_set1_epi16( possibleColors[0][R] ) ) );
2179
2180
// luma-based error calculations
2181
static const __m256i bWeight = _mm256_set1_epi16( 14 );
2182
static const __m256i gWeight = _mm256_set1_epi16( 76 );
2183
static const __m256i rWeight = _mm256_set1_epi16( 38 );
2184
2185
diffb = _mm256_mullo_epi16( diffb, bWeight );
2186
diffg = _mm256_mullo_epi16( diffg, gWeight );
2187
diffr = _mm256_mullo_epi16( diffr, rWeight );
2188
2189
// obtains the error with the current palette color
2190
__m256i lowestPixErr = _mm256_add_epi16( _mm256_add_epi16( diffb, diffg ), diffr );
2191
2192
// error calculations with the remaining three palette colors
2193
static const uint32_t masks[4] = { 0, 0x55555555, 0xAAAAAAAA, 0xFFFFFFFF };
2194
for( uint8_t c = 1; c < 4; c++ )
2195
{
2196
__m256i diffb = _mm256_abs_epi16( _mm256_sub_epi16( b8, _mm256_set1_epi16( possibleColors[c][B] ) ) );
2197
__m256i diffg = _mm256_abs_epi16( _mm256_sub_epi16( g8, _mm256_set1_epi16( possibleColors[c][G] ) ) );
2198
__m256i diffr = _mm256_abs_epi16( _mm256_sub_epi16( r8, _mm256_set1_epi16( possibleColors[c][R] ) ) );
2199
2200
diffb = _mm256_mullo_epi16( diffb, bWeight );
2201
diffg = _mm256_mullo_epi16( diffg, gWeight );
2202
diffr = _mm256_mullo_epi16( diffr, rWeight );
2203
2204
// error comparison with the previous best color
2205
__m256i pixErrors = _mm256_add_epi16( _mm256_add_epi16( diffb, diffg ), diffr );
2206
__m256i minErr = _mm256_min_epu16( lowestPixErr, pixErrors );
2207
__m256i cmpRes = _mm256_cmpeq_epi16( pixErrors, minErr );
2208
lowestPixErr = minErr;
2209
2210
// update pixel colors
2211
uint32_t updPixColors = _mm256_movemask_epi8( cmpRes );
2212
uint32_t prevPixColors = pixColors & ~updPixColors;
2213
uint32_t mskPixColors = masks[c] & updPixColors;
2214
pixColors = prevPixColors | mskPixColors;
2215
}
2216
2217
// accumulate the block error
2218
alignas( 32 ) uint16_t pixErr16[16] = { 0, };
2219
_mm256_storeu_si256( (__m256i*)pixErr16, lowestPixErr );
2220
for( uint8_t p = 0; p < 16; p++ )
2221
{
2222
blockErr += (int)( pixErr16[p] ) * pixErr16[p];
2223
}
2224
#else
2225
for( size_t y = 0; y < 4; ++y )
2226
{
2227
for( size_t x = 0; x < 4; ++x )
2228
{
2229
uint32_t bestPixErr = MaxError;
2230
pixColors <<= 2; // Make room for next value
2231
2232
// Loop possible block colors
2233
for( uint8_t c = 0; c < 4; ++c )
2234
{
2235
int diff[3];
2236
diff[R] = src[4 * ( x * 4 + y ) + R] - possibleColors[c][R];
2237
diff[G] = src[4 * ( x * 4 + y ) + G] - possibleColors[c][G];
2238
diff[B] = src[4 * ( x * 4 + y ) + B] - possibleColors[c][B];
2239
2240
const uint32_t err = 38 * abs( diff[R] ) + 76 * abs( diff[G] ) + 14 * abs( diff[B] );
2241
uint32_t pixErr = err * err;
2242
2243
// Choose best error
2244
if( pixErr < bestPixErr )
2245
{
2246
bestPixErr = pixErr;
2247
pixColors ^= ( pixColors & 3 ); // Reset the two first bits
2248
pixColors |= c;
2249
}
2250
}
2251
blockErr += bestPixErr;
2252
}
2253
}
2254
#endif
2255
2256
if( blockErr < bestBlockErr )
2257
{
2258
bestBlockErr = blockErr;
2259
dist = d;
2260
pixIndices = pixColors;
2261
}
2262
}
2263
2264
return bestBlockErr;
2265
}
2266
2267
2268
// main T-/H-mode compression function
2269
#ifdef __AVX2__
2270
uint32_t compressBlockTH( uint8_t* src, Luma& l, uint32_t& compressed1, uint32_t& compressed2, bool& tMode, __m128i r8, __m128i g8, __m128i b8 )
2271
#else
2272
uint32_t compressBlockTH( uint8_t *src, Luma& l, uint32_t& compressed1, uint32_t& compressed2, bool &tMode )
2273
#endif
2274
{
2275
#ifdef __AVX2__
2276
alignas( 8 ) uint8_t luma[16] = { 0, };
2277
_mm_storeu_si128 ( (__m128i* )luma, l.luma8 );
2278
#elif defined __ARM_NEON && defined __aarch64__
2279
alignas( 8 ) uint8_t luma[16] = { 0 };
2280
vst1q_u8( luma, l.luma8 );
2281
#else
2282
uint8_t* luma = l.val;
2283
#endif
2284
2285
uint8_t pixIdx[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
2286
2287
// 1) sorts the pairs of (luma, pix_idx)
2288
insertionSort( luma, pixIdx );
2289
2290
// 2) finds the split that minimizes the sum of the left and right luma ranges
2291
uint8_t minSumRangeIdx = 0;
2292
uint16_t minSumRangeValue;
2293
uint16_t sum;
2294
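// diffBonus is added to the cost of a split, so it mildly penalizes splits that leave only one, two
// or three pixels on either side of the sorted luma sequence.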
static const uint8_t diffBonus[15] = {8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 8};
2295
const int16_t temp = luma[15] - luma[0];
2296
2297
minSumRangeValue = luma[15] - luma[1] + diffBonus[0];
2298
for( uint8_t i = 1; i < 14; i++ )
2299
{
2300
sum = temp - luma[i+1] + luma[i] + diffBonus[i];
2301
if( minSumRangeValue > sum )
2302
{
2303
minSumRangeValue = sum;
2304
minSumRangeIdx = i;
2305
}
2306
}
2307
2308
sum = luma[14] - luma[0] + diffBonus[14];
2309
if( minSumRangeValue > sum )
2310
{
2311
minSumRangeValue = sum;
2312
minSumRangeIdx = 14;
2313
}
2314
uint8_t lRange, rRange;
2315
2316
lRange = luma[minSumRangeIdx] - luma[0];
2317
rRange = luma[15] - luma[minSumRangeIdx + 1];
2318
2319
// 3) sets a proper mode
2320
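// T-mode is preferred when one side of the split spans at least twice the luma range of the other
// (one tight cluster plus a spread-out group); otherwise tMode is left as passed in (H-mode).
// `swap` records the case where the low-luma side is the wide one, so the roles of the two groups
// are exchanged when the base colors are computed in step 4.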
bool swap = false;
2321
if( lRange >= rRange )
2322
{
2323
if( lRange >= rRange * 2 )
2324
{
2325
swap = true;
2326
tMode = true;
2327
}
2328
}
2329
else
2330
{
2331
if( lRange * 2 <= rRange ) tMode = true;
2332
}
2333
// 4) calculates the two base colors
2334
uint8_t rangeIdx[4] = { pixIdx[0], pixIdx[minSumRangeIdx], pixIdx[minSumRangeIdx + 1], pixIdx[15] };
2335
2336
uint16_t r[4], g[4], b[4];
2337
for( uint8_t i = 0; i < 4; ++i )
2338
{
2339
uint8_t idx = rangeIdx[i] * 4;
2340
b[i] = src[idx];
2341
g[i] = src[idx + 1];
2342
r[i] = src[idx + 2];
2343
}
2344
2345
uint8_t mid_rgb[2][3];
2346
if( swap )
2347
{
2348
mid_rgb[1][B] = ( b[0] + b[1] ) / 2;
2349
mid_rgb[1][G] = ( g[0] + g[1] ) / 2;
2350
mid_rgb[1][R] = ( r[0] + r[1] ) / 2;
2351
2352
uint16_t sum_rgb[3] = { 0, 0, 0 };
2353
for( uint8_t i = minSumRangeIdx + 1; i < 16; i++ )
2354
{
2355
uint8_t idx = pixIdx[i] * 4;
2356
sum_rgb[B] += src[idx];
2357
sum_rgb[G] += src[idx + 1];
2358
sum_rgb[R] += src[idx + 2];
2359
}
2360
const uint8_t temp = 15 - minSumRangeIdx;
2361
mid_rgb[0][B] = sum_rgb[B] / temp;
2362
mid_rgb[0][G] = sum_rgb[G] / temp;
2363
mid_rgb[0][R] = sum_rgb[R] / temp;
2364
}
2365
else
2366
{
2367
mid_rgb[0][B] = (b[0] + b[1]) / 2;
2368
mid_rgb[0][G] = (g[0] + g[1]) / 2;
2369
mid_rgb[0][R] = (r[0] + r[1]) / 2;
2370
if( tMode )
2371
{
2372
uint16_t sum_rgb[3] = { 0, 0, 0 };
2373
for( uint8_t i = minSumRangeIdx + 1; i < 16; i++ )
2374
{
2375
uint8_t idx = pixIdx[i] * 4;
2376
sum_rgb[B] += src[idx];
2377
sum_rgb[G] += src[idx + 1];
2378
sum_rgb[R] += src[idx + 2];
2379
}
2380
const uint8_t temp = 15 - minSumRangeIdx;
2381
mid_rgb[1][B] = sum_rgb[B] / temp;
2382
mid_rgb[1][G] = sum_rgb[G] / temp;
2383
mid_rgb[1][R] = sum_rgb[R] / temp;
2384
}
2385
else
2386
{
2387
mid_rgb[1][B] = (b[2] + b[3]) / 2;
2388
mid_rgb[1][G] = (g[2] + g[3]) / 2;
2389
mid_rgb[1][R] = (r[2] + r[3]) / 2;
2390
}
2391
}
2392
2393
// 5) sets the start distance index
2394
uint32_t startDistCandidate;
2395
uint32_t avgDist;
2396
if( tMode )
2397
{
2398
if( swap )
2399
{
2400
avgDist = ( b[1] - b[0] + g[1] - g[0] + r[1] - r[0] ) / 6;
2401
}
2402
else
2403
{
2404
avgDist = ( b[3] - b[2] + g[3] - g[2] + r[3] - r[2] ) / 6;
2405
}
2406
}
2407
else
2408
{
2409
avgDist = ( b[1] - b[0] + g[1] - g[0] + r[1] - r[0] + b[3] - b[2] + g[3] - g[2] + r[3] - r[2] ) / 12;
2410
}
2411
2412
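// The average distance between the paired extremes is mapped onto a starting index for the distance
// search; the thresholds below (16, 23, 32, 41) appear to be taken from tableTH, so calculateErrorTH
// starts near the most plausible table distance instead of always at zero.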
if( avgDist <= 16)
2413
{
2414
startDistCandidate = 0;
2415
}
2416
else if( avgDist <= 23 )
2417
{
2418
startDistCandidate = 1;
2419
}
2420
else if( avgDist <= 32 )
2421
{
2422
startDistCandidate = 2;
2423
}
2424
else if( avgDist <= 41 )
2425
{
2426
startDistCandidate = 3;
2427
}
2428
else
2429
{
2430
startDistCandidate = 4;
2431
}
2432
2433
uint32_t bestErr = MaxError;
2434
uint32_t bestPixIndices;
2435
uint8_t bestDist = 10;
2436
uint8_t colorsRGB444[2][3];
2437
compressColor( mid_rgb, colorsRGB444, tMode );
2438
compressed1 = 0;
2439
2440
// 6) finds the best candidate with the lowest error
2441
#ifdef __AVX2__
2442
// Vectorized version
2443
bestErr = calculateErrorTH( tMode, colorsRGB444, bestDist, bestPixIndices, startDistCandidate, r8, g8, b8 );
2444
#else
2445
// Scalar version
2446
bestErr = calculateErrorTH( tMode, src, colorsRGB444, bestDist, bestPixIndices, startDistCandidate );
2447
#endif
2448
2449
// 7) outputs the final T or H block
2450
if( tMode )
2451
{
2452
// Put the compression params into the compression block
2453
compressed1 |= ( colorsRGB444[0][R] & 0xf ) << 23;
2454
compressed1 |= ( colorsRGB444[0][G] & 0xf ) << 19;
2455
compressed1 |= ( colorsRGB444[0][B] ) << 15;
2456
compressed1 |= ( colorsRGB444[1][R] ) << 11;
2457
compressed1 |= ( colorsRGB444[1][G] ) << 7;
2458
compressed1 |= ( colorsRGB444[1][B] ) << 3;
2459
compressed1 |= bestDist & 0x7;
2460
}
2461
else
2462
{
2463
int bestRGB444ColPacked[2];
2464
bestRGB444ColPacked[0] = (colorsRGB444[0][R] << 8) + (colorsRGB444[0][G] << 4) + colorsRGB444[0][B];
2465
bestRGB444ColPacked[1] = (colorsRGB444[1][R] << 8) + (colorsRGB444[1][G] << 4) + colorsRGB444[1][B];
2466
if( ( bestRGB444ColPacked[0] >= bestRGB444ColPacked[1] ) ^ ( ( bestDist & 1 ) == 1 ) )
2467
{
2468
swapColors( colorsRGB444 );
2469
// Reshuffle pixel indices to exchange C1 with C3, and C2 with C4
2470
bestPixIndices = ( 0x55555555 & bestPixIndices ) | ( 0xaaaaaaaa & ( ~bestPixIndices ) );
2471
}
2472
2473
// Put the compression params into the compression block
2474
compressed1 |= ( colorsRGB444[0][R] & 0xf ) << 22;
2475
compressed1 |= ( colorsRGB444[0][G] & 0xf ) << 18;
2476
compressed1 |= ( colorsRGB444[0][B] & 0xf ) << 14;
2477
compressed1 |= ( colorsRGB444[1][R] & 0xf ) << 10;
2478
compressed1 |= ( colorsRGB444[1][G] & 0xf ) << 6;
2479
compressed1 |= ( colorsRGB444[1][B] & 0xf ) << 2;
2480
compressed1 |= ( bestDist >> 1 ) & 0x3;
2481
}
2482
2483
bestPixIndices = indexConversion( bestPixIndices );
2484
compressed2 = 0;
2485
compressed2 = ( compressed2 & ~( ( 0x2 << 31 ) - 1 ) ) | ( bestPixIndices & ( ( 2 << 31 ) - 1 ) );
2486
2487
return bestErr;
2488
}
2489
//#endif
2490
2491
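// Note: `value`/`error` are presumably the already-encoded alternative block (planar or T/H) and its
// error; if the best ETC1 selector tables cannot beat it, that block is returned unchanged. Otherwise
// the two winning table indices go into bits 26-28 and 29-31, and each pixel's 2-bit selector is
// split into its low bit (bits 32-47) and high bit (bits 48-63) before the final byte swap
// (e.g. pixel 3 with selector 0b10 sets only bit 51).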
template<class T, class S>
2492
static etcpak_force_inline uint64_t EncodeSelectors( uint64_t d, const T terr[2][8], const S tsel[16][8], const uint32_t* id, const uint64_t value, const uint64_t error)
2493
{
2494
size_t tidx[2];
2495
tidx[0] = GetLeastError( terr[0], 8 );
2496
tidx[1] = GetLeastError( terr[1], 8 );
2497
2498
if ((terr[0][tidx[0]] + terr[1][tidx[1]]) >= error)
2499
{
2500
return value;
2501
}
2502
2503
d |= tidx[0] << 26;
2504
d |= tidx[1] << 29;
2505
for( int i=0; i<16; i++ )
2506
{
2507
uint64_t t = tsel[i][tidx[id[i]%2]];
2508
d |= ( t & 0x1 ) << ( i + 32 );
2509
d |= ( t & 0x2 ) << ( i + 47 );
2510
}
2511
2512
return FixByteOrder(d);
2513
}
2514
2515
}
2516
2517
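// ETC1 entry point for one 4x4 block (BGRA input). Roughly: 1) emit solid-color blocks immediately,
// 2) compute the four average configurations (two split orientations, individual and differential
// quantization), 3) keep the one with the smallest approximate error and encode its base colors,
// 4) pick the per-pixel modifiers with FindBestFit, 5) pack the selectors and fix the byte order.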
static etcpak_force_inline uint64_t ProcessRGB( const uint8_t* src )
2518
{
2519
#ifdef __AVX2__
2520
uint64_t d = CheckSolid_AVX2( src );
2521
if( d != 0 ) return d;
2522
2523
alignas(32) v4i a[8];
2524
2525
__m128i err0 = PrepareAverages_AVX2( a, src );
2526
2527
// Get index of minimum error (err0)
2528
__m128i err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(2, 3, 0, 1));
2529
__m128i errMin0 = _mm_min_epu32(err0, err1);
2530
2531
__m128i errMin1 = _mm_shuffle_epi32(errMin0, _MM_SHUFFLE(1, 0, 3, 2));
2532
__m128i errMin2 = _mm_min_epu32(errMin1, errMin0);
2533
2534
__m128i errMask = _mm_cmpeq_epi32(errMin2, err0);
2535
2536
uint32_t mask = _mm_movemask_epi8(errMask);
2537
2538
uint32_t idx = _bit_scan_forward(mask) >> 2;
2539
2540
d |= EncodeAverages_AVX2( a, idx );
2541
2542
alignas(32) uint32_t terr[2][8] = {};
2543
alignas(32) uint32_t tsel[8];
2544
2545
if ((idx == 0) || (idx == 2))
2546
{
2547
FindBestFit_4x2_AVX2( terr, tsel, a, idx * 2, src );
2548
}
2549
else
2550
{
2551
FindBestFit_2x4_AVX2( terr, tsel, a, idx * 2, src );
2552
}
2553
2554
return EncodeSelectors_AVX2( d, terr, tsel, (idx % 2) == 1 );
2555
#else
2556
uint64_t d = CheckSolid( src );
2557
if( d != 0 ) return d;
2558
2559
v4i a[8];
2560
unsigned int err[4] = {};
2561
PrepareAverages( a, src, err );
2562
size_t idx = GetLeastError( err, 4 );
2563
EncodeAverages( d, a, idx );
2564
2565
#if ( defined __SSE4_1__ || defined __ARM_NEON ) && !defined REFERENCE_IMPLEMENTATION
2566
uint32_t terr[2][8] = {};
2567
#else
2568
uint64_t terr[2][8] = {};
2569
#endif
2570
uint16_t tsel[16][8];
2571
auto id = g_id[idx];
2572
FindBestFit( terr, tsel, a, id, src );
2573
2574
return FixByteOrder( EncodeSelectors( d, terr, tsel, id ) );
2575
#endif
2576
}
2577
2578
#ifdef __AVX2__
2579
// horizontal min/max functions. https://stackoverflow.com/questions/22256525/horizontal-minimum-and-maximum-using-sse
2580
// if GCC reports an error here, change the -march value in CFLAGS to one specific to your CPU (e.g., skylake).
2581
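// The trick in hMax: _mm_minpos_epu16 can only locate a minimum, so the bytes are first inverted
// (255 - x); min_epu8 with the same lanes shifted right by 8 leaves min(low byte, high byte) in the
// low byte of every 16-bit lane and zero in the high byte, so minpos then yields 255 - max over all
// 16 bytes. The position of the maximum is recovered separately with a byte compare and tzcnt.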
static inline int16_t hMax( __m128i buffer, uint8_t& idx )
2582
{
2583
__m128i tmp1 = _mm_sub_epi8( _mm_set1_epi8( (char)( 255 ) ), buffer );
2584
__m128i tmp2 = _mm_min_epu8( tmp1, _mm_srli_epi16( tmp1, 8 ) );
2585
__m128i tmp3 = _mm_minpos_epu16( tmp2 );
2586
uint8_t result = 255 - (uint8_t)_mm_cvtsi128_si32( tmp3 );
2587
__m128i mask = _mm_cmpeq_epi8( buffer, _mm_set1_epi8( result ) );
2588
idx = _tzcnt_u32( _mm_movemask_epi8( mask ) );
2589
2590
return result;
2591
}
2592
#elif defined __ARM_NEON && defined __aarch64__
2593
static inline int16_t hMax( uint8x16_t buffer, uint8_t& idx )
2594
{
2595
const uint8_t max = vmaxvq_u8( buffer );
2596
const uint16x8_t vmax = vdupq_n_u16( max );
2597
uint8x16x2_t buff_wide = vzipq_u8( buffer, uint8x16_t() );
2598
uint16x8_t lowbuf16 = vreinterpretq_u16_u8( buff_wide.val[0] );
2599
uint16x8_t hibuf16 = vreinterpretq_u16_u8( buff_wide.val[1] );
2600
uint16x8_t low_eqmask = vceqq_u16( lowbuf16, vmax );
2601
uint16x8_t hi_eqmask = vceqq_u16( hibuf16, vmax );
2602
2603
static const uint16_t mask_lsb[] = {
2604
0x1, 0x2, 0x4, 0x8,
2605
0x10, 0x20, 0x40, 0x80 };
2606
2607
static const uint16_t mask_msb[] = {
2608
0x100, 0x200, 0x400, 0x800,
2609
0x1000, 0x2000, 0x4000, 0x8000 };
2610
2611
uint16x8_t vmask_lsb = vld1q_u16( mask_lsb );
2612
uint16x8_t vmask_msb = vld1q_u16( mask_msb );
2613
uint16x8_t pos_lsb = vandq_u16( vmask_lsb, low_eqmask );
2614
uint16x8_t pos_msb = vandq_u16( vmask_msb, hi_eqmask );
2615
pos_lsb = vpaddq_u16( pos_lsb, pos_lsb );
2616
pos_lsb = vpaddq_u16( pos_lsb, pos_lsb );
2617
pos_lsb = vpaddq_u16( pos_lsb, pos_lsb );
2618
uint64_t idx_lane1 = vgetq_lane_u64( vreinterpretq_u64_u16( pos_lsb ), 0 );
2619
pos_msb = vpaddq_u16( pos_msb, pos_msb );
2620
pos_msb = vpaddq_u16( pos_msb, pos_msb );
2621
pos_msb = vpaddq_u16( pos_msb, pos_msb );
2622
uint32_t idx_lane2 = vgetq_lane_u32( vreinterpretq_u32_u16( pos_msb ), 0 );
2623
idx = idx_lane1 != 0 ? __builtin_ctz( idx_lane1 ) : __builtin_ctz( idx_lane2 );
2624
2625
return max;
2626
}
2627
#endif
2628
2629
#ifdef __AVX2__
2630
static inline int16_t hMin( __m128i buffer, uint8_t& idx )
2631
{
2632
__m128i tmp2 = _mm_min_epu8( buffer, _mm_srli_epi16( buffer, 8 ) );
2633
__m128i tmp3 = _mm_minpos_epu16( tmp2 );
2634
uint8_t result = (uint8_t)_mm_cvtsi128_si32( tmp3 );
2635
__m128i mask = _mm_cmpeq_epi8( buffer, _mm_set1_epi8( result ) );
2636
idx = _tzcnt_u32( _mm_movemask_epi8( mask ) );
2637
return result;
2638
}
2639
#elif defined __ARM_NEON && defined __aarch64__
2640
static inline int16_t hMin( uint8x16_t buffer, uint8_t& idx )
2641
{
2642
const uint8_t min = vminvq_u8( buffer );
2643
const uint16x8_t vmin = vdupq_n_u16( min );
2644
uint8x16x2_t buff_wide = vzipq_u8( buffer, uint8x16_t() );
2645
uint16x8_t lowbuf16 = vreinterpretq_u16_u8( buff_wide.val[0] );
2646
uint16x8_t hibuf16 = vreinterpretq_u16_u8( buff_wide.val[1] );
2647
uint16x8_t low_eqmask = vceqq_u16( lowbuf16, vmin );
2648
uint16x8_t hi_eqmask = vceqq_u16( hibuf16, vmin );
2649
2650
static const uint16_t mask_lsb[] = {
2651
0x1, 0x2, 0x4, 0x8,
2652
0x10, 0x20, 0x40, 0x80 };
2653
2654
static const uint16_t mask_msb[] = {
2655
0x100, 0x200, 0x400, 0x800,
2656
0x1000, 0x2000, 0x4000, 0x8000 };
2657
2658
uint16x8_t vmask_lsb = vld1q_u16( mask_lsb );
2659
uint16x8_t vmask_msb = vld1q_u16( mask_msb );
2660
uint16x8_t pos_lsb = vandq_u16( vmask_lsb, low_eqmask );
2661
uint16x8_t pos_msb = vandq_u16( vmask_msb, hi_eqmask );
2662
pos_lsb = vpaddq_u16( pos_lsb, pos_lsb );
2663
pos_lsb = vpaddq_u16( pos_lsb, pos_lsb );
2664
pos_lsb = vpaddq_u16( pos_lsb, pos_lsb );
2665
uint64_t idx_lane1 = vgetq_lane_u64( vreinterpretq_u64_u16( pos_lsb ), 0 );
2666
pos_msb = vpaddq_u16( pos_msb, pos_msb );
2667
pos_msb = vpaddq_u16( pos_msb, pos_msb );
2668
pos_msb = vpaddq_u16( pos_msb, pos_msb );
2669
uint32_t idx_lane2 = vgetq_lane_u32( vreinterpretq_u32_u16( pos_msb ), 0 );
2670
idx = idx_lane1 != 0 ? __builtin_ctz( idx_lane1 ) : __builtin_ctz( idx_lane2 );
2671
2672
return min;
2673
}
2674
#endif
2675
2676
// During search it is not convenient to store the bits the way they are stored in the
2677
// file format. Hence, after search, it is converted to this format.
2678
// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
2679
static inline void stuff59bits( unsigned int thumbT59W1, unsigned int thumbT59W2, unsigned int& thumbTW1, unsigned int& thumbTW2 )
2680
{
2681
// Put bits in twotimer configuration for 59 (red overflows)
2682
//
2683
// Go from this bit layout:
2684
//
2685
// |63 62 61 60 59|58 57 56 55|54 53 52 51|50 49 48 47|46 45 44 43|42 41 40 39|38 37 36 35|34 33 32|
2686
// |----empty-----|---red 0---|--green 0--|--blue 0---|---red 1---|--green 1--|--blue 1---|--dist--|
2687
//
2688
// |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
2689
// |----------------------------------------index bits---------------------------------------------|
2690
//
2691
//
2692
// To this:
2693
//
2694
// 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
2695
// -----------------------------------------------------------------------------------------------
2696
// |// // //|R0a |//|R0b |G0 |B0 |R1 |G1 |B1 |da |df|db|
2697
// -----------------------------------------------------------------------------------------------
2698
//
2699
// |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
2700
// |----------------------------------------index bits---------------------------------------------|
2701
//
2702
// 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
2703
// -----------------------------------------------------------------------------------------------
2704
// | base col1 | dcol 2 | base col1 | dcol 2 | base col 1 | dcol 2 | table | table |df|fp|
2705
// | R1' (5 bits) | dR2 | G1' (5 bits) | dG2 | B1' (5 bits) | dB2 | cw 1 | cw 2 |bt|bt|
2706
// ------------------------------------------------------------------------------------------------
2707
2708
uint8_t R0a;
2709
uint8_t bit, a, b, c, d, bits;
2710
2711
R0a = ( thumbT59W1 >> 25 ) & 0x3;
2712
2713
// Fix middle part
2714
thumbTW1 = thumbT59W1 << 1;
2715
// Fix R0a (top two bits of R0)
2716
thumbTW1 = ( thumbTW1 & ~( 0x3 << 27 ) ) | ( ( R0a & 0x3 ) << 27 );
2717
// Fix db (lowest bit of d)
2718
thumbTW1 = ( thumbTW1 & ~0x1 ) | ( thumbT59W1 & 0x1 );
2719
2720
// Make sure that red overflows:
2721
a = ( thumbTW1 >> 28 ) & 0x1;
2722
b = ( thumbTW1 >> 27 ) & 0x1;
2723
c = ( thumbTW1 >> 25 ) & 0x1;
2724
d = ( thumbTW1 >> 24 ) & 0x1;
2725
2726
// The following abcd bit sequences should be padded with ones: 0111, 1010, 1011, 1101, 1110, 1111
2727
// The following logical expression checks for the presence of any of those:
2728
bit = ( a & c ) | ( !a & b & c & d ) | ( a & b & !c & d );
2729
bits = 0xf * bit;
2730
thumbTW1 = ( thumbTW1 & ~( 0x7 << 29 ) ) | ( bits & 0x7 ) << 29;
2731
thumbTW1 = ( thumbTW1 & ~( 0x1 << 26 ) ) | ( !bit & 0x1 ) << 26;
2732
2733
// Set diffbit
2734
thumbTW1 = ( thumbTW1 & ~0x2 ) | 0x2;
2735
thumbTW2 = thumbT59W2;
2736
}
2737
2738
// During search it is not convenient to store the bits the way they are stored in the
2739
// file format. Hence, after search, it is converted to this format.
2740
// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
2741
static inline void stuff58bits( unsigned int thumbH58W1, unsigned int thumbH58W2, unsigned int& thumbHW1, unsigned int& thumbHW2 )
2742
{
2743
// Put bits in twotimer configuration for 58 (red doesn't overflow, green does)
2744
//
2745
// Go from this bit layout:
2746
//
2747
//
2748
// |63 62 61 60 59 58|57 56 55 54|53 52 51 50|49 48 47 46|45 44 43 42|41 40 39 38|37 36 35 34|33 32|
2749
// |-------empty-----|---red 0---|--green 0--|--blue 0---|---red 1---|--green 1--|--blue 1---|d2 d1|
2750
//
2751
// |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
2752
// |---------------------------------------index bits----------------------------------------------|
2753
//
2754
// To this:
2755
//
2756
// 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
2757
// -----------------------------------------------------------------------------------------------
2758
// |//|R0 |G0 |// // //|G0|B0|//|B0b |R1 |G1 |B0 |d2|df|d1|
2759
// -----------------------------------------------------------------------------------------------
2760
//
2761
// |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
2762
// |---------------------------------------index bits----------------------------------------------|
2763
//
2764
// 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
2765
// -----------------------------------------------------------------------------------------------
2766
// | base col1 | dcol 2 | base col1 | dcol 2 | base col 1 | dcol 2 | table | table |df|fp|
2767
// | R1' (5 bits) | dR2 | G1' (5 bits) | dG2 | B1' (5 bits) | dB2 | cw 1 | cw 2 |bt|bt|
2768
// -----------------------------------------------------------------------------------------------
2769
//
2770
//
2771
// Thus, what we are really doing is going from this bit layout:
2772
//
2773
//
2774
// |63 62 61 60 59 58|57 56 55 54 53 52 51|50 49|48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33|32 |
2775
// |-------empty-----|part0---------------|part1|part2------------------------------------------|part3|
2776
//
2777
// To this:
2778
//
2779
// 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
2780
// --------------------------------------------------------------------------------------------------|
2781
// |//|part0 |// // //|part1|//|part2 |df|part3|
2782
// --------------------------------------------------------------------------------------------------|
2783
2784
    unsigned int part0, part1, part2, part3;
    uint8_t bit, a, b, c, d, bits;

    // move parts
    part0 = ( thumbH58W1 >> 19 ) & 0x7f;
    part1 = ( thumbH58W1 >> 17 ) & 0x3;
    part2 = ( thumbH58W1 >> 1 ) & 0xffff;
    part3 = thumbH58W1 & 0x1;
    thumbHW1 = 0;
    thumbHW1 = ( thumbHW1 & ~( 0x7f << 24 ) ) | ( ( part0 & 0x7f ) << 24 );
    thumbHW1 = ( thumbHW1 & ~( 0x3 << 19 ) ) | ( ( part1 & 0x3 ) << 19 );
    thumbHW1 = ( thumbHW1 & ~( 0xffff << 2 ) ) | ( ( part2 & 0xffff ) << 2 );
    thumbHW1 = ( thumbHW1 & ~0x1 ) | ( part3 & 0x1 );

    // Make sure that red does not overflow:
    bit = ( thumbHW1 >> 30 ) & 0x1;
    thumbHW1 = ( thumbHW1 & ~( 0x1 << 31 ) ) | ( ( !bit & 0x1 ) << 31 );

    // Make sure that green overflows:
    a = ( thumbHW1 >> 20 ) & 0x1;
    b = ( thumbHW1 >> 19 ) & 0x1;
    c = ( thumbHW1 >> 17 ) & 0x1;
    d = ( thumbHW1 >> 16 ) & 0x1;
    // The following abcd bit sequences should be padded with ones: 0111, 1010, 1011, 1101, 1110, 1111
    // The following logical expression checks for the presence of any of those:
    bit = ( a & c ) | ( !a & b & c & d ) | ( a & b & !c & d );
    bits = 0xf * bit;
    thumbHW1 = ( thumbHW1 & ~( 0x7 << 21 ) ) | ( ( bits & 0x7 ) << 21 );
    thumbHW1 = ( thumbHW1 & ~( 0x1 << 18 ) ) | ( ( !bit & 0x1 ) << 18 );

    // Set diffbit
    thumbHW1 = ( thumbHW1 & ~0x2 ) | 0x2;
    thumbHW2 = thumbH58W2;
}
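// The repacking above (and in stuff59bits) is built entirely from the same
// mask-and-insert idiom. A minimal, compiled-out sketch of that idiom, kept
// here only as an editor's illustration (insertBits is not an etcpak function):
#if 0
// Clear `width` bits of `word` starting at `pos` and insert `value` there,
// mirroring statements such as
//   thumbHW1 = ( thumbHW1 & ~( 0x7f << 24 ) ) | ( ( part0 & 0x7f ) << 24 );
static inline unsigned int insertBits( unsigned int word, unsigned int value, unsigned int pos, unsigned int width )
{
    const unsigned int mask = ( 1u << width ) - 1u;
    return ( word & ~( mask << pos ) ) | ( ( value & mask ) << pos );
}
// usage: thumbHW1 = insertBits( thumbHW1, part0, 24, 7 );
#endif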
2819
#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__)
2820
static etcpak_force_inline Channels GetChannels( const uint8_t* src )
2821
{
2822
Channels ch;
2823
#ifdef __AVX2__
2824
__m128i d0 = _mm_loadu_si128( ( (__m128i*)src ) + 0 );
2825
__m128i d1 = _mm_loadu_si128( ( (__m128i*)src ) + 1 );
2826
__m128i d2 = _mm_loadu_si128( ( (__m128i*)src ) + 2 );
2827
__m128i d3 = _mm_loadu_si128( ( (__m128i*)src ) + 3 );
__m128i rgb0 = _mm_shuffle_epi8( d0, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) );
2830
__m128i rgb1 = _mm_shuffle_epi8( d1, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) );
2831
__m128i rgb2 = _mm_shuffle_epi8( d2, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) );
2832
__m128i rgb3 = _mm_shuffle_epi8( d3, _mm_setr_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1 ) );
2833
2834
__m128i rg0 = _mm_unpacklo_epi32( rgb0, rgb1 );
2835
__m128i rg1 = _mm_unpacklo_epi32( rgb2, rgb3 );
2836
__m128i b0 = _mm_unpackhi_epi32( rgb0, rgb1 );
2837
__m128i b1 = _mm_unpackhi_epi32( rgb2, rgb3 );
2838
2839
// swap channels
2840
ch.b8 = _mm_unpacklo_epi64( rg0, rg1 );
2841
ch.g8 = _mm_unpackhi_epi64( rg0, rg1 );
2842
ch.r8 = _mm_unpacklo_epi64( b0, b1 );
2843
#elif defined __ARM_NEON && defined __aarch64__
2844
//load pixel data into 4 rows
2845
uint8x16_t px0 = vld1q_u8( src + 0 );
2846
uint8x16_t px1 = vld1q_u8( src + 16 );
2847
uint8x16_t px2 = vld1q_u8( src + 32 );
2848
uint8x16_t px3 = vld1q_u8( src + 48 );
2849
2850
uint8x16x2_t px0z1 = vzipq_u8( px0, px1 );
2851
uint8x16x2_t px2z3 = vzipq_u8( px2, px3 );
2852
uint8x16x2_t px01 = vzipq_u8( px0z1.val[0], px0z1.val[1] );
2853
uint8x16x2_t rgb01 = vzipq_u8( px01.val[0], px01.val[1] );
2854
uint8x16x2_t px23 = vzipq_u8( px2z3.val[0], px2z3.val[1] );
2855
uint8x16x2_t rgb23 = vzipq_u8( px23.val[0], px23.val[1] );
2856
2857
uint8x16_t rr = vreinterpretq_u8_u64( vzip1q_u64( vreinterpretq_u64_u8( rgb01.val[0] ), vreinterpretq_u64_u8( rgb23.val[0] ) ) );
2858
uint8x16_t gg = vreinterpretq_u8_u64( vzip2q_u64( vreinterpretq_u64_u8( rgb01.val[0] ), vreinterpretq_u64_u8( rgb23.val[0] ) ) );
2859
uint8x16_t bb = vreinterpretq_u8_u64( vzip1q_u64( vreinterpretq_u64_u8( rgb01.val[1] ), vreinterpretq_u64_u8( rgb23.val[1] ) ) );
2860
2861
uint8x16x2_t red = vzipq_u8( rr, uint8x16_t() );
2862
uint8x16x2_t grn = vzipq_u8( gg, uint8x16_t() );
2863
uint8x16x2_t blu = vzipq_u8( bb, uint8x16_t() );
2864
ch.r = red;
2865
ch.b = blu;
2866
ch.g = grn;
2867
#endif
2868
return ch;
2869
}
2870
#endif
2871
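// GetChannels() deinterleaves a 4x4 block of 4-byte pixels into per-channel
// vectors with SIMD shuffles. A compiled-out scalar equivalent, assuming the
// R/G/B byte offsets defined at the top of this file (GetChannelsScalar is an
// illustration only, not part of etcpak):
#if 0
static void GetChannelsScalar( const uint8_t* src, uint8_t r[16], uint8_t g[16], uint8_t b[16] )
{
    for( int i = 0; i < 16; i++ )
    {
        r[i] = src[i * 4 + R];
        g[i] = src[i * 4 + G];
        b[i] = src[i * 4 + B];
    }
}
#endif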
2872
#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__)
2873
static etcpak_force_inline void CalculateLuma( Channels& ch, Luma& luma )
2874
#else
2875
static etcpak_force_inline void CalculateLuma( const uint8_t* src, Luma& luma )
2876
#endif
2877
{
2878
#ifdef __AVX2__
2879
__m256i b16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.b8 ), _mm256_set1_epi16( 14 ) );
2880
__m256i g16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.g8 ), _mm256_set1_epi16( 76 ) );
2881
__m256i r16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.r8 ), _mm256_set1_epi16( 38 ) );
2882
2883
__m256i luma_16bit = _mm256_add_epi16( _mm256_add_epi16( g16_luma, r16_luma ), b16_luma );
2884
__m256i luma_8bit_m256i = _mm256_srli_epi16( luma_16bit, 7 );
2885
__m128i luma_8bit_lo = _mm256_extractf128_si256( luma_8bit_m256i, 0 );
2886
__m128i luma_8bit_hi = _mm256_extractf128_si256( luma_8bit_m256i, 1 );
2887
2888
static const __m128i interleaving_mask_lo = _mm_set_epi8( 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0 );
2889
static const __m128i interleaving_mask_hi = _mm_set_epi8( 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1 );
2890
__m128i luma_8bit_lo_moved = _mm_shuffle_epi8( luma_8bit_lo, interleaving_mask_lo );
2891
__m128i luma_8bit_hi_moved = _mm_shuffle_epi8( luma_8bit_hi, interleaving_mask_hi );
2892
__m128i luma_8bit = _mm_or_si128( luma_8bit_hi_moved, luma_8bit_lo_moved );
2893
luma.luma8 = luma_8bit;
2894
2895
// min/max calculation
2896
luma.min = hMin( luma_8bit, luma.minIdx ) * 0.00392156f;
2897
luma.max = hMax( luma_8bit, luma.maxIdx ) * 0.00392156f;
2898
#elif defined __ARM_NEON && defined __aarch64__
// weight each channel with its fixed-point luma coefficient (the weights sum to 128; see the >>7 narrowing below)
uint16x8_t red0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.r.val[0] ), 14 );
2901
uint16x8_t red1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.r.val[1] ), 14 );
2902
uint16x8_t grn0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.g.val[0] ), 76 );
2903
uint16x8_t grn1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.g.val[1] ), 76 );
2904
uint16x8_t blu0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.b.val[0] ), 38 );
2905
uint16x8_t blu1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.b.val[1] ), 38 );
2906
2907
//calculate luma for rows 0,1 and 2,3
2908
uint16x8_t lum_r01 = vaddq_u16( vaddq_u16( red0, grn0 ), blu0 );
2909
uint16x8_t lum_r23 = vaddq_u16( vaddq_u16( red1, grn1 ), blu1 );
2910
2911
//divide luma values with right shift and narrow results to 8bit
2912
uint8x8_t lum_r01_d = vshrn_n_u16( lum_r01, 7 );
2913
uint8x8_t lum_r02_d = vshrn_n_u16( lum_r23, 7 );
2914
2915
luma.luma8 = vcombine_u8( lum_r01_d, lum_r02_d );
2916
//find min and max luma value
2917
luma.min = hMin( luma.luma8, luma.minIdx ) * 0.00392156f;
2918
luma.max = hMax( luma.luma8, luma.maxIdx ) * 0.00392156f;
2919
#else
2920
for( int i = 0; i < 16; ++i )
2921
{
luma.val[i] = ( src[i * 4 + 2] * 76 + src[i * 4 + 1] * 150 + src[i * 4] * 28 ) / 254; // fixed-point luma, roughly 0.30*R + 0.59*G + 0.11*B
if( luma.min > luma.val[i] )
2924
{
2925
luma.min = luma.val[i];
2926
luma.minIdx = i;
2927
}
2928
if( luma.max < luma.val[i] )
2929
{
2930
luma.max = luma.val[i];
2931
luma.maxIdx = i;
2932
}
2933
}
2934
#endif
2935
}
2936
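// Both SIMD paths above compute luma as a /128 fixed-point blend,
// (38*R + 76*G + 14*B) >> 7, which is also where the MaxError constant at the
// top of the file comes from; the scalar fallback uses the /254 variant above.
// A compiled-out scalar sketch of the SIMD weighting (illustration only):
#if 0
static inline uint8_t LumaFixed128( uint8_t r, uint8_t g, uint8_t b )
{
    // 38 + 76 + 14 == 128, so the shifted result always fits in 8 bits.
    return (uint8_t)( ( 38 * r + 76 * g + 14 * b ) >> 7 );
}
#endif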
2937
static etcpak_force_inline uint8_t SelectModeETC2( const Luma& luma )
2938
{
2939
#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__) // luma.min/max are pre-normalized to [0..1] only on these paths (see CalculateLuma)
2940
const float lumaRange = ( luma.max - luma.min );
2941
#else
2942
const float lumaRange = ( luma.max - luma.min ) * ( 1.f / 255.f );
2943
#endif
2944
// filters a very-low-contrast block
2945
if( lumaRange <= ecmd_threshold[0] )
2946
{
2947
return ModePlanar;
2948
}
2949
// checks whether a pair of the corner pixels in a block has the min/max luma values;
2950
// if so, the ETC2 planar mode is enabled, and otherwise, the ETC1 mode is enabled
2951
else if( lumaRange <= ecmd_threshold[1] )
2952
{
2953
#ifdef __AVX2__
2954
static const __m128i corner_pair = _mm_set_epi8( 1, 1, 1, 1, 1, 1, 1, 1, 0, 15, 3, 12, 12, 3, 15, 0 );
2955
__m128i current_max_min = _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, luma.minIdx, luma.maxIdx, luma.minIdx, luma.maxIdx, luma.minIdx, luma.maxIdx, luma.minIdx, luma.maxIdx );
2956
2957
__m128i max_min_result = _mm_cmpeq_epi16( corner_pair, current_max_min );
2958
2959
int mask = _mm_movemask_epi8( max_min_result );
2960
if( mask )
2961
{
2962
return ModePlanar;
2963
}
2964
#else
2965
// check whether a pair of the corner pixels in a block has the min/max luma values;
2966
// if so, the ETC2 planar mode is enabled.
2967
if( ( luma.minIdx == 0 && luma.maxIdx == 15 ) ||
2968
( luma.minIdx == 15 && luma.maxIdx == 0 ) ||
2969
( luma.minIdx == 3 && luma.maxIdx == 12 ) ||
2970
( luma.minIdx == 12 && luma.maxIdx == 3 ) )
2971
{
2972
return ModePlanar;
2973
}
2974
#endif
2975
}
2976
// filters a high-contrast block for checking both ETC1 mode and the ETC2 T/H mode
2977
else if( lumaRange >= ecmd_threshold[2] )
2978
{
2979
return ModeTH;
2980
}
2981
return ModeUndecided;
2982
}
2983
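// The early mode decision above boils down to: very low contrast -> planar,
// low contrast with min/max luma on opposite block corners -> planar,
// high contrast -> T/H, otherwise undecided. A compiled-out restatement of that
// decision, assuming a luma range already normalized to [0..1]
// (SelectModeSketch is an illustration only, not part of etcpak):
#if 0
static uint8_t SelectModeSketch( float lumaRange, int minIdx, int maxIdx )
{
    if( lumaRange <= ecmd_threshold[0] ) return ModePlanar;
    if( lumaRange <= ecmd_threshold[1] )
    {
        const bool oppositeCorners =
            ( minIdx == 0 && maxIdx == 15 ) || ( minIdx == 15 && maxIdx == 0 ) ||
            ( minIdx == 3 && maxIdx == 12 ) || ( minIdx == 12 && maxIdx == 3 );
        return oppositeCorners ? ModePlanar : ModeUndecided;
    }
    if( lumaRange >= ecmd_threshold[2] ) return ModeTH;
    return ModeUndecided;
}
#endif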
2984
static etcpak_force_inline uint64_t ProcessRGB_ETC2( const uint8_t* src, bool useHeuristics )
2985
{
2986
#ifdef __AVX2__
2987
uint64_t d = CheckSolid_AVX2( src );
2988
if( d != 0 ) return d;
2989
#else
2990
uint64_t d = CheckSolid( src );
2991
if (d != 0) return d;
2992
#endif
uint8_t mode = ModeUndecided;
2995
Luma luma;
2996
#ifdef __AVX2__
2997
Channels ch = GetChannels( src );
2998
if( useHeuristics )
2999
{
3000
CalculateLuma( ch, luma );
3001
mode = SelectModeETC2( luma );
3002
}
3003
3004
auto plane = Planar_AVX2( ch, mode, useHeuristics );
3005
if( useHeuristics && mode == ModePlanar ) return plane.plane;
3006
3007
alignas( 32 ) v4i a[8];
3008
__m128i err0 = PrepareAverages_AVX2( a, plane.sum4 );
3009
3010
// Get index of minimum error (err0)
3011
__m128i err1 = _mm_shuffle_epi32( err0, _MM_SHUFFLE( 2, 3, 0, 1 ) );
3012
__m128i errMin0 = _mm_min_epu32(err0, err1);
3013
3014
__m128i errMin1 = _mm_shuffle_epi32( errMin0, _MM_SHUFFLE( 1, 0, 3, 2 ) );
3015
__m128i errMin2 = _mm_min_epu32( errMin1, errMin0 );
3016
3017
__m128i errMask = _mm_cmpeq_epi32( errMin2, err0 );
3018
3019
uint32_t mask = _mm_movemask_epi8( errMask );
3020
3021
size_t idx = _bit_scan_forward( mask ) >> 2;
3022
3023
d = EncodeAverages_AVX2( a, idx );
3024
3025
alignas(32) uint32_t terr[2][8] = {};
3026
alignas(32) uint32_t tsel[8];
3027
3028
if ((idx == 0) || (idx == 2))
3029
{
3030
FindBestFit_4x2_AVX2( terr, tsel, a, idx * 2, src );
3031
}
3032
else
3033
{
3034
FindBestFit_2x4_AVX2( terr, tsel, a, idx * 2, src );
3035
}
3036
3037
if( useHeuristics )
3038
{
3039
if( mode == ModeTH )
3040
{
3041
uint64_t result = 0;
3042
uint64_t error = 0;
3043
uint32_t compressed[4] = { 0, 0, 0, 0 };
3044
bool tMode = false;
3045
3046
error = compressBlockTH( (uint8_t*)src, luma, compressed[0], compressed[1], tMode, ch.r8, ch.g8, ch.b8 );
3047
if( tMode )
3048
{
3049
stuff59bits( compressed[0], compressed[1], compressed[2], compressed[3] );
3050
}
3051
else
3052
{
3053
stuff58bits( compressed[0], compressed[1], compressed[2], compressed[3] );
3054
}
3055
3056
result = (uint32_t)_bswap( compressed[2] );
3057
result |= static_cast<uint64_t>( _bswap( compressed[3] ) ) << 32;
3058
3059
plane.plane = result;
3060
plane.error = error;
3061
}
3062
else
3063
{
3064
plane.plane = 0;
3065
plane.error = MaxError;
3066
}
3067
}
3068
3069
return EncodeSelectors_AVX2( d, terr, tsel, ( idx % 2 ) == 1, plane.plane, plane.error );
3070
#else
3071
if( useHeuristics )
3072
{
3073
#if defined __ARM_NEON && defined __aarch64__
3074
Channels ch = GetChannels( src );
3075
CalculateLuma( ch, luma );
3076
#else
3077
CalculateLuma( src, luma );
3078
#endif
3079
mode = SelectModeETC2( luma );
3080
}
3081
#ifdef __ARM_NEON
3082
auto result = Planar_NEON( src, mode, useHeuristics );
3083
#else
3084
auto result = Planar( src, mode, useHeuristics );
3085
#endif
3086
if( result.second == 0 ) return result.first;
3087
3088
v4i a[8];
3089
unsigned int err[4] = {};
3090
PrepareAverages( a, src, err );
3091
size_t idx = GetLeastError( err, 4 );
3092
EncodeAverages( d, a, idx );
3093
3094
#if ( defined __SSE4_1__ || defined __ARM_NEON ) && !defined REFERENCE_IMPLEMENTATION
3095
uint32_t terr[2][8] = {};
3096
#else
3097
uint64_t terr[2][8] = {};
3098
#endif
3099
uint16_t tsel[16][8];
3100
auto id = g_id[idx];
3101
FindBestFit( terr, tsel, a, id, src );
3102
3103
if( useHeuristics )
3104
{
3105
if( mode == ModeTH )
3106
{
3107
uint32_t compressed[4] = { 0, 0, 0, 0 };
3108
bool tMode = false;
3109
3110
result.second = compressBlockTH( (uint8_t*)src, luma, compressed[0], compressed[1], tMode );
3111
if( tMode )
3112
{
3113
stuff59bits( compressed[0], compressed[1], compressed[2], compressed[3] );
3114
}
3115
else
3116
{
3117
stuff58bits( compressed[0], compressed[1], compressed[2], compressed[3] );
3118
}
3119
3120
result.first = (uint32_t)_bswap( compressed[2] );
3121
result.first |= static_cast<uint64_t>( _bswap( compressed[3] ) ) << 32;
3122
}
3123
else
3124
{
3125
result.first = 0;
3126
result.second = MaxError;
3127
}
3128
}
3129
3130
return EncodeSelectors( d, terr, tsel, id, result.first, result.second );
3131
#endif
3132
}
3133
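// In both branches above, a successful T/H encoding ends with the two words
// from stuff58bits/stuff59bits being byte-swapped into the final 64-bit block.
// A compiled-out restatement of that packing (illustration only):
#if 0
static inline uint64_t PackThWords( uint32_t w1, uint32_t w2 )
{
    uint64_t block = (uint32_t)_bswap( w1 );
    block |= (uint64_t)_bswap( w2 ) << 32;
    return block;
}
// equivalent to: result = _bswap( compressed[2] ); result |= uint64_t( _bswap( compressed[3] ) ) << 32;
#endif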
3134
#ifdef __SSE4_1__
3135
template<int K>
3136
static etcpak_force_inline __m128i Widen( const __m128i src )
3137
{
3138
static_assert( K >= 0 && K <= 7, "Index out of range" );
3139
3140
__m128i tmp;
3141
switch( K )
3142
{
3143
case 0:
3144
tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 0, 0, 0, 0 ) );
3145
return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) );
3146
case 1:
3147
tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 1, 1, 1, 1 ) );
3148
return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) );
3149
case 2:
3150
tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 2, 2, 2, 2 ) );
3151
return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) );
3152
case 3:
3153
tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 3, 3, 3, 3 ) );
3154
return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) );
3155
case 4:
3156
tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 0, 0, 0, 0 ) );
3157
return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) );
3158
case 5:
3159
tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 1, 1, 1, 1 ) );
3160
return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) );
3161
case 6:
3162
tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 2, 2, 2, 2 ) );
3163
return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) );
3164
case 7:
3165
tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 3, 3, 3, 3 ) );
3166
return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) );
3167
}
3168
}
3169
3170
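// Widen<K> broadcasts the K-th 16-bit lane of an SSE register to all eight
// lanes (shufflelo/shufflehi pick the lane, shuffle_epi32 spreads it). A
// compiled-out scalar picture of the same operation (illustration only):
#if 0
static void WidenScalar( const uint16_t in[8], int k, uint16_t out[8] )
{
    for( int i = 0; i < 8; i++ ) out[i] = in[k];
}
#endif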
static etcpak_force_inline int GetMulSel( int sel )
3171
{
3172
switch( sel )
3173
{
3174
case 0:
3175
return 0;
3176
case 1:
3177
case 2:
3178
case 3:
3179
return 1;
3180
case 4:
3181
return 2;
3182
case 5:
3183
case 6:
3184
case 7:
3185
return 3;
3186
case 8:
3187
case 9:
3188
case 10:
3189
case 11:
3190
case 12:
3191
case 13:
3192
return 4;
3193
case 14:
3194
case 15:
3195
return 5;
3196
}
3197
}
3198
3199
#endif
3200
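// Both GetMulSel() implementations (the switch above and the constexpr NEON
// version below) encode the same selector-to-multiplier-lane mapping. The
// mapping, written out as data purely for reference (compiled out, not used
// anywhere in etcpak):
#if 0
// sel:   0 | 1 2 3 | 4 | 5 6 7 | 8 9 10 11 12 13 | 14 15
// lane:  0 |   1   | 2 |   3   |        4        |   5
static const int kMulSelTable[16] = { 0, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5 };
#endif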
3201
#ifdef __ARM_NEON
3202
3203
static constexpr etcpak_force_inline int GetMulSel(int sel)
3204
{
3205
return ( sel < 1 ) ? 0 : ( sel < 4 ) ? 1 : ( sel < 5 ) ? 2 : ( sel < 8 ) ? 3 : ( sel < 14 ) ? 4 : 5;
3206
}
3207
3208
static constexpr int ClampConstant( int x, int min, int max )
3209
{
3210
return x < min ? min : x > max ? max : x;
3211
}
3212
3213
template <int Index>
3214
etcpak_force_inline static uint16x8_t ErrorProbe_EAC_NEON( uint8x8_t recVal, uint8x16_t alphaBlock )
3215
{
3216
uint8x8_t srcValWide;
3217
#ifndef __aarch64__
3218
if( Index < 8 )
3219
srcValWide = vdup_lane_u8( vget_low_u8( alphaBlock ), ClampConstant( Index, 0, 7 ) );
3220
else
3221
srcValWide = vdup_lane_u8( vget_high_u8( alphaBlock ), ClampConstant( Index - 8, 0, 7 ) );
3222
#else
3223
srcValWide = vdup_laneq_u8( alphaBlock, Index );
3224
#endif
3225
3226
uint8x8_t deltaVal = vabd_u8( srcValWide, recVal );
3227
return vmull_u8( deltaVal, deltaVal );
3228
}
3229
3230
etcpak_force_inline static uint16_t MinError_EAC_NEON( uint16x8_t errProbe )
3231
{
3232
#ifndef __aarch64__
3233
uint16x4_t tmpErr = vpmin_u16( vget_low_u16( errProbe ), vget_high_u16( errProbe ) );
3234
tmpErr = vpmin_u16( tmpErr, tmpErr );
3235
return vpmin_u16( tmpErr, tmpErr )[0];
3236
#else
3237
return vminvq_u16( errProbe );
3238
#endif
3239
}
3240
3241
template <int Index>
3242
etcpak_force_inline static uint64_t MinErrorIndex_EAC_NEON( uint8x8_t recVal, uint8x16_t alphaBlock )
3243
{
3244
uint16x8_t errProbe = ErrorProbe_EAC_NEON<Index>( recVal, alphaBlock );
3245
uint16x8_t minErrMask = vceqq_u16( errProbe, vdupq_n_u16( MinError_EAC_NEON( errProbe ) ) );
3246
uint64_t idx = __builtin_ctzll( vget_lane_u64( vreinterpret_u64_u8( vqmovn_u16( minErrMask ) ), 0 ) );
3247
idx >>= 3;
3248
idx <<= 45 - Index * 3;
3249
3250
return idx;
3251
}
3252
3253
template <int Index>
3254
etcpak_force_inline static int16x8_t WidenMultiplier_EAC_NEON( int16x8_t multipliers )
3255
{
3256
constexpr int Lane = GetMulSel( Index );
3257
#ifndef __aarch64__
3258
if( Lane < 4 )
3259
return vdupq_lane_s16( vget_low_s16( multipliers ), ClampConstant( Lane, 0, 3 ) );
3260
else
3261
return vdupq_lane_s16( vget_high_s16( multipliers ), ClampConstant( Lane - 4, 0, 3 ) );
3262
#else
3263
return vdupq_laneq_s16( multipliers, Lane );
3264
#endif
3265
}
3266
3267
#endif
3268
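// MinErrorIndex_EAC_NEON() shifts each pixel's 3-bit selector to
// bit 45 - 3*pixel, the same layout the scalar path builds further down with
// its `offset = 45; offset -= 3` loop. A compiled-out scalar restatement of
// the index packing (PackEacIndices is an illustration only):
#if 0
static uint64_t PackEacIndices( const uint8_t idx[16] )
{
    uint64_t d = 0;
    for( int i = 0; i < 16; i++ )
    {
        d |= uint64_t( idx[i] & 0x7 ) << ( 45 - i * 3 );
    }
    return d;
}
#endif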
3269
template<bool checkSolid = true>
3270
static etcpak_force_inline uint64_t ProcessAlpha_ETC2( const uint8_t* src )
3271
{
3272
#if defined __SSE4_1__
3273
__m128i s = _mm_loadu_si128( (__m128i*)src );
3274
3275
if( checkSolid )
3276
{
3277
// Check solid
3278
__m128i solidCmp = _mm_set1_epi8( src[0] );
3279
__m128i cmpRes = _mm_cmpeq_epi8( s, solidCmp );
3280
if( _mm_testc_si128( cmpRes, _mm_set1_epi32( -1 ) ) )
3281
{
3282
return src[0];
3283
}
3284
}
3285
3286
// Calculate min, max
3287
__m128i s1 = _mm_shuffle_epi32( s, _MM_SHUFFLE( 2, 3, 0, 1 ) );
3288
__m128i max1 = _mm_max_epu8( s, s1 );
3289
__m128i min1 = _mm_min_epu8( s, s1 );
3290
__m128i smax2 = _mm_shuffle_epi32( max1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
3291
__m128i smin2 = _mm_shuffle_epi32( min1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
3292
__m128i max2 = _mm_max_epu8( max1, smax2 );
3293
__m128i min2 = _mm_min_epu8( min1, smin2 );
3294
__m128i smax3 = _mm_alignr_epi8( max2, max2, 2 );
3295
__m128i smin3 = _mm_alignr_epi8( min2, min2, 2 );
3296
__m128i max3 = _mm_max_epu8( max2, smax3 );
3297
__m128i min3 = _mm_min_epu8( min2, smin3 );
3298
__m128i smax4 = _mm_alignr_epi8( max3, max3, 1 );
3299
__m128i smin4 = _mm_alignr_epi8( min3, min3, 1 );
3300
__m128i max = _mm_max_epu8( max3, smax4 );
3301
__m128i min = _mm_min_epu8( min3, smin4 );
3302
__m128i max16 = _mm_unpacklo_epi8( max, _mm_setzero_si128() );
3303
__m128i min16 = _mm_unpacklo_epi8( min, _mm_setzero_si128() );
3304
3305
// src range, mid
3306
__m128i srcRange = _mm_sub_epi16( max16, min16 );
3307
__m128i srcRangeHalf = _mm_srli_epi16( srcRange, 1 );
3308
__m128i srcMid = _mm_add_epi16( min16, srcRangeHalf );
3309
3310
// multiplier
3311
__m128i mul1 = _mm_mulhi_epi16( srcRange, g_alphaRange_SIMD );
3312
__m128i mul = _mm_add_epi16( mul1, _mm_set1_epi16( 1 ) );
3313
3314
// wide source
3315
__m128i s16_1 = _mm_shuffle_epi32( s, _MM_SHUFFLE( 3, 2, 3, 2 ) );
3316
__m128i s16[2] = { _mm_unpacklo_epi8( s, _mm_setzero_si128() ), _mm_unpacklo_epi8( s16_1, _mm_setzero_si128() ) };
3317
3318
__m128i sr[16] = {
3319
Widen<0>( s16[0] ),
3320
Widen<1>( s16[0] ),
3321
Widen<2>( s16[0] ),
3322
Widen<3>( s16[0] ),
3323
Widen<4>( s16[0] ),
3324
Widen<5>( s16[0] ),
3325
Widen<6>( s16[0] ),
3326
Widen<7>( s16[0] ),
3327
Widen<0>( s16[1] ),
3328
Widen<1>( s16[1] ),
3329
Widen<2>( s16[1] ),
3330
Widen<3>( s16[1] ),
3331
Widen<4>( s16[1] ),
3332
Widen<5>( s16[1] ),
3333
Widen<6>( s16[1] ),
3334
Widen<7>( s16[1] )
3335
};
#ifdef __AVX2__
3338
__m256i srcRangeWide = _mm256_broadcastsi128_si256( srcRange );
3339
__m256i srcMidWide = _mm256_broadcastsi128_si256( srcMid );
3340
3341
__m256i mulWide1 = _mm256_mulhi_epi16( srcRangeWide, g_alphaRange_AVX );
3342
__m256i mulWide = _mm256_add_epi16( mulWide1, _mm256_set1_epi16( 1 ) );
3343
3344
__m256i modMul[8] = {
3345
_mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[0] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[0] ) ) ), _mm256_setzero_si256() ),
3346
_mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[1] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[1] ) ) ), _mm256_setzero_si256() ),
3347
_mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[2] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[2] ) ) ), _mm256_setzero_si256() ),
3348
_mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[3] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[3] ) ) ), _mm256_setzero_si256() ),
3349
_mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[4] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[4] ) ) ), _mm256_setzero_si256() ),
3350
_mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[5] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[5] ) ) ), _mm256_setzero_si256() ),
3351
_mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[6] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[6] ) ) ), _mm256_setzero_si256() ),
3352
_mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[7] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[7] ) ) ), _mm256_setzero_si256() ),
3353
};
3354
3355
// find selector
3356
__m256i mulErr = _mm256_setzero_si256();
3357
for( int j=0; j<16; j++ )
3358
{
3359
__m256i s16Wide = _mm256_broadcastsi128_si256( sr[j] );
3360
__m256i err1, err2;
3361
3362
err1 = _mm256_sub_epi16( s16Wide, modMul[0] );
3363
__m256i localErr = _mm256_mullo_epi16( err1, err1 );
3364
3365
err1 = _mm256_sub_epi16( s16Wide, modMul[1] );
3366
err2 = _mm256_mullo_epi16( err1, err1 );
3367
localErr = _mm256_min_epu16( localErr, err2 );
3368
3369
err1 = _mm256_sub_epi16( s16Wide, modMul[2] );
3370
err2 = _mm256_mullo_epi16( err1, err1 );
3371
localErr = _mm256_min_epu16( localErr, err2 );
3372
3373
err1 = _mm256_sub_epi16( s16Wide, modMul[3] );
3374
err2 = _mm256_mullo_epi16( err1, err1 );
3375
localErr = _mm256_min_epu16( localErr, err2 );
3376
3377
err1 = _mm256_sub_epi16( s16Wide, modMul[4] );
3378
err2 = _mm256_mullo_epi16( err1, err1 );
3379
localErr = _mm256_min_epu16( localErr, err2 );
3380
3381
err1 = _mm256_sub_epi16( s16Wide, modMul[5] );
3382
err2 = _mm256_mullo_epi16( err1, err1 );
3383
localErr = _mm256_min_epu16( localErr, err2 );
3384
3385
err1 = _mm256_sub_epi16( s16Wide, modMul[6] );
3386
err2 = _mm256_mullo_epi16( err1, err1 );
3387
localErr = _mm256_min_epu16( localErr, err2 );
3388
3389
err1 = _mm256_sub_epi16( s16Wide, modMul[7] );
3390
err2 = _mm256_mullo_epi16( err1, err1 );
3391
localErr = _mm256_min_epu16( localErr, err2 );
3392
// the saturating add can clamp the accumulated error at 0xffff, but since we only compare errors to find the minimum, that is acceptable
mulErr = _mm256_adds_epu16( mulErr, localErr );
3395
}
3396
uint64_t minPos1 = _mm_cvtsi128_si64( _mm_minpos_epu16( _mm256_castsi256_si128( mulErr ) ) );
3397
uint64_t minPos2 = _mm_cvtsi128_si64( _mm_minpos_epu16( _mm256_extracti128_si256( mulErr, 1 ) ) );
3398
int sel = ( ( minPos1 & 0xFFFF ) < ( minPos2 & 0xFFFF ) ) ? ( minPos1 >> 16 ) : ( 8 + ( minPos2 >> 16 ) );
3399
3400
__m128i recVal16;
3401
switch( sel )
3402
{
3403
case 0:
3404
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ) ), _mm_setzero_si128() );
3405
break;
3406
case 1:
3407
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ) ), _mm_setzero_si128() );
3408
break;
3409
case 2:
3410
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ) ), _mm_setzero_si128() );
3411
break;
3412
case 3:
3413
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ) ), _mm_setzero_si128() );
3414
break;
3415
case 4:
3416
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ) ), _mm_setzero_si128() );
3417
break;
3418
case 5:
3419
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ) ), _mm_setzero_si128() );
3420
break;
3421
case 6:
3422
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ) ), _mm_setzero_si128() );
3423
break;
3424
case 7:
3425
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ) ), _mm_setzero_si128() );
3426
break;
3427
case 8:
3428
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ) ), _mm_setzero_si128() );
3429
break;
3430
case 9:
3431
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ) ), _mm_setzero_si128() );
3432
break;
3433
case 10:
3434
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ) ), _mm_setzero_si128() );
3435
break;
3436
case 11:
3437
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ) ), _mm_setzero_si128() );
3438
break;
3439
case 12:
3440
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ) ), _mm_setzero_si128() );
3441
break;
3442
case 13:
3443
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ) ), _mm_setzero_si128() );
3444
break;
3445
case 14:
3446
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ) ), _mm_setzero_si128() );
3447
break;
3448
case 15:
3449
recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ) ), _mm_setzero_si128() );
3450
break;
3451
default:
3452
assert( false );
3453
break;
3454
}
3455
#else
3456
// wide multiplier
3457
__m128i rangeMul[16] = {
3458
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ) ), _mm_setzero_si128() ),
3459
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ) ), _mm_setzero_si128() ),
3460
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ) ), _mm_setzero_si128() ),
3461
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ) ), _mm_setzero_si128() ),
3462
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ) ), _mm_setzero_si128() ),
3463
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ) ), _mm_setzero_si128() ),
3464
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ) ), _mm_setzero_si128() ),
3465
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ) ), _mm_setzero_si128() ),
3466
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ) ), _mm_setzero_si128() ),
3467
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ) ), _mm_setzero_si128() ),
3468
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ) ), _mm_setzero_si128() ),
3469
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ) ), _mm_setzero_si128() ),
3470
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ) ), _mm_setzero_si128() ),
3471
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ) ), _mm_setzero_si128() ),
3472
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ) ), _mm_setzero_si128() ),
3473
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ) ), _mm_setzero_si128() )
3474
};
3475
3476
// find selector
3477
int err = std::numeric_limits<int>::max();
3478
int sel;
3479
for( int r=0; r<16; r++ )
3480
{
3481
__m128i err1, err2, minerr;
3482
__m128i recVal16 = rangeMul[r];
3483
int rangeErr;
3484
3485
err1 = _mm_sub_epi16( sr[0], recVal16 );
3486
err2 = _mm_mullo_epi16( err1, err1 );
3487
minerr = _mm_minpos_epu16( err2 );
3488
rangeErr = _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3489
3490
err1 = _mm_sub_epi16( sr[1], recVal16 );
3491
err2 = _mm_mullo_epi16( err1, err1 );
3492
minerr = _mm_minpos_epu16( err2 );
3493
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3494
3495
err1 = _mm_sub_epi16( sr[2], recVal16 );
3496
err2 = _mm_mullo_epi16( err1, err1 );
3497
minerr = _mm_minpos_epu16( err2 );
3498
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3499
3500
err1 = _mm_sub_epi16( sr[3], recVal16 );
3501
err2 = _mm_mullo_epi16( err1, err1 );
3502
minerr = _mm_minpos_epu16( err2 );
3503
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3504
3505
err1 = _mm_sub_epi16( sr[4], recVal16 );
3506
err2 = _mm_mullo_epi16( err1, err1 );
3507
minerr = _mm_minpos_epu16( err2 );
3508
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3509
3510
err1 = _mm_sub_epi16( sr[5], recVal16 );
3511
err2 = _mm_mullo_epi16( err1, err1 );
3512
minerr = _mm_minpos_epu16( err2 );
3513
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3514
3515
err1 = _mm_sub_epi16( sr[6], recVal16 );
3516
err2 = _mm_mullo_epi16( err1, err1 );
3517
minerr = _mm_minpos_epu16( err2 );
3518
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3519
3520
err1 = _mm_sub_epi16( sr[7], recVal16 );
3521
err2 = _mm_mullo_epi16( err1, err1 );
3522
minerr = _mm_minpos_epu16( err2 );
3523
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3524
3525
err1 = _mm_sub_epi16( sr[8], recVal16 );
3526
err2 = _mm_mullo_epi16( err1, err1 );
3527
minerr = _mm_minpos_epu16( err2 );
3528
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3529
3530
err1 = _mm_sub_epi16( sr[9], recVal16 );
3531
err2 = _mm_mullo_epi16( err1, err1 );
3532
minerr = _mm_minpos_epu16( err2 );
3533
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3534
3535
err1 = _mm_sub_epi16( sr[10], recVal16 );
3536
err2 = _mm_mullo_epi16( err1, err1 );
3537
minerr = _mm_minpos_epu16( err2 );
3538
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3539
3540
err1 = _mm_sub_epi16( sr[11], recVal16 );
3541
err2 = _mm_mullo_epi16( err1, err1 );
3542
minerr = _mm_minpos_epu16( err2 );
3543
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3544
3545
err1 = _mm_sub_epi16( sr[12], recVal16 );
3546
err2 = _mm_mullo_epi16( err1, err1 );
3547
minerr = _mm_minpos_epu16( err2 );
3548
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3549
3550
err1 = _mm_sub_epi16( sr[13], recVal16 );
3551
err2 = _mm_mullo_epi16( err1, err1 );
3552
minerr = _mm_minpos_epu16( err2 );
3553
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3554
3555
err1 = _mm_sub_epi16( sr[14], recVal16 );
3556
err2 = _mm_mullo_epi16( err1, err1 );
3557
minerr = _mm_minpos_epu16( err2 );
3558
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3559
3560
err1 = _mm_sub_epi16( sr[15], recVal16 );
3561
err2 = _mm_mullo_epi16( err1, err1 );
3562
minerr = _mm_minpos_epu16( err2 );
3563
rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
3564
3565
if( rangeErr < err )
3566
{
3567
err = rangeErr;
3568
sel = r;
3569
if( err == 0 ) break;
3570
}
3571
}
3572
3573
__m128i recVal16 = rangeMul[sel];
3574
#endif
3575
3576
// find indices
3577
__m128i err1, err2, minerr;
3578
uint64_t idx = 0, tmp;
3579
3580
err1 = _mm_sub_epi16( sr[0], recVal16 );
3581
err2 = _mm_mullo_epi16( err1, err1 );
3582
minerr = _mm_minpos_epu16( err2 );
3583
tmp = _mm_cvtsi128_si64( minerr );
3584
idx |= ( tmp >> 16 ) << 15*3;
3585
3586
err1 = _mm_sub_epi16( sr[1], recVal16 );
3587
err2 = _mm_mullo_epi16( err1, err1 );
3588
minerr = _mm_minpos_epu16( err2 );
3589
tmp = _mm_cvtsi128_si64( minerr );
3590
idx |= ( tmp >> 16 ) << 14*3;
3591
3592
err1 = _mm_sub_epi16( sr[2], recVal16 );
3593
err2 = _mm_mullo_epi16( err1, err1 );
3594
minerr = _mm_minpos_epu16( err2 );
3595
tmp = _mm_cvtsi128_si64( minerr );
3596
idx |= ( tmp >> 16 ) << 13*3;
3597
3598
err1 = _mm_sub_epi16( sr[3], recVal16 );
3599
err2 = _mm_mullo_epi16( err1, err1 );
3600
minerr = _mm_minpos_epu16( err2 );
3601
tmp = _mm_cvtsi128_si64( minerr );
3602
idx |= ( tmp >> 16 ) << 12*3;
3603
3604
err1 = _mm_sub_epi16( sr[4], recVal16 );
3605
err2 = _mm_mullo_epi16( err1, err1 );
3606
minerr = _mm_minpos_epu16( err2 );
3607
tmp = _mm_cvtsi128_si64( minerr );
3608
idx |= ( tmp >> 16 ) << 11*3;
3609
3610
err1 = _mm_sub_epi16( sr[5], recVal16 );
3611
err2 = _mm_mullo_epi16( err1, err1 );
3612
minerr = _mm_minpos_epu16( err2 );
3613
tmp = _mm_cvtsi128_si64( minerr );
3614
idx |= ( tmp >> 16 ) << 10*3;
3615
3616
err1 = _mm_sub_epi16( sr[6], recVal16 );
3617
err2 = _mm_mullo_epi16( err1, err1 );
3618
minerr = _mm_minpos_epu16( err2 );
3619
tmp = _mm_cvtsi128_si64( minerr );
3620
idx |= ( tmp >> 16 ) << 9*3;
3621
3622
err1 = _mm_sub_epi16( sr[7], recVal16 );
3623
err2 = _mm_mullo_epi16( err1, err1 );
3624
minerr = _mm_minpos_epu16( err2 );
3625
tmp = _mm_cvtsi128_si64( minerr );
3626
idx |= ( tmp >> 16 ) << 8*3;
3627
3628
err1 = _mm_sub_epi16( sr[8], recVal16 );
3629
err2 = _mm_mullo_epi16( err1, err1 );
3630
minerr = _mm_minpos_epu16( err2 );
3631
tmp = _mm_cvtsi128_si64( minerr );
3632
idx |= ( tmp >> 16 ) << 7*3;
3633
3634
err1 = _mm_sub_epi16( sr[9], recVal16 );
3635
err2 = _mm_mullo_epi16( err1, err1 );
3636
minerr = _mm_minpos_epu16( err2 );
3637
tmp = _mm_cvtsi128_si64( minerr );
3638
idx |= ( tmp >> 16 ) << 6*3;
3639
3640
err1 = _mm_sub_epi16( sr[10], recVal16 );
3641
err2 = _mm_mullo_epi16( err1, err1 );
3642
minerr = _mm_minpos_epu16( err2 );
3643
tmp = _mm_cvtsi128_si64( minerr );
3644
idx |= ( tmp >> 16 ) << 5*3;
3645
3646
err1 = _mm_sub_epi16( sr[11], recVal16 );
3647
err2 = _mm_mullo_epi16( err1, err1 );
3648
minerr = _mm_minpos_epu16( err2 );
3649
tmp = _mm_cvtsi128_si64( minerr );
3650
idx |= ( tmp >> 16 ) << 4*3;
3651
3652
err1 = _mm_sub_epi16( sr[12], recVal16 );
3653
err2 = _mm_mullo_epi16( err1, err1 );
3654
minerr = _mm_minpos_epu16( err2 );
3655
tmp = _mm_cvtsi128_si64( minerr );
3656
idx |= ( tmp >> 16 ) << 3*3;
3657
3658
err1 = _mm_sub_epi16( sr[13], recVal16 );
3659
err2 = _mm_mullo_epi16( err1, err1 );
3660
minerr = _mm_minpos_epu16( err2 );
3661
tmp = _mm_cvtsi128_si64( minerr );
3662
idx |= ( tmp >> 16 ) << 2*3;
3663
3664
err1 = _mm_sub_epi16( sr[14], recVal16 );
3665
err2 = _mm_mullo_epi16( err1, err1 );
3666
minerr = _mm_minpos_epu16( err2 );
3667
tmp = _mm_cvtsi128_si64( minerr );
3668
idx |= ( tmp >> 16 ) << 1*3;
3669
3670
err1 = _mm_sub_epi16( sr[15], recVal16 );
3671
err2 = _mm_mullo_epi16( err1, err1 );
3672
minerr = _mm_minpos_epu16( err2 );
3673
tmp = _mm_cvtsi128_si64( minerr );
3674
idx |= ( tmp >> 16 ) << 0*3;
3675
3676
uint16_t rm[8];
3677
_mm_storeu_si128( (__m128i*)rm, mul );
3678
uint16_t sm = _mm_cvtsi128_si64( srcMid );
3679
3680
uint64_t d = ( uint64_t( sm ) << 56 ) |
3681
( uint64_t( rm[GetMulSel( sel )] ) << 52 ) |
3682
( uint64_t( sel ) << 48 ) |
3683
idx;
3684
3685
return _bswap64( d );
3686
#elif defined __ARM_NEON
3687
3688
int16x8_t srcMidWide, multipliers;
3689
int srcMid;
3690
uint8x16_t srcAlphaBlock = vld1q_u8( src );
3691
{
3692
if( checkSolid )
3693
{
3694
uint8_t ref = src[0];
3695
uint8x16_t a0 = vdupq_n_u8( ref );
3696
uint8x16_t r = vceqq_u8( srcAlphaBlock, a0 );
3697
int64x2_t m = vreinterpretq_s64_u8( r );
3698
if( m[0] == -1 && m[1] == -1 )
3699
return ref;
3700
}
3701
3702
// srcRange
3703
#ifdef __aarch64__
3704
uint8_t min = vminvq_u8( srcAlphaBlock );
3705
uint8_t max = vmaxvq_u8( srcAlphaBlock );
3706
uint8_t srcRange = max - min;
3707
multipliers = vqaddq_s16( vshrq_n_s16( vqdmulhq_n_s16( g_alphaRange_NEON, srcRange ), 1 ), vdupq_n_s16( 1 ) );
3708
srcMid = min + srcRange / 2;
3709
srcMidWide = vdupq_n_s16( srcMid );
3710
#else
3711
uint8x8_t vmin = vpmin_u8( vget_low_u8( srcAlphaBlock ), vget_high_u8( srcAlphaBlock ) );
3712
vmin = vpmin_u8( vmin, vmin );
3713
vmin = vpmin_u8( vmin, vmin );
3714
vmin = vpmin_u8( vmin, vmin );
3715
uint8x8_t vmax = vpmax_u8( vget_low_u8( srcAlphaBlock ), vget_high_u8( srcAlphaBlock ) );
3716
vmax = vpmax_u8( vmax, vmax );
3717
vmax = vpmax_u8( vmax, vmax );
3718
vmax = vpmax_u8( vmax, vmax );
3719
3720
int16x8_t srcRangeWide = vreinterpretq_s16_u16( vsubl_u8( vmax, vmin ) );
3721
multipliers = vqaddq_s16( vshrq_n_s16( vqdmulhq_s16( g_alphaRange_NEON, srcRangeWide ), 1 ), vdupq_n_s16( 1 ) );
3722
srcMidWide = vsraq_n_s16( vreinterpretq_s16_u16(vmovl_u8(vmin)), srcRangeWide, 1);
3723
srcMid = vgetq_lane_s16( srcMidWide, 0 );
3724
#endif
3725
}
3726
3727
// calculate reconstructed values
3728
#define EAC_APPLY_16X( m ) m( 0 ) m( 1 ) m( 2 ) m( 3 ) m( 4 ) m( 5 ) m( 6 ) m( 7 ) m( 8 ) m( 9 ) m( 10 ) m( 11 ) m( 12 ) m( 13 ) m( 14 ) m( 15 )
3729
3730
#define EAC_RECONSTRUCT_VALUE( n ) vqmovun_s16( vmlaq_s16( srcMidWide, g_alpha_NEON[n], WidenMultiplier_EAC_NEON<n>( multipliers ) ) ),
3731
uint8x8_t recVals[16] = { EAC_APPLY_16X( EAC_RECONSTRUCT_VALUE ) };
3732
3733
// find selector
3734
int err = std::numeric_limits<int>::max();
3735
int sel = 0;
3736
for( int r = 0; r < 16; r++ )
3737
{
3738
uint8x8_t recVal = recVals[r];
3739
3740
int rangeErr = 0;
3741
#define EAC_ACCUMULATE_ERROR( n ) rangeErr += MinError_EAC_NEON( ErrorProbe_EAC_NEON<n>( recVal, srcAlphaBlock ) );
3742
EAC_APPLY_16X( EAC_ACCUMULATE_ERROR )
3743
3744
if( rangeErr < err )
3745
{
3746
err = rangeErr;
3747
sel = r;
3748
if ( err == 0 ) break;
3749
}
3750
}
3751
3752
// combine results
3753
uint64_t d = ( uint64_t( srcMid ) << 56 ) |
3754
( uint64_t( multipliers[GetMulSel( sel )] ) << 52 ) |
3755
( uint64_t( sel ) << 48);
3756
3757
// generate indices
3758
uint8x8_t recVal = recVals[sel];
3759
#define EAC_INSERT_INDEX(n) d |= MinErrorIndex_EAC_NEON<n>( recVal, srcAlphaBlock );
3760
EAC_APPLY_16X( EAC_INSERT_INDEX )
3761
3762
return _bswap64( d );
3763
3764
#undef EAC_APPLY_16X
3765
#undef EAC_INSERT_INDEX
3766
#undef EAC_ACCUMULATE_ERROR
3767
#undef EAC_RECONSTRUCT_VALUE
3768
3769
#else
3770
if( checkSolid )
3771
{
3772
bool solid = true;
3773
const uint8_t* ptr = src + 1;
3774
const uint8_t ref = *src;
3775
for( int i=1; i<16; i++ )
3776
{
3777
if( ref != *ptr++ )
3778
{
3779
solid = false;
3780
break;
3781
}
3782
}
3783
if( solid )
3784
{
3785
return ref;
3786
}
3787
}
3788
3789
uint8_t min = src[0];
3790
uint8_t max = src[0];
3791
for( int i=1; i<16; i++ )
3792
{
3793
if( min > src[i] ) min = src[i];
3794
else if( max < src[i] ) max = src[i];
3795
}
3796
int srcRange = max - min;
3797
int srcMid = min + srcRange / 2;
3798
3799
uint8_t buf[16][16];
3800
int err = std::numeric_limits<int>::max();
3801
int sel;
3802
int selmul;
3803
for( int r=0; r<16; r++ )
3804
{
3805
int mul = ( ( srcRange * g_alphaRange[r] ) >> 16 ) + 1;
3806
3807
int rangeErr = 0;
3808
for( int i=0; i<16; i++ )
3809
{
3810
const auto srcVal = src[i];
3811
3812
int idx = 0;
3813
const auto modVal = g_alpha[r][0] * mul;
3814
const auto recVal = clampu8( srcMid + modVal );
3815
int localErr = sq( srcVal - recVal );
3816
3817
if( localErr != 0 )
3818
{
3819
for( int j=1; j<8; j++ )
3820
{
3821
const auto modVal = g_alpha[r][j] * mul;
3822
const auto recVal = clampu8( srcMid + modVal );
3823
const auto errProbe = sq( srcVal - recVal );
3824
if( errProbe < localErr )
3825
{
3826
localErr = errProbe;
3827
idx = j;
3828
}
3829
}
3830
}
3831
3832
buf[r][i] = idx;
3833
rangeErr += localErr;
3834
}
3835
3836
if( rangeErr < err )
3837
{
3838
err = rangeErr;
3839
sel = r;
3840
selmul = mul;
3841
if( err == 0 ) break;
3842
}
3843
}
3844
3845
uint64_t d = ( uint64_t( srcMid ) << 56 ) |
3846
( uint64_t( selmul ) << 52 ) |
3847
( uint64_t( sel ) << 48 );
3848
3849
int offset = 45;
3850
auto ptr = buf[sel];
3851
for( int i=0; i<16; i++ )
3852
{
3853
d |= uint64_t( *ptr++ ) << offset;
3854
offset -= 3;
3855
}
3856
3857
return _bswap64( d );
3858
#endif
3859
}
3860
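// Every path of ProcessAlpha_ETC2() assembles the same EAC block layout:
// bits 63..56 base value, 55..52 multiplier, 51..48 table selector,
// 47..0 sixteen 3-bit pixel indices, with the whole word then byte-swapped.
// A compiled-out restatement (AssembleEacBlock is an illustration only):
#if 0
static inline uint64_t AssembleEacBlock( uint8_t base, uint8_t mul, uint8_t table, uint64_t indices48 )
{
    const uint64_t d = ( uint64_t( base ) << 56 ) | ( uint64_t( mul ) << 52 ) | ( uint64_t( table ) << 48 ) | indices48;
    return _bswap64( d );
}
#endif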
3861
void CompressEtc1Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
3862
{
3863
int w = 0;
3864
uint32_t buf[4*4];
3865
do
3866
{
3867
#ifdef __SSE4_1__
3868
__m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
3869
__m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
3870
__m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
3871
__m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
_MM_TRANSPOSE4_PS( px0, px1, px2, px3 );
3874
3875
_mm_store_si128( (__m128i*)(buf + 0), _mm_castps_si128( px0 ) );
3876
_mm_store_si128( (__m128i*)(buf + 4), _mm_castps_si128( px1 ) );
3877
_mm_store_si128( (__m128i*)(buf + 8), _mm_castps_si128( px2 ) );
3878
_mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) );
3879
3880
src += 4;
3881
#else
3882
auto ptr = buf;
3883
for( int x=0; x<4; x++ )
3884
{
3885
*ptr++ = *src;
3886
src += width;
3887
*ptr++ = *src;
3888
src += width;
3889
*ptr++ = *src;
3890
src += width;
3891
*ptr++ = *src;
3892
src -= width * 3 - 1;
3893
}
3894
#endif
3895
if( ++w == width/4 )
3896
{
3897
src += width * 3;
3898
w = 0;
3899
}
3900
*dst++ = ProcessRGB( (uint8_t*)buf );
3901
}
3902
while( --blocks );
3903
}
3904
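// CompressEtc1Rgb() (and every Compress* entry point below) walks the image
// one 4x4 block at a time: the scalar path gathers a column with `src += width`
// and steps back with `src -= width * 3 - 1`, and `src += width * 3` skips to
// the next block row once `width / 4` blocks have been emitted. A hypothetical
// compiled-out caller, assuming a tightly packed image whose dimensions are
// multiples of 4 (illustration only):
#if 0
static void CompressImageEtc1( const uint32_t* rgba, uint64_t* out, size_t width, size_t height )
{
    const uint32_t blocks = uint32_t( ( width / 4 ) * ( height / 4 ) ); // one 8-byte block per 4x4 tile
    CompressEtc1Rgb( rgba, out, blocks, width );
}
#endif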
3905
void CompressEtc1RgbDither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
3906
{
3907
int w = 0;
3908
uint32_t buf[4*4];
3909
do
3910
{
3911
#ifdef __SSE4_1__
3912
__m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
3913
__m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
3914
__m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
3915
__m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
3916
3917
_MM_TRANSPOSE4_PS( px0, px1, px2, px3 );
3918
3919
# ifdef __AVX2__
3920
DitherAvx2( (uint8_t*)buf, _mm_castps_si128( px0 ), _mm_castps_si128( px1 ), _mm_castps_si128( px2 ), _mm_castps_si128( px3 ) );
3921
# else
3922
_mm_store_si128( (__m128i*)(buf + 0), _mm_castps_si128( px0 ) );
3923
_mm_store_si128( (__m128i*)(buf + 4), _mm_castps_si128( px1 ) );
3924
_mm_store_si128( (__m128i*)(buf + 8), _mm_castps_si128( px2 ) );
3925
_mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) );
3926
3927
Dither( (uint8_t*)buf );
3928
# endif
3929
3930
src += 4;
3931
#else
3932
auto ptr = buf;
3933
for( int x=0; x<4; x++ )
3934
{
3935
*ptr++ = *src;
3936
src += width;
3937
*ptr++ = *src;
3938
src += width;
3939
*ptr++ = *src;
3940
src += width;
3941
*ptr++ = *src;
3942
src -= width * 3 - 1;
3943
}
3944
#endif
3945
if( ++w == width/4 )
3946
{
3947
src += width * 3;
3948
w = 0;
3949
}
3950
*dst++ = ProcessRGB( (uint8_t*)buf );
3951
}
3952
while( --blocks );
3953
}
3954
3955
void CompressEtc2Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics )
3956
{
3957
int w = 0;
3958
uint32_t buf[4*4];
3959
do
3960
{
3961
#ifdef __SSE4_1__
3962
__m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
3963
__m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
3964
__m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
3965
__m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
3966
3967
_MM_TRANSPOSE4_PS( px0, px1, px2, px3 );
3968
3969
_mm_store_si128( (__m128i*)(buf + 0), _mm_castps_si128( px0 ) );
3970
_mm_store_si128( (__m128i*)(buf + 4), _mm_castps_si128( px1 ) );
3971
_mm_store_si128( (__m128i*)(buf + 8), _mm_castps_si128( px2 ) );
3972
_mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) );
3973
3974
src += 4;
3975
#else
3976
auto ptr = buf;
3977
for( int x=0; x<4; x++ )
3978
{
3979
*ptr++ = *src;
3980
src += width;
3981
*ptr++ = *src;
3982
src += width;
3983
*ptr++ = *src;
3984
src += width;
3985
*ptr++ = *src;
3986
src -= width * 3 - 1;
3987
}
3988
#endif
3989
if( ++w == width/4 )
3990
{
3991
src += width * 3;
3992
w = 0;
3993
}
3994
*dst++ = ProcessRGB_ETC2( (uint8_t*)buf, useHeuristics );
3995
}
3996
while( --blocks );
3997
}
3998
3999
void CompressEtc2Rgba( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics )
4000
{
4001
int w = 0;
4002
uint32_t rgba[4*4];
4003
uint8_t alpha[4*4];
4004
do
4005
{
4006
#ifdef __SSE4_1__
4007
__m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
4008
__m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
4009
__m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
4010
__m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
4011
4012
_MM_TRANSPOSE4_PS( px0, px1, px2, px3 );
4013
4014
__m128i c0 = _mm_castps_si128( px0 );
4015
__m128i c1 = _mm_castps_si128( px1 );
4016
__m128i c2 = _mm_castps_si128( px2 );
4017
__m128i c3 = _mm_castps_si128( px3 );
4018
4019
_mm_store_si128( (__m128i*)(rgba + 0), c0 );
4020
_mm_store_si128( (__m128i*)(rgba + 4), c1 );
4021
_mm_store_si128( (__m128i*)(rgba + 8), c2 );
4022
_mm_store_si128( (__m128i*)(rgba + 12), c3 );
4023
4024
__m128i mask = _mm_setr_epi32( 0x0f0b0703, -1, -1, -1 );
4025
4026
__m128i a0 = _mm_shuffle_epi8( c0, mask );
4027
__m128i a1 = _mm_shuffle_epi8( c1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
4028
__m128i a2 = _mm_shuffle_epi8( c2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
4029
__m128i a3 = _mm_shuffle_epi8( c3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
4030
4031
__m128i s0 = _mm_or_si128( a0, a1 );
4032
__m128i s1 = _mm_or_si128( a2, a3 );
4033
__m128i s2 = _mm_or_si128( s0, s1 );
4034
4035
_mm_store_si128( (__m128i*)alpha, s2 );
4036
4037
src += 4;
4038
#else
4039
auto ptr = rgba;
4040
auto ptr8 = alpha;
4041
for( int x=0; x<4; x++ )
4042
{
4043
auto v = *src;
4044
*ptr++ = v;
4045
*ptr8++ = v >> 24;
4046
src += width;
4047
v = *src;
4048
*ptr++ = v;
4049
*ptr8++ = v >> 24;
4050
src += width;
4051
v = *src;
4052
*ptr++ = v;
4053
*ptr8++ = v >> 24;
4054
src += width;
4055
v = *src;
4056
*ptr++ = v;
4057
*ptr8++ = v >> 24;
4058
src -= width * 3 - 1;
4059
}
4060
#endif
4061
if( ++w == width/4 )
4062
{
4063
src += width * 3;
4064
w = 0;
4065
}
4066
*dst++ = ProcessAlpha_ETC2<true>( alpha );
4067
*dst++ = ProcessRGB_ETC2( (uint8_t*)rgba, useHeuristics );
4068
}
4069
while( --blocks );
4070
}
4071
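// CompressEtc2Rgba() writes 16 bytes per 4x4 tile: the EAC alpha block first,
// then the ETC2 color block, so the destination needs two uint64_t per block.
// A hypothetical compiled-out caller (illustration only):
#if 0
static void CompressImageEtc2Rgba( const uint32_t* rgba, uint64_t* out, size_t width, size_t height, bool useHeuristics )
{
    const uint32_t blocks = uint32_t( ( width / 4 ) * ( height / 4 ) );
    // `out` must hold 2 * blocks entries: alpha word, then color word, per block.
    CompressEtc2Rgba( rgba, out, blocks, width, useHeuristics );
}
#endif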
4072
void CompressEacR( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
4073
{
4074
int w = 0;
4075
uint8_t r[4*4];
4076
do
4077
{
4078
#ifdef __SSE4_1__
4079
__m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
4080
__m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
4081
__m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
4082
__m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
4083
4084
_MM_TRANSPOSE4_PS( px0, px1, px2, px3 );
4085
4086
__m128i c0 = _mm_castps_si128( px0 );
4087
__m128i c1 = _mm_castps_si128( px1 );
4088
__m128i c2 = _mm_castps_si128( px2 );
4089
__m128i c3 = _mm_castps_si128( px3 );
4090
4091
__m128i mask = _mm_setr_epi32( 0x0e0a0602, -1, -1, -1 );
4092
4093
__m128i a0 = _mm_shuffle_epi8( c0, mask );
4094
__m128i a1 = _mm_shuffle_epi8( c1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
4095
__m128i a2 = _mm_shuffle_epi8( c2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
4096
__m128i a3 = _mm_shuffle_epi8( c3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
4097
4098
__m128i s0 = _mm_or_si128( a0, a1 );
4099
__m128i s1 = _mm_or_si128( a2, a3 );
4100
__m128i s2 = _mm_or_si128( s0, s1 );
4101
4102
_mm_store_si128( (__m128i*)r, s2 );
4103
4104
src += 4;
4105
#else
4106
auto ptr8 = r;
4107
for( int x=0; x<4; x++ )
4108
{
4109
auto v = *src;
4110
*ptr8++ = (v & 0xff0000) >> 16;
4111
src += width;
4112
v = *src;
4113
*ptr8++ = (v & 0xff0000) >> 16;
4114
src += width;
4115
v = *src;
4116
*ptr8++ = (v & 0xff0000) >> 16;
4117
src += width;
4118
v = *src;
4119
*ptr8++ = (v & 0xff0000) >> 16;
4120
src -= width * 3 - 1;
4121
}
4122
#endif
4123
if( ++w == width/4 )
4124
{
4125
src += width * 3;
4126
w = 0;
4127
}
4128
*dst++ = ProcessAlpha_ETC2<false>( r );
4129
}
4130
while( --blocks );
4131
}
4132
4133
void CompressEacRg( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
4134
{
4135
int w = 0;
4136
uint8_t rg[4*4*2];
4137
do
4138
{
4139
#ifdef __SSE4_1__
4140
__m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
4141
__m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
4142
__m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
4143
__m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
4144
4145
_MM_TRANSPOSE4_PS( px0, px1, px2, px3 );
4146
4147
__m128i c0 = _mm_castps_si128( px0 );
4148
__m128i c1 = _mm_castps_si128( px1 );
4149
__m128i c2 = _mm_castps_si128( px2 );
4150
__m128i c3 = _mm_castps_si128( px3 );
4151
4152
__m128i mask = _mm_setr_epi32( 0x0e0a0602, -1, -1, -1 );
4153
4154
__m128i r0 = _mm_shuffle_epi8( c0, mask );
4155
__m128i r1 = _mm_shuffle_epi8( c1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
4156
__m128i r2 = _mm_shuffle_epi8( c2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
4157
__m128i r3 = _mm_shuffle_epi8( c3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
4158
4159
__m128i s0 = _mm_or_si128( r0, r1 );
4160
__m128i s1 = _mm_or_si128( r2, r3 );
4161
__m128i s2 = _mm_or_si128( s0, s1 );
4162
4163
_mm_store_si128( (__m128i*)rg, s2 );
4164
4165
mask = _mm_setr_epi32( 0x0d090501, -1, -1, -1 );
4166
4167
r0 = _mm_shuffle_epi8( c0, mask );
4168
r1 = _mm_shuffle_epi8( c1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
4169
r2 = _mm_shuffle_epi8( c2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
4170
r3 = _mm_shuffle_epi8( c3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
4171
4172
s0 = _mm_or_si128( r0, r1 );
4173
s1 = _mm_or_si128( r2, r3 );
4174
s2 = _mm_or_si128( s0, s1 );
4175
4176
_mm_store_si128( (__m128i*)&rg[16], s2 );
4177
src += 4;
4178
#else
4179
auto ptrr = rg;
4180
auto ptrg = ptrr + 16;
4181
for( int x=0; x<4; x++ )
4182
{
4183
auto v = *src;
4184
*ptrr++ = (v & 0xff0000) >> 16;
4185
*ptrg++ = (v & 0xff00) >> 8;
4186
src += width;
4187
v = *src;
4188
*ptrr++ = (v & 0xff0000) >> 16;
4189
*ptrg++ = (v & 0xff00) >> 8;
4190
src += width;
4191
v = *src;
4192
*ptrr++ = (v & 0xff0000) >> 16;
4193
*ptrg++ = (v & 0xff00) >> 8;
4194
src += width;
4195
v = *src;
4196
*ptrr++ = (v & 0xff0000) >> 16;
4197
*ptrg++ = (v & 0xff00) >> 8;
4198
src -= width * 3 - 1;
4199
}
4200
#endif
4201
if( ++w == width/4 )
4202
{
4203
src += width * 3;
4204
w = 0;
4205
}
4206
*dst++ = ProcessAlpha_ETC2<false>( rg );
4207
*dst++ = ProcessAlpha_ETC2<false>( &rg[16] );
4208
}
4209
while( --blocks );
4210
}
4211
4212
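// CompressEacRg() splits each pixel into its R and G bytes and encodes them as
// two independent single-channel EAC blocks per 4x4 tile, the R block first and
// the G block second. A hypothetical compiled-out caller (illustration only):
#if 0
static void CompressImageEacRg( const uint32_t* rgba, uint64_t* out, size_t width, size_t height )
{
    const uint32_t blocks = uint32_t( ( width / 4 ) * ( height / 4 ) );
    CompressEacRg( rgba, out, blocks, width ); // `out` must hold 2 * blocks entries
}
#endif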