CoCalc -- ConvectionKernels

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/cvtt/ConvectionKernels_ETC.cpp
⁹⁹⁰² views
1
/*
2
Convection Texture Tools
3
Copyright (c) 2018-2019 Eric Lasota
4

5
Permission is hereby granted, free of charge, to any person obtaining
6
a copy of this software and associated documentation files (the
7
"Software"), to deal in the Software without restriction, including
8
without limitation the rights to use, copy, modify, merge, publish,
9
distribute, sublicense, and/or sell copies of the Software, and to
10
permit persons to whom the Software is furnished to do so, subject
11
to the following conditions:
12

13
The above copyright notice and this permission notice shall be included
14
in all copies or substantial portions of the Software.
15

16
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23

24
-------------------------------------------------------------------------------------
25

26
Portions based on DirectX Texture Library (DirectXTex)
27

28
Copyright (c) Microsoft Corporation. All rights reserved.
29
Licensed under the MIT License.
30

31
http://go.microsoft.com/fwlink/?LinkId=248926
32
*/
33
#include "ConvectionKernels_Config.h"
34

35
#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
36

37
#include "ConvectionKernels.h"
38
#include "ConvectionKernels_ETC.h"
39
#include "ConvectionKernels_ETC1.h"
40
#include "ConvectionKernels_ETC2.h"
41
#include "ConvectionKernels_ETC2_Rounding.h"
42
#include "ConvectionKernels_ParallelMath.h"
43
#include "ConvectionKernels_FakeBT709_Rounding.h"
44

45
#include <cmath>
46

47
// Two ways of partitioning the 16 pixel indexes (0..15) of a block into two groups of 8.
// Layout is [table][group][entry]; within each table the two groups are disjoint and together cover 0..15.
// NOTE(review): presumably the first index is the ETC "flip" bit and pixels are numbered x + 4*y — confirm against callers.
const int cvtt::Internal::ETCComputer::g_flipTables[2][2][8] =
{
    // Table 0: indexes whose low two bits are 0/1 in the first group, 2/3 in the second
    {
        {  0,  1,  4,  5,  8,  9, 12, 13 },
        {  2,  3,  6,  7, 10, 11, 14, 15 },
    },
    // Table 1: indexes 0..7 in the first group, 8..15 in the second
    {
        {  0,  1,  2,  3,  4,  5,  6,  7 },
        {  8,  9, 10, 11, 12, 13, 14, 15 },
    },
};
58

59
// Returns the sum over the three channels of (pixelA[ch] - pixelB[ch])^2, with every channel weighted equally.
cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorUniform(const MUInt15 pixelA[3], const MUInt15 pixelB[3])
{
    MFloat channelDelta[3];
    for (int ch = 0; ch < 3; ch++)
    {
        // Subtract as signed 16-bit so that a negative difference is representable
        MSInt16 signedDelta = ParallelMath::LosslessCast<MSInt16>::Cast(pixelA[ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixelB[ch]);
        channelDelta[ch] = ParallelMath::ToFloat(signedDelta);
    }

    // Accumulate in channel order 0, 1, 2
    MFloat sumOfSquares = channelDelta[0] * channelDelta[0];
    sumOfSquares = sumOfSquares + channelDelta[1] * channelDelta[1];
    sumOfSquares = sumOfSquares + channelDelta[2] * channelDelta[2];
    return sumOfSquares;
}
72

73
// Returns the squared distance between a reconstructed color and a target pixel in weighted RGB space.
// Each reconstructed channel is scaled by its weight from options here; preWeightedPixel is subtracted
// as-is, so the caller is expected to have applied the same weights to it already.
cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorWeighted(const MUInt15 reconstructed[3], const MFloat preWeightedPixel[3], const Options options)
{
    MFloat weightedDelta[3];
    weightedDelta[0] = ParallelMath::ToFloat(reconstructed[0]) * options.redWeight - preWeightedPixel[0];
    weightedDelta[1] = ParallelMath::ToFloat(reconstructed[1]) * options.greenWeight - preWeightedPixel[1];
    weightedDelta[2] = ParallelMath::ToFloat(reconstructed[2]) * options.blueWeight - preWeightedPixel[2];

    MFloat squaredDistance = weightedDelta[0] * weightedDelta[0] + weightedDelta[1] * weightedDelta[1] + weightedDelta[2] * weightedDelta[2];
    return squaredDistance;
}
81

82
// Converts the reconstructed color with ConvertToFakeBT709 and returns its squared distance to
// preWeightedPixel, which is subtracted component-wise from the converted value.
cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorFakeBT709(const MUInt15 reconstructed[3], const MFloat preWeightedPixel[3])
{
    MFloat converted[3];
    ConvertToFakeBT709(converted, reconstructed);

    MFloat componentDelta[3];
    for (int component = 0; component < 3; component++)
        componentDelta[component] = converted[component] - preWeightedPixel[component];

    return componentDelta[0] * componentDelta[0] + componentDelta[1] * componentDelta[1] + componentDelta[2] * componentDelta[2];
}
93

94
// Scores one 8-pixel half block against a single base color and a 4-entry modifier table.
//
// quantizedPackedColor holds one 5-bit field per channel (channel ch at bit ch*5). The field is
// expanded to 8 bits, each of the four modifiers is added with clamping to [0, 255], and every pixel
// picks whichever of the four resulting colors has the lowest error under the metric selected by
// options.flags. outError receives the summed per-pixel minimum error and outSelectors receives the
// chosen indexes packed two bits per pixel (pixel px at bit px*2).
void cvtt::Internal::ETCComputer::TestHalfBlock(MFloat &outError, MUInt16 &outSelectors, MUInt15 quantizedPackedColor, const MUInt15 pixels[8][3], const MFloat preWeightedPixels[8][3], const MSInt16 modifiers[4], bool isDifferential, const Options &options)
{
    bool useFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
    bool useUniform = ((options.flags & cvtt::Flags::Uniform) != 0);

    MUInt15 channelUpperBound = ParallelMath::MakeUInt15(255);
    MSInt16 channelLowerBound = ParallelMath::MakeSInt16(0);

    // Build the four candidate colors, one channel at a time
    MUInt15 candidates[4][3];
    for (int ch = 0; ch < 3; ch++)
    {
        MUInt15 field = (ParallelMath::RightShift(quantizedPackedColor, (ch * 5)) & ParallelMath::MakeUInt15(31));

        // Differential colors replicate the top 3 of 5 bits; otherwise the low 4 bits are duplicated
        MUInt15 expanded;
        if (isDifferential)
            expanded = (field << 3) | ParallelMath::RightShift(field, 2);
        else
            expanded = (field << 4) | field;

        for (unsigned int s = 0; s < 4; s++)
            candidates[s][ch] = ParallelMath::Min(ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::ToSInt16(expanded) + modifiers[s], channelLowerBound)), channelUpperBound);
    }

    MUInt16 packedSelectors = ParallelMath::MakeUInt16(0);
    MFloat accumulatedError = ParallelMath::MakeFloatZero();

    for (int px = 0; px < 8; px++)
    {
        MFloat lowestError = ParallelMath::MakeFloat(FLT_MAX);
        MUInt16 chosenSelector = ParallelMath::MakeUInt16(0);

        for (unsigned int s = 0; s < 4; s++)
        {
            MFloat candidateError;
            if (useFakeBT709)
                candidateError = ComputeErrorFakeBT709(candidates[s], preWeightedPixels[px]);
            else if (useUniform)
                candidateError = ComputeErrorUniform(pixels[px], candidates[s]);
            else
                candidateError = ComputeErrorWeighted(candidates[s], preWeightedPixels[px], options);

            // Strict comparison: on a tie the earlier selector is kept
            ParallelMath::FloatCompFlag improves = ParallelMath::Less(candidateError, lowestError);
            chosenSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(improves), ParallelMath::MakeUInt16(s), chosenSelector);
            lowestError = ParallelMath::Min(candidateError, lowestError);
        }

        accumulatedError = accumulatedError + lowestError;
        packedSelectors = packedSelectors | (chosenSelector << (px * 2));
    }

    outError = accumulatedError;
    outSelectors = packedSelectors;
}
150

151
// Scores one 8-pixel half block against a single base color for the punchthrough variant.
//
// quantizedPackedColor holds one 5-bit field per channel (channel ch at bit ch*5), expanded to 8 bits
// by replicating the top 3 bits. Opaque pixels choose among three colors: base - modifier,
// base, and base + modifier (clamped to [0, 255]). Pixels flagged in isTransparent contribute zero
// error and always receive selector 1. outError receives the summed error and outSelectors the
// selectors packed two bits per pixel (pixel px at bit px*2).
void cvtt::Internal::ETCComputer::TestHalfBlockPunchthrough(MFloat &outError, MUInt16 &outSelectors, MUInt15 quantizedPackedColor, const MUInt15 pixels[8][3], const MFloat preWeightedPixels[8][3], const ParallelMath::Int16CompFlag isTransparent[8], const MUInt15 modifier, const Options &options)
{
    MUInt15 quantized[3];
    MUInt15 unquantized[3];

    for (int ch = 0; ch < 3; ch++)
    {
        quantized[ch] = (ParallelMath::RightShift(quantizedPackedColor, (ch * 5)) & ParallelMath::MakeUInt15(31));
        unquantized[ch] = (quantized[ch] << 3) | ParallelMath::RightShift(quantized[ch], 2);
    }

    MUInt16 selectors = ParallelMath::MakeUInt16(0);
    MFloat totalError = ParallelMath::MakeFloatZero();

    MUInt15 u15_255 = ParallelMath::MakeUInt15(255);

    MUInt15 unquantizedModified[3][3];
    for (int ch = 0; ch < 3; ch++)
    {
        // max(c, m) - m is an unsigned-safe way of computing max(c - m, 0)
        unquantizedModified[0][ch] = ParallelMath::Max(unquantized[ch], modifier) - modifier;
        unquantizedModified[1][ch] = unquantized[ch];
        unquantizedModified[2][ch] = ParallelMath::Min(unquantized[ch] + modifier, u15_255);
    }

    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);

    for (int px = 0; px < 8; px++)
    {
        ParallelMath::FloatCompFlag isTransparentFloat = ParallelMath::Int16FlagToFloat(isTransparent[px]);

        MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
        MUInt15 bestSelector = ParallelMath::MakeUInt15(0);

        for (unsigned int s = 0; s < 3; s++)
        {
            MFloat error;
            if (isFakeBT709)
                error = ComputeErrorFakeBT709(unquantizedModified[s], preWeightedPixels[px]);
            else if (isUniform)
                error = ComputeErrorUniform(pixels[px], unquantizedModified[s]);
            else
                error = ComputeErrorWeighted(unquantizedModified[s], preWeightedPixels[px], options);

            // Strict comparison: on a tie the earlier candidate is kept
            ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
            bestSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(s), bestSelector);
            bestError = ParallelMath::Min(error, bestError);
        }

        // Annoying quirk: The ETC encoding machinery assumes that selectors are in the table order in the spec, which isn't
        // the same as their encoding bits, so the transparent index is actually 1 and the valid indexes are 0, 2, and 3.

        // Remap candidate 1 to selector 2 and candidate 2 to selector 3 (0 stays 0): doubling gives 0, 2, 4 and the clamp turns 4 into 3
        bestSelector = ParallelMath::Min(ParallelMath::MakeUInt15(3), bestSelector << 1);

        // Transparent pixels contribute zero error and always use selector 1
        ParallelMath::ConditionalSet(bestError, isTransparentFloat, ParallelMath::MakeFloatZero());
        ParallelMath::ConditionalSet(bestSelector, isTransparent[px], ParallelMath::MakeUInt15(1));

        totalError = totalError + bestError;
        selectors = selectors | (ParallelMath::LosslessCast<MUInt16>::Cast(bestSelector) << (px * 2));
    }

    outError = totalError;
    outSelectors = selectors;
}
218

219
// For each parallel lane, picks the best pair of per-sector attempts recorded in drs and, if their
// combined error beats the lane's current bestTotalError, records that pair in the best* outputs.
//
// flip and d are not interpreted here; they are stored into bestFlip and bestD when a lane improves.
// canIgnoreSector[n] marks lanes where sector n's color may be replaced by the other sector's color,
// in which case the pair is accepted without a legality check.
// drs supplies the per-sector attempt lists (errors, selectors, colors, tables and counts) and the
// attemptSortIndexes scratch arrays, which are overwritten on the slow path.
//
// Outputs are only modified for lanes where an improving, legal combination is found.
void cvtt::Internal::ETCComputer::FindBestDifferentialCombination(int flip, int d, const ParallelMath::Int16CompFlag canIgnoreSector[2], ParallelMath::Int16CompFlag& bestIsThisMode, MFloat& bestTotalError, MUInt15& bestFlip, MUInt15& bestD, MUInt15 bestColors[2], MUInt16 bestSelectors[2], MUInt15 bestTables[2], DifferentialResolveStorage &drs)
{
    // Orders attempt indexes by ascending error for one lane, breaking ties by index so that the
    // resulting order is fully determined.
    struct SortPredicate
    {
        const MFloat *diffErrors;
        int block;

        bool operator()(uint16_t a, uint16_t b) const
        {
            float errorA = ParallelMath::Extract(diffErrors[a], block);
            float errorB = ParallelMath::Extract(diffErrors[b], block);

            if (errorA < errorB)
                return true;
            if (errorA > errorB)
                return false;

            return a < b;
        }
    };

    // We do this part scalar because most of the cost benefit of parallelization is in error evaluation,
    // and this code has a LOT of early-outs and disjointed index lookups that vary heavily between blocks
    // and save a lot of time.
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
    {
        bool canIgnore[2] = { ParallelMath::Extract(canIgnoreSector[0], block), ParallelMath::Extract(canIgnoreSector[1], block) };
        bool canIgnoreEither = canIgnore[0] || canIgnore[1];
        float blockBestTotalError = ParallelMath::Extract(bestTotalError, block);
        float bestDiffErrors[2] = { FLT_MAX, FLT_MAX };
        uint16_t bestDiffSelectors[2] = { 0, 0 };
        uint16_t bestDiffColors[2] = { 0, 0 };
        uint16_t bestDiffTables[2] = { 0, 0 };

        // Find the lowest-error attempt of each sector independently
        for (int sector = 0; sector < 2; sector++)
        {
            unsigned int sectorNumAttempts = ParallelMath::Extract(drs.diffNumAttempts[sector], block);
            for (unsigned int i = 0; i < sectorNumAttempts; i++)
            {
                float error = ParallelMath::Extract(drs.diffErrors[sector][i], block);
                if (error < bestDiffErrors[sector])
                {
                    bestDiffErrors[sector] = error;
                    bestDiffSelectors[sector] = ParallelMath::Extract(drs.diffSelectors[sector][i], block);
                    bestDiffColors[sector] = ParallelMath::Extract(drs.diffColors[sector][i], block);
                    bestDiffTables[sector] = ParallelMath::Extract(drs.diffTables[sector][i], block);
                }
            }
        }

        // An ignorable sector simply reuses the other sector's color
        if (canIgnore[0])
            bestDiffColors[0] = bestDiffColors[1];
        else if (canIgnore[1])
            bestDiffColors[1] = bestDiffColors[0];

        // The best differential possibilities must be better than the best total error
        if (bestDiffErrors[0] + bestDiffErrors[1] < blockBestTotalError)
        {
            // Fast path if the best possible case is legal
            if (canIgnoreEither || ETCDifferentialIsLegalScalar(bestDiffColors[0], bestDiffColors[1]))
            {
                ParallelMath::PutBoolInt16(bestIsThisMode, block, true);
                ParallelMath::PutFloat(bestTotalError, block, bestDiffErrors[0] + bestDiffErrors[1]);
                ParallelMath::PutUInt15(bestFlip, block, flip);
                ParallelMath::PutUInt15(bestD, block, d);
                for (int sector = 0; sector < 2; sector++)
                {
                    ParallelMath::PutUInt15(bestColors[sector], block, bestDiffColors[sector]);
                    ParallelMath::PutUInt16(bestSelectors[sector], block, bestDiffSelectors[sector]);
                    ParallelMath::PutUInt15(bestTables[sector], block, bestDiffTables[sector]);
                }
            }
            else
            {
                // Slow path: Sort the possible cases by quality, and search valid combinations
                // TODO: Pre-flatten the error lists so this is nicer to cache
                unsigned int numSortIndexes[2] = { 0, 0 };
                for (int sector = 0; sector < 2; sector++)
                {
                    unsigned int sectorNumAttempts = ParallelMath::Extract(drs.diffNumAttempts[sector], block);

                    // Only attempts that are individually below the current best can be part of an improving pair
                    for (unsigned int i = 0; i < sectorNumAttempts; i++)
                    {
                        if (ParallelMath::Extract(drs.diffErrors[sector][i], block) < blockBestTotalError)
                            drs.attemptSortIndexes[sector][numSortIndexes[sector]++] = i;
                    }

                    SortPredicate sp;
                    sp.diffErrors = drs.diffErrors[sector];
                    sp.block = block;

                    std::sort<uint16_t*, const SortPredicate&>(drs.attemptSortIndexes[sector], drs.attemptSortIndexes[sector] + numSortIndexes[sector], sp);
                }

                for (unsigned int i = 0; i < numSortIndexes[0]; i++)
                {
                    unsigned int attemptIndex0 = drs.attemptSortIndexes[0][i];
                    float error0 = ParallelMath::Extract(drs.diffErrors[0][attemptIndex0], block);

                    // Sector 0 attempts are in ascending error order, so no later one can improve either
                    if (error0 >= blockBestTotalError)
                        break;

                    // Error budget left for sector 1 if the pair is to beat the current best
                    float maxError1 = ParallelMath::Extract(bestTotalError, block) - error0;
                    uint16_t diffColor0 = ParallelMath::Extract(drs.diffColors[0][attemptIndex0], block);

                    // Even the best sector 1 attempt exceeds the budget, and the budget only shrinks from here
                    if (maxError1 < bestDiffErrors[1])
                        break;

                    for (unsigned int j = 0; j < numSortIndexes[1]; j++)
                    {
                        unsigned int attemptIndex1 = drs.attemptSortIndexes[1][j];
                        float error1 = ParallelMath::Extract(drs.diffErrors[1][attemptIndex1], block);

                        if (error1 >= maxError1)
                            break;

                        uint16_t diffColor1 = ParallelMath::Extract(drs.diffColors[1][attemptIndex1], block);

                        if (ETCDifferentialIsLegalScalar(diffColor0, diffColor1))
                        {
                            blockBestTotalError = error0 + error1;

                            ParallelMath::PutBoolInt16(bestIsThisMode, block, true);
                            ParallelMath::PutFloat(bestTotalError, block, blockBestTotalError);
                            ParallelMath::PutUInt15(bestFlip, block, flip);
                            ParallelMath::PutUInt15(bestD, block, d);
                            ParallelMath::PutUInt15(bestColors[0], block, diffColor0);
                            ParallelMath::PutUInt15(bestColors[1], block, diffColor1);
                            ParallelMath::PutUInt16(bestSelectors[0], block, ParallelMath::Extract(drs.diffSelectors[0][attemptIndex0], block));
                            ParallelMath::PutUInt16(bestSelectors[1], block, ParallelMath::Extract(drs.diffSelectors[1][attemptIndex1], block));
                            ParallelMath::PutUInt15(bestTables[0], block, ParallelMath::Extract(drs.diffTables[0][attemptIndex0], block));
                            ParallelMath::PutUInt15(bestTables[1], block, ParallelMath::Extract(drs.diffTables[1][attemptIndex1], block));

                            // Sector 1 attempts are sorted, so the first legal partner is the best one for this sector 0 attempt
                            break;
                        }
                    }
                }
            }
        }
    }
}
363

364
// Per-lane test that the signed difference b - a lies strictly between -5 and 4, i.e. within [-4, 3].
cvtt::ParallelMath::Int16CompFlag cvtt::Internal::ETCComputer::ETCDifferentialIsLegalForChannel(const MUInt15 &a, const MUInt15 &b)
{
    MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(b) - ParallelMath::LosslessCast<MSInt16>::Cast(a);

    ParallelMath::Int16CompFlag aboveLowerLimit = ParallelMath::Less(ParallelMath::MakeSInt16(-5), delta);
    ParallelMath::Int16CompFlag belowUpperLimit = ParallelMath::Less(delta, ParallelMath::MakeSInt16(4));

    return aboveLowerLimit & belowUpperLimit;
}
370

371
// Per-lane test that all three 5-bit channel fields of packed colors a and b (at bits 0, 5 and 10)
// pass ETCDifferentialIsLegalForChannel. The top field is taken by shift alone, without masking.
cvtt::ParallelMath::Int16CompFlag cvtt::Internal::ETCComputer::ETCDifferentialIsLegal(const MUInt15 &a, const MUInt15 &b)
{
    MUInt15 fieldMask = ParallelMath::MakeUInt15(31);

    ParallelMath::Int16CompFlag topFieldLegal = ETCDifferentialIsLegalForChannel(ParallelMath::RightShift(a, 10), ParallelMath::RightShift(b, 10));
    ParallelMath::Int16CompFlag middleFieldLegal = ETCDifferentialIsLegalForChannel(ParallelMath::RightShift(a, 5) & fieldMask, ParallelMath::RightShift(b, 5) & fieldMask);
    ParallelMath::Int16CompFlag bottomFieldLegal = ETCDifferentialIsLegalForChannel(a & fieldMask, b & fieldMask);

    return topFieldLegal & middleFieldLegal & bottomFieldLegal;
}
379

380
// Scalar test that the signed difference b - a lies within [-4, 3].
bool cvtt::Internal::ETCComputer::ETCDifferentialIsLegalForChannelScalar(const uint16_t &a, const uint16_t &b)
{
    const int16_t delta = static_cast<int16_t>(static_cast<int16_t>(b) - static_cast<int16_t>(a));

    if (delta < -4)
        return false;

    return delta < 4;
}
386

387
bool cvtt::Internal::ETCComputer::ETCDifferentialIsLegalScalar(const uint16_t &a, const uint16_t &b)
388
{
389
    MUInt15 mask = ParallelMath::MakeUInt15(31);
390

391
    return ETCDifferentialIsLegalForChannelScalar((a >> 10), (b >> 10))
392
        & ETCDifferentialIsLegalForChannelScalar((a >> 5) & 31, (b >> 5) & 31)
393
        & ETCDifferentialIsLegalForChannelScalar(a & 31, b & 31);
394
}
395

396
// Tries to encode each parallel lane as a "T mode" block and emits it if that beats bestError.
//
// The pixels flagged in isIsolated are represented by a single "isolated" color: their average,
// quantized to 4 bits per channel. The remaining pixels are represented by a "line" base color
// (also 4 bits per channel) together with base + modifier and base - modifier, for each of the 8
// modifier table entries. Several candidate line colors are tried per table by nudging the average
// of the non-isolated pixels up and down before quantizing.
//
// Per-pixel selectors are packed two bits per pixel (pixel px at bit px*2): 0 selects the isolated
// color, 1 selects base + modifier, 2 selects base, 3 selects base - modifier.
//
// bestError is lowered in place for every lane where a candidate improves on it, and for those lanes
// an 8-byte block is written at outputBuffer + lane * 8 via EmitTModeBlock. Other lanes are untouched.
// The error metric (FakeBT709, uniform, or weighted) is selected by options.flags.
void cvtt::Internal::ETCComputer::EncodeTMode(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag isIsolated[16], const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const Options &options)
{
    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);

    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);

    MUInt15 isolatedTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
    MUInt15 lineTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };

    MUInt15 numPixelsIsolated = ParallelMath::MakeUInt15(0);

    // To speed this up, we compute line total as the sum, then subtract out isolated
    for (unsigned int px = 0; px < 16; px++)
    {
        for (int ch = 0; ch < 3; ch++)
        {
            isolatedTotal[ch] = isolatedTotal[ch] + ParallelMath::SelectOrZero(isIsolated[px], pixels[px][ch]);
            lineTotal[ch] = lineTotal[ch] + pixels[px][ch];
        }
        numPixelsIsolated = numPixelsIsolated + ParallelMath::SelectOrZero(isIsolated[px], ParallelMath::MakeUInt15(1));
    }

    for (int ch = 0; ch < 3; ch++)
        lineTotal[ch] = lineTotal[ch] - isolatedTotal[ch];

    MUInt15 numPixelsLine = ParallelMath::MakeUInt15(16) - numPixelsIsolated;

    MUInt15 isolatedAverageQuantized[3];
    MUInt15 isolatedAverageTargets[3];
    {
        // Integer division has no parallel form here, so the divisors are kept per lane as scalars
        int divisors[ParallelMath::ParallelSize];
        for (int block = 0; block < ParallelMath::ParallelSize; block++)
            divisors[block] = ParallelMath::Extract(numPixelsIsolated, block) * 34;

        // (n << 4) | n == n * 17 for n <= 16: the rounding term of the division below
        MUInt15 addend = (numPixelsIsolated << 4) | numPixelsIsolated;
        for (int ch = 0; ch < 3; ch++)
        {
            // isolatedAverageQuantized[ch] = (isolatedTotal[ch] * 2 + numPixelsIsolated * 17) / (numPixelsIsolated * 34);

            MUInt15 numerator = isolatedTotal[ch] + isolatedTotal[ch];
            if (!isFakeBT709)
                numerator = numerator + addend;

            for (int block = 0; block < ParallelMath::ParallelSize; block++)
            {
                int divisor = divisors[block];
                if (divisor == 0)
                    ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, 0);
                else
                    ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, ParallelMath::Extract(numerator, block) / divisor);
            }

            isolatedAverageTargets[ch] = numerator;
        }
    }

    // In FakeBT709 mode the rounding term was left out above; rounding is resolved by this helper instead
    if (isFakeBT709)
        ResolveTHFakeBT709Rounding(isolatedAverageQuantized, isolatedAverageTargets, numPixelsIsolated);

    // Expand the 4-bit isolated color to 8 bits by duplicating the nibble
    MUInt15 isolatedColor[3];
    for (int ch = 0; ch < 3; ch++)
        isolatedColor[ch] = (isolatedAverageQuantized[ch]) | (isolatedAverageQuantized[ch] << 4);

    // Error of every pixel against the isolated color; this is the baseline that line colors must beat
    MFloat isolatedError[16];
    for (int px = 0; px < 16; px++)
    {
        if (isFakeBT709)
            isolatedError[px] = ComputeErrorFakeBT709(isolatedColor, preWeightedPixels[px]);
        else if (isUniform)
            isolatedError[px] = ComputeErrorUniform(pixels[px], isolatedColor);
        else
            isolatedError[px] = ComputeErrorWeighted(isolatedColor, preWeightedPixels[px], options);
    }

    MSInt32 bestSelectors = ParallelMath::MakeSInt32(0);
    MUInt15 bestTable = ParallelMath::MakeUInt15(0);
    MUInt15 bestLineColor = ParallelMath::MakeUInt15(0);

    // Per-lane range of the offset multiplier: [-numPixelsLine, +numPixelsLine]
    MSInt16 maxLine = ParallelMath::LosslessCast<MSInt16>::Cast(numPixelsLine);
    MSInt16 minLine = ParallelMath::MakeSInt16(0) - maxLine;

    // The shared loop below must cover the widest range of any lane; narrower lanes are clamped
    int16_t clusterMaxLine = 0;
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
    {
        int16_t blockMaxLine = ParallelMath::Extract(maxLine, block);
        if (blockMaxLine > clusterMaxLine)
            clusterMaxLine = blockMaxLine;
    }

    int16_t clusterMinLine = -clusterMaxLine;

    int lineDivisors[ParallelMath::ParallelSize];
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
        lineDivisors[block] = ParallelMath::Extract(numPixelsLine, block) * 34;

    // numPixelsLine * 17, the rounding term for the line color division
    MUInt15 lineAddend = (numPixelsLine << 4) | numPixelsLine;

    for (int table = 0; table < 8; table++)
    {
        int numUniqueColors[ParallelMath::ParallelSize];
        MUInt15 uniqueQuantizedColors[31];

        for (int block = 0; block < ParallelMath::ParallelSize; block++)
            numUniqueColors[block] = 0;

        MUInt15 modifier = ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]);
        MUInt15 modifierOffset = (modifier + modifier);

        // Generate candidate line colors by shifting the (doubled) line total before quantizing
        for (int16_t offsetPremultiplier = clusterMinLine; offsetPremultiplier <= clusterMaxLine; offsetPremultiplier++)
        {
            MSInt16 clampedOffsetPremultiplier = ParallelMath::Max(minLine, ParallelMath::Min(maxLine, ParallelMath::MakeSInt16(offsetPremultiplier)));
            MSInt16 modifierAddend = ParallelMath::CompactMultiply(clampedOffsetPremultiplier, modifierOffset);

            MUInt15 quantized[3];
            if (isFakeBT709)
            {
                MUInt15 targets[3];
                for (int ch = 0; ch < 3; ch++)
                {
                    //quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + modifierOffset * offsetPremultiplier)) / (numPixelsLine * 34));
                    MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch]) + modifierAddend));
                    MUInt15 divided = ParallelMath::MakeUInt15(0);
                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
                    {
                        int divisor = lineDivisors[block];
                        if (divisor == 0)
                            ParallelMath::PutUInt15(divided, block, 0);
                        else
                            ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
                    }
                    quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
                    targets[ch] = numerator;
                }

                ResolveTHFakeBT709Rounding(quantized, targets, numPixelsLine);
            }
            else
            {
                for (int ch = 0; ch < 3; ch++)
                {
                    //quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + numPixelsLine * 17 + modifierOffset * offsetPremultiplier)) / (numPixelsLine * 34));
                    MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch] + lineAddend) + modifierAddend));
                    MUInt15 divided = ParallelMath::MakeUInt15(0);
                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
                    {
                        int divisor = lineDivisors[block];
                        if (divisor == 0)
                            ParallelMath::PutUInt15(divided, block, 0);
                        else
                            ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
                    }
                    quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
                }
            }

            MUInt15 packedColor = quantized[0] | (quantized[1] << 5) | (quantized[2] << 10);

            // Append the color to the lane's list unless it repeats the previous entry
            for (int block = 0; block < ParallelMath::ParallelSize; block++)
            {
                uint16_t blockPackedColor = ParallelMath::Extract(packedColor, block);
                if (numUniqueColors[block] == 0 || blockPackedColor != ParallelMath::Extract(uniqueQuantizedColors[numUniqueColors[block] - 1], block))
                    ParallelMath::PutUInt15(uniqueQuantizedColors[numUniqueColors[block]++], block, blockPackedColor);
            }
        }

        // Stripe unfilled unique colors
        int maxUniqueColors = 0;
        for (int block = 0; block < ParallelMath::ParallelSize; block++)
        {
            if (numUniqueColors[block] > maxUniqueColors)
                maxUniqueColors = numUniqueColors[block];
        }

        for (int block = 0; block < ParallelMath::ParallelSize; block++)
        {
            uint16_t fillColor = ParallelMath::Extract(uniqueQuantizedColors[0], block);

            // Entries 0..numUnique-1 are valid for this lane, so padding starts at numUnique.
            // Every slot below maxUniqueColors is read by the evaluation loop and must hold a defined value.
            int numUnique = numUniqueColors[block];
            for (int fill = numUnique; fill < maxUniqueColors; fill++)
                ParallelMath::PutUInt15(uniqueQuantizedColors[fill], block, fillColor);
        }

        for (int ci = 0; ci < maxUniqueColors; ci++)
        {
            // lineColors[0] = base + modifier, [1] = base, [2] = base - modifier, all clamped to [0, 255]
            MUInt15 lineColors[3][3];
            for (int ch = 0; ch < 3; ch++)
            {
                MUInt15 quantizedChannel = (ParallelMath::RightShift(uniqueQuantizedColors[ci], (ch * 5)) & ParallelMath::MakeUInt15(15));

                MUInt15 unquantizedColor = (quantizedChannel << 4) | quantizedChannel;
                lineColors[0][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantizedColor + modifier);
                lineColors[1][ch] = unquantizedColor;
                lineColors[2][ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));
            }

            MSInt32 selectors = ParallelMath::MakeSInt32(0);
            MFloat error = ParallelMath::MakeFloatZero();
            for (int px = 0; px < 16; px++)
            {
                // Start from the isolated color (selector 0) and switch only on a strict improvement
                MFloat pixelError = isolatedError[px];

                MUInt15 pixelBestSelector = ParallelMath::MakeUInt15(0);
                for (int i = 0; i < 3; i++)
                {
                    // Use the same metric as isolatedError so that the comparison below is meaningful
                    MFloat lineError;
                    if (isFakeBT709)
                        lineError = ComputeErrorFakeBT709(lineColors[i], preWeightedPixels[px]);
                    else if (isUniform)
                        lineError = ComputeErrorUniform(lineColors[i], pixels[px]);
                    else
                        lineError = ComputeErrorWeighted(lineColors[i], preWeightedPixels[px], options);

                    ParallelMath::FloatCompFlag lineBetter = ParallelMath::Less(lineError, pixelError);
                    pixelError = ParallelMath::Min(lineError, pixelError);
                    pixelBestSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(lineBetter), ParallelMath::MakeUInt15(i + 1), pixelBestSelector);
                }

                error = error + pixelError;
                selectors = selectors | (ParallelMath::ToInt32(pixelBestSelector) << (px * 2));
            }

            ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));
            bestError = ParallelMath::Min(error, bestError);

            if (ParallelMath::AnySet(errorBetter))
            {
                ParallelMath::ConditionalSet(bestLineColor, errorBetter, uniqueQuantizedColors[ci]);
                ParallelMath::ConditionalSet(bestSelectors, errorBetter, selectors);
                ParallelMath::ConditionalSet(bestTable, errorBetter, ParallelMath::MakeUInt15(table));
                bestIsThisMode = bestIsThisMode | errorBetter;
            }
        }
    }

    // Emit a block for every lane that this mode improved
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
    {
        if (ParallelMath::Extract(bestIsThisMode, block))
        {
            uint16_t blockBestLineColor = ParallelMath::Extract(bestLineColor, block);
            ParallelMath::ScalarUInt16 blockIsolatedAverageQuantized[3];

            for (int ch = 0; ch < 3; ch++)
                blockIsolatedAverageQuantized[ch] = ParallelMath::Extract(isolatedAverageQuantized[ch], block);

            uint16_t blockBestTable = ParallelMath::Extract(bestTable, block);
            int32_t blockBestSelectors = ParallelMath::Extract(bestSelectors, block);

            // Unpack the 4-bit line color channels from their 5-bit-stride packing
            ParallelMath::ScalarUInt16 lineColor[3];
            for (int ch = 0; ch < 3; ch++)
                lineColor[ch] = (blockBestLineColor >> (ch * 5)) & 15;

            EmitTModeBlock(outputBuffer + block * 8, lineColor, blockIsolatedAverageQuantized, blockBestSelectors, blockBestTable, true);
        }
    }
}
648

649
// Tries to encode every block of the parallel group in ETC2 H mode and keeps the result wherever it
// beats the error already stored in bestError.
//
//   outputBuffer      - Destination for encoded blocks; block N is written at outputBuffer + N * 8, and
//                       only when this mode improved on bestError for that block.
//   bestError         - In/out. Lowest error found so far per block; lowered when a better H-mode
//                       candidate is found.
//   groupings         - Per-pixel flag. Flagged pixels feed sector 1's totals, the rest feed sector 0's.
//   pixels            - Source pixel channels, used for the sector totals and the uniform error metric.
//   he                - Caller-provided scratch: candidate colors, per-pixel errors and sign bits.
//   preWeightedPixels - Pixel values handed to the weighted and fake BT.709 error metrics.
//   options           - Flags select the error metric (fake BT.709 takes precedence over uniform).
//
// For each of the 8 modifier tables the function builds a list of candidate quantized colors per sector,
// pre-computes the per-pixel error of each candidate at +modifier and -modifier, and then searches every
// (sector 0 candidate, sector 1 candidate) pair for the lowest total error.
void cvtt::Internal::ETCComputer::EncodeHMode(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag groupings[16], const MUInt15 pixels[16][3], HModeEval &he, const MFloat preWeightedPixels[16][3], const Options &options)
{
    // Capacity of the per-sector candidate list; the array size and its bounds assert share this value.
    const int maxSectorUniqueColors = 31;

    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);

    MUInt15 zero15 = ParallelMath::MakeUInt15(0);

    MUInt15 counts[2] = { zero15, zero15 };

    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);

    MUInt15 totals[2][3] =
    {
        { zero15, zero15, zero15 },
        { zero15, zero15, zero15 }
    };

    // totals[0] temporarily holds the sum over all 16 pixels; totals[1] and counts[1] only the flagged ones.
    for (unsigned int px = 0; px < 16; px++)
    {
        for (int ch = 0; ch < 3; ch++)
        {
            totals[0][ch] = totals[0][ch] + pixels[px][ch];
            totals[1][ch] = totals[1][ch] + ParallelMath::SelectOrZero(groupings[px], pixels[px][ch]);
        }
        counts[1] = counts[1] + ParallelMath::SelectOrZero(groupings[px], ParallelMath::MakeUInt15(1));
    }

    // Sector 0 is whatever is left after removing sector 1.
    for (int ch = 0; ch < 3; ch++)
        totals[0][ch] = totals[0][ch] - totals[1][ch];
    counts[0] = ParallelMath::MakeUInt15(16) - counts[1];

    MUInt16 bestSectorBits = ParallelMath::MakeUInt16(0);
    MUInt16 bestSignBits = ParallelMath::MakeUInt16(0);
    MUInt15 bestColors[2] = { zero15, zero15 };
    MUInt15 bestTable = ParallelMath::MakeUInt15(0);

    for (int table = 0; table < 8; table++)
    {
        // Same T/H modifier table that the other T/H encoders in this file read.
        int modifier = cvtt::Tables::ETC2::g_thModifierTable[table];

        // Build the candidate color list of each sector, one block (lane) at a time.
        for (int sector = 0; sector < 2; sector++)
        {
            for (int block = 0; block < ParallelMath::ParallelSize; block++)
            {
                int blockNumUniqueColors = 0;
                uint16_t blockUniqueQuantizedColors[maxSectorUniqueColors];

                int maxOffsetMultiplier = ParallelMath::Extract(counts[sector], block);
                int minOffsetMultiplier = -maxOffsetMultiplier;

                // Doubled because the numerator below is computed at twice the scale of the totals.
                int modifierOffset = modifier * 2;

                int blockSectorCounts = ParallelMath::Extract(counts[sector], block);
                int blockSectorTotals[3];
                for (int ch = 0; ch < 3; ch++)
                    blockSectorTotals[ch] = ParallelMath::Extract(totals[sector][ch], block);

                for (int offsetPremultiplier = minOffsetMultiplier; offsetPremultiplier <= maxOffsetMultiplier; offsetPremultiplier++)
                {
                    // TODO: This isn't ideal for FakeBT709
                    int16_t quantized[3];
                    for (int ch = 0; ch < 3; ch++)
                    {
                        // An empty sector has no average; use 0 instead of dividing by zero.
                        if (blockSectorCounts == 0)
                            quantized[ch] = 0;
                        else
                            quantized[ch] = std::min<int16_t>(15, std::max<int16_t>(0, (blockSectorTotals[ch] * 2 + blockSectorCounts * 17 + modifierOffset * offsetPremultiplier)) / (blockSectorCounts * 34));
                    }

                    // Pack as channel 0 in bits 10..14, channel 1 in bits 5..9, channel 2 in bits 0..4.
                    uint16_t packedColor = (quantized[0] << 10) | (quantized[1] << 5) | quantized[2];

                    // Only consecutive duplicates are dropped.
                    if (blockNumUniqueColors == 0 || packedColor != blockUniqueQuantizedColors[blockNumUniqueColors - 1])
                    {
                        assert(blockNumUniqueColors < maxSectorUniqueColors);
                        blockUniqueQuantizedColors[blockNumUniqueColors++] = packedColor;
                    }
                }

                ParallelMath::PutUInt15(he.numUniqueColors[sector], block, blockNumUniqueColors);

                // Sector 1's candidates are stored directly after this block's sector 0 candidates.
                int baseIndex = 0;
                if (sector == 1)
                    baseIndex = ParallelMath::Extract(he.numUniqueColors[0], block);

                for (int i = 0; i < blockNumUniqueColors; i++)
                    ParallelMath::PutUInt15(he.uniqueQuantizedColors[baseIndex + i], block, blockUniqueQuantizedColors[i]);
            }
        }

        MUInt15 totalColors = he.numUniqueColors[0] + he.numUniqueColors[1];
        int maxErrorColors = 0;
        for (int block = 0; block < ParallelMath::ParallelSize; block++)
            maxErrorColors = std::max<int>(maxErrorColors, ParallelMath::Extract(totalColors, block));

        // Blocks with fewer candidates than the widest block get their first candidate repeated, so the
        // vector-wide error pass below never reads an unwritten lane.
        for (int block = 0; block < ParallelMath::ParallelSize; block++)
        {
            int lastColor = ParallelMath::Extract(totalColors, block);
            uint16_t stripeColor = ParallelMath::Extract(he.uniqueQuantizedColors[0], block);
            for (int i = lastColor; i < maxErrorColors; i++)
                ParallelMath::PutUInt15(he.uniqueQuantizedColors[i], block, stripeColor);
        }

        // For every candidate, record per pixel the smaller of the +modifier / -modifier errors and
        // which of the two produced it.
        for (int ci = 0; ci < maxErrorColors; ci++)
        {
            MUInt15 fifteen = ParallelMath::MakeUInt15(15);
            MUInt15 twoFiftyFive = ParallelMath::MakeUInt15(255);
            MSInt16 zeroS16 = ParallelMath::MakeSInt16(0);

            MUInt15 colors[2][3];
            for (int ch = 0; ch < 3; ch++)
            {
                MUInt15 quantizedChannel = ParallelMath::RightShift(he.uniqueQuantizedColors[ci], ((2 - ch) * 5)) & fifteen;

                // Replicate the 4-bit value into both nibbles, then apply the modifier with clamping.
                MUInt15 unquantizedColor = (quantizedChannel << 4) | quantizedChannel;
                colors[0][ch] = ParallelMath::Min(twoFiftyFive, unquantizedColor + modifier);
                colors[1][ch] = ParallelMath::ToUInt15(ParallelMath::Max(zeroS16, ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::MakeSInt16(modifier)));
            }

            MUInt16 signBits = ParallelMath::MakeUInt16(0);
            for (int px = 0; px < 16; px++)
            {
                MFloat errors[2];
                for (int i = 0; i < 2; i++)
                {
                    if (isFakeBT709)
                        errors[i] = ComputeErrorFakeBT709(colors[i], preWeightedPixels[px]);
                    else if (isUniform)
                        errors[i] = ComputeErrorUniform(colors[i], pixels[px]);
                    else
                        errors[i] = ComputeErrorWeighted(colors[i], preWeightedPixels[px], options);
                }

                // Sign bit set means the -modifier color is the better one for this pixel.
                ParallelMath::Int16CompFlag errorOneLess = ParallelMath::FloatFlagToInt16(ParallelMath::Less(errors[1], errors[0]));
                he.errors[ci][px] = ParallelMath::Min(errors[0], errors[1]);
                signBits = signBits | ParallelMath::SelectOrZero(errorOneLess, ParallelMath::MakeUInt16(1 << px));
            }
            he.signBits[ci] = signBits;
        }

        int maxUniqueColorCombos = 0;
        for (int block = 0; block < ParallelMath::ParallelSize; block++)
        {
            int numUniqueColorCombos = ParallelMath::Extract(he.numUniqueColors[0], block) * ParallelMath::Extract(he.numUniqueColors[1], block);
            if (numUniqueColorCombos > maxUniqueColorCombos)
                maxUniqueColorCombos = numUniqueColorCombos;
        }

        MUInt15 indexes[2] = { zero15, zero15 };
        MUInt15 maxIndex[2] = { he.numUniqueColors[0] - ParallelMath::MakeUInt15(1), he.numUniqueColors[1] - ParallelMath::MakeUInt15(1) };

        int block1Starts[ParallelMath::ParallelSize];
        for (int block = 0; block < ParallelMath::ParallelSize; block++)
            block1Starts[block] = ParallelMath::Extract(he.numUniqueColors[0], block);

        for (int combo = 0; combo < maxUniqueColorCombos; combo++)
        {
            // Evaluate the current pair, starting at (0, 0). Advancing before the first evaluation would
            // skip that pair and visit the final clamped pair twice instead.
            MUInt15 index0 = indexes[0];
            MUInt15 index1 = indexes[1];

            // Advance to the next pair for the following iteration: index 0 counts up and wraps, index 1
            // steps on each wrap and saturates so blocks with fewer pairs stay in range.
            {
                MUInt15 nextIndex0 = index0 + ParallelMath::MakeUInt15(1);
                ParallelMath::Int16CompFlag index0Overflow = ParallelMath::Less(maxIndex[0], nextIndex0);
                ParallelMath::ConditionalSet(nextIndex0, index0Overflow, ParallelMath::MakeUInt15(0));

                indexes[0] = nextIndex0;
                indexes[1] = ParallelMath::Min(maxIndex[1], index1 + ParallelMath::SelectOrZero(index0Overflow, ParallelMath::MakeUInt15(1)));
            }

            int ci0[ParallelMath::ParallelSize];
            int ci1[ParallelMath::ParallelSize];
            MUInt15 color0;
            MUInt15 color1;

            // Each block indexes its own candidates, so gather them lane by lane.
            for (int block = 0; block < ParallelMath::ParallelSize; block++)
            {
                ci0[block] = ParallelMath::Extract(index0, block);
                ci1[block] = ParallelMath::Extract(index1, block) + block1Starts[block];
                ParallelMath::PutUInt15(color0, block, ParallelMath::Extract(he.uniqueQuantizedColors[ci0[block]], block));
                ParallelMath::PutUInt15(color1, block, ParallelMath::Extract(he.uniqueQuantizedColors[ci1[block]], block));
            }

            MFloat totalError = ParallelMath::MakeFloatZero();
            MUInt16 sectorBits = ParallelMath::MakeUInt16(0);
            MUInt16 signBits = ParallelMath::MakeUInt16(0);
            for (int px = 0; px < 16; px++)
            {
                MFloat errorCI0;
                MFloat errorCI1;
                MUInt16 signBits0;
                MUInt16 signBits1;

                for (int block = 0; block < ParallelMath::ParallelSize; block++)
                {
                    ParallelMath::PutFloat(errorCI0, block, ParallelMath::Extract(he.errors[ci0[block]][px], block));
                    ParallelMath::PutFloat(errorCI1, block, ParallelMath::Extract(he.errors[ci1[block]][px], block));
                    ParallelMath::PutUInt16(signBits0, block, ParallelMath::Extract(he.signBits[ci0[block]], block));
                    ParallelMath::PutUInt16(signBits1, block, ParallelMath::Extract(he.signBits[ci1[block]], block));
                }

                // Each pixel independently takes whichever of the two candidate colors fits it better.
                totalError = totalError + ParallelMath::Min(errorCI0, errorCI1);

                MUInt16 bitPosition = ParallelMath::MakeUInt16(1 << px);

                ParallelMath::Int16CompFlag error1Better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(errorCI1, errorCI0));

                sectorBits = sectorBits | ParallelMath::SelectOrZero(error1Better, bitPosition);
                signBits = signBits | (bitPosition & ParallelMath::Select(error1Better, signBits1, signBits0));
            }

            ParallelMath::FloatCompFlag totalErrorBetter = ParallelMath::Less(totalError, bestError);
            ParallelMath::Int16CompFlag totalErrorBetter16 = ParallelMath::FloatFlagToInt16(totalErrorBetter);
            if (ParallelMath::AnySet(totalErrorBetter16))
            {
                bestIsThisMode = bestIsThisMode | totalErrorBetter16;
                ParallelMath::ConditionalSet(bestTable, totalErrorBetter16, ParallelMath::MakeUInt15(table));
                ParallelMath::ConditionalSet(bestColors[0], totalErrorBetter16, color0);
                ParallelMath::ConditionalSet(bestColors[1], totalErrorBetter16, color1);
                ParallelMath::ConditionalSet(bestSectorBits, totalErrorBetter16, sectorBits);
                ParallelMath::ConditionalSet(bestSignBits, totalErrorBetter16, signBits);
                bestError = ParallelMath::Min(totalError, bestError);
            }
        }
    }

    // Emit only the blocks for which this mode produced a new best result.
    if (ParallelMath::AnySet(bestIsThisMode))
    {
        for (int block = 0; block < ParallelMath::ParallelSize; block++)
        {
            if (!ParallelMath::Extract(bestIsThisMode, block))
                continue;

            ParallelMath::ScalarUInt16 blockBestColors[2] = { ParallelMath::Extract(bestColors[0], block), ParallelMath::Extract(bestColors[1], block) };
            ParallelMath::ScalarUInt16 blockBestSectorBits = ParallelMath::Extract(bestSectorBits, block);
            ParallelMath::ScalarUInt16 blockBestSignBits = ParallelMath::Extract(bestSignBits, block);
            ParallelMath::ScalarUInt16 blockBestTable = ParallelMath::Extract(bestTable, block);

            EmitHModeBlock(outputBuffer + block * 8, blockBestColors, blockBestSectorBits, blockBestSignBits, blockBestTable, true);
        }
    }
}
886

887
// Tries to encode every block of the parallel group as a punchthrough-alpha T or H mode block and keeps
// the result wherever it beats the error already stored in bestError.
//
//   outputBuffer      - Destination for encoded blocks; block N is written at outputBuffer + N * 8, and
//                       only when this mode improved on bestError for that block.
//   bestError         - In/out. Lowest error found so far per block.
//   isIsolatedBase    - Per-pixel flag assigning a pixel to the isolated color (set) or line color (clear).
//   pixels            - Source pixel channels, used for the averages and the uniform error metric.
//   preWeightedPixels - Pixel values handed to the weighted and fake BT.709 error metrics.
//   isTransparent     - Per-pixel flag; transparent pixels are left out of the averages, contribute zero
//                       error and always receive selector 2.
//   anyTransparent,
//   allTransparent    - Not read by this function.
//   options           - Flags select the error metric (fake BT.709 takes precedence over uniform).
void cvtt::Internal::ETCComputer::EncodeVirtualTModePunchthrough(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag isIsolatedBase[16], const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const ParallelMath::Int16CompFlag isTransparent[16], const ParallelMath::Int16CompFlag& anyTransparent, const ParallelMath::Int16CompFlag& allTransparent, const Options &options)
{
    // We treat T and H mode as the same mode ("Virtual T mode") with punchthrough, because of how the colors work:
    //
    // T mode: C1, C2+M, Transparent, C2-M
    // H mode: C1+M, C1-M, Transparent, C2-M
    //
    // So in either case, we have 2 colors +/- a modifier, and a third unique color, which is basically T mode except without the middle color.
    // The only thing that matters is whether it's better to store the isolated color as T mode color 1, or store it offset in H mode color 2.
    //
    // Sometimes it won't even be possible to store it in H mode color 2 because the table low bit derives from a numeric comparison of the colors,
    // but unlike opaque blocks, we can't flip them.
    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);

    ParallelMath::FloatCompFlag isTransparentF[16];
    for (int px = 0; px < 16; px++)
        isTransparentF[px] = ParallelMath::Int16FlagToFloat(isTransparent[px]);

    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
    ParallelMath::Int16CompFlag bestIsHMode = ParallelMath::MakeBoolInt16(false);

    MUInt15 isolatedTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
    MUInt15 lineTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };

    MUInt15 numPixelsIsolated = ParallelMath::MakeUInt15(0);
    MUInt15 numPixelsLine = ParallelMath::MakeUInt15(0);

    ParallelMath::Int16CompFlag isIsolated[16];
    ParallelMath::Int16CompFlag isLine[16];

    // Transparent pixels belong to neither group.
    for (unsigned int px = 0; px < 16; px++)
    {
        ParallelMath::Int16CompFlag isOpaque = ParallelMath::Not(isTransparent[px]);
        isIsolated[px] = isIsolatedBase[px] & isOpaque;
        isLine[px] = ParallelMath::Not(isIsolatedBase[px]) & isOpaque;
    }

    for (unsigned int px = 0; px < 16; px++)
    {
        for (int ch = 0; ch < 3; ch++)
        {
            isolatedTotal[ch] = isolatedTotal[ch] + ParallelMath::SelectOrZero(isIsolated[px], pixels[px][ch]);
            lineTotal[ch] = lineTotal[ch] + ParallelMath::SelectOrZero(isLine[px], pixels[px][ch]);
        }
        numPixelsIsolated = numPixelsIsolated + ParallelMath::SelectOrZero(isIsolated[px], ParallelMath::MakeUInt15(1));
        numPixelsLine = numPixelsLine + ParallelMath::SelectOrZero(isLine[px], ParallelMath::MakeUInt15(1));
    }

    // Quantize the isolated color once for T mode, and once per table for H mode, where the stored color
    // is offset by the modifier so that "color - modifier" lands on the isolated average.
    MUInt15 isolatedAverageQuantized[3];
    MUInt15 hModeIsolatedQuantized[8][3];
    MUInt15 isolatedAverageTargets[3];
    {
        int divisors[ParallelMath::ParallelSize];
        for (int block = 0; block < ParallelMath::ParallelSize; block++)
            divisors[block] = ParallelMath::Extract(numPixelsIsolated, block) * 34;

        // Equals numPixelsIsolated * 17 for every possible count (0..16).
        MUInt15 addend = (numPixelsIsolated << 4) | numPixelsIsolated;
        for (int ch = 0; ch < 3; ch++)
        {
            // isolatedAverageQuantized[ch] = (isolatedTotal[ch] * 2 + numPixelsIsolated * 17) / (numPixelsIsolated * 34);

            MUInt15 numerator = isolatedTotal[ch] + isolatedTotal[ch];
            if (!isFakeBT709)
                numerator = numerator + addend;

            MUInt15 hModeIsolatedNumerators[8];
            for (int table = 0; table < 8; table++)
            {
                // FIXME: Handle fake BT.709 correctly
                MUInt15 offsetTotal = isolatedTotal[ch] + ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]), numPixelsIsolated));

                hModeIsolatedNumerators[table] = (offsetTotal + offsetTotal) + addend;
            }

            for (int block = 0; block < ParallelMath::ParallelSize; block++)
            {
                int divisor = divisors[block];
                if (divisor == 0)
                {
                    // No isolated pixels in this block: use 0 instead of dividing by zero.
                    ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, 0);
                    for (int table = 0; table < 8; table++)
                        ParallelMath::PutUInt15(hModeIsolatedQuantized[table][ch], block, 0);
                }
                else
                {
                    ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, ParallelMath::Extract(numerator, block) / divisor);
                    for (int table = 0; table < 8; table++)
                        ParallelMath::PutUInt15(hModeIsolatedQuantized[table][ch], block, ParallelMath::Extract(hModeIsolatedNumerators[table], block) / divisor);
                }
            }

            isolatedAverageTargets[ch] = numerator;
        }
    }

    if (isFakeBT709)
        ResolveTHFakeBT709Rounding(isolatedAverageQuantized, isolatedAverageTargets, numPixelsIsolated);

    // Adding the modifier can push the quotient past the 4-bit range.
    for (int table = 0; table < 8; table++)
        for (int ch = 0; ch < 3; ch++)
            hModeIsolatedQuantized[table][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), hModeIsolatedQuantized[table][ch]);

    MUInt15 isolatedColor[3];
    for (int ch = 0; ch < 3; ch++)
        isolatedColor[ch] = (isolatedAverageQuantized[ch]) | (isolatedAverageQuantized[ch] << 4);

    MFloat isolatedError[16];
    for (int px = 0; px < 16; px++)
    {
        if (isFakeBT709)
            isolatedError[px] = ComputeErrorFakeBT709(isolatedColor, preWeightedPixels[px]);
        else if (isUniform)
            isolatedError[px] = ComputeErrorUniform(pixels[px], isolatedColor);
        else
            isolatedError[px] = ComputeErrorWeighted(isolatedColor, preWeightedPixels[px], options);

        ParallelMath::ConditionalSet(isolatedError[px], isTransparentF[px], ParallelMath::MakeFloatZero());
    }

    MSInt32 bestSelectors = ParallelMath::MakeSInt32(0);
    MUInt15 bestTable = ParallelMath::MakeUInt15(0);
    MUInt15 bestLineColor = ParallelMath::MakeUInt15(0);
    MUInt15 bestHModeColor2 = ParallelMath::MakeUInt15(0);

    MSInt16 maxLine = ParallelMath::LosslessCast<MSInt16>::Cast(numPixelsLine);
    MSInt16 minLine = ParallelMath::MakeSInt16(0) - maxLine;

    // The offset scan below is shared by all blocks, so it must cover the widest per-block range.
    int16_t clusterMaxLine = 0;
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
    {
        int16_t blockMaxLine = ParallelMath::Extract(maxLine, block);
        if (blockMaxLine > clusterMaxLine)
            clusterMaxLine = blockMaxLine;
    }

    int16_t clusterMinLine = -clusterMaxLine;

    int lineDivisors[ParallelMath::ParallelSize];
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
        lineDivisors[block] = ParallelMath::Extract(numPixelsLine, block) * 34;

    MUInt15 lineAddend = (numPixelsLine << 4) | numPixelsLine;

    for (int table = 0; table < 8; table++)
    {
        int numUniqueColors[ParallelMath::ParallelSize];
        MUInt15 uniqueQuantizedColors[31];

        for (int block = 0; block < ParallelMath::ParallelSize; block++)
            numUniqueColors[block] = 0;

        MUInt15 modifier = ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]);
        MUInt15 modifierOffset = (modifier + modifier);

        // NOTE(review): this scan steps by 2 while the equivalent scan in EncodeHMode steps by 1.
        // Presumably intentional (moving one pixel between +M and -M shifts the premultiplier by 2) - confirm.
        for (int16_t offsetPremultiplier = clusterMinLine; offsetPremultiplier <= clusterMaxLine; offsetPremultiplier += 2)
        {
            // Clamp to each block's own range; blocks with fewer line pixels repeat their end points.
            MSInt16 clampedOffsetPremultiplier = ParallelMath::Max(minLine, ParallelMath::Min(maxLine, ParallelMath::MakeSInt16(offsetPremultiplier)));
            MSInt16 modifierAddend = ParallelMath::CompactMultiply(clampedOffsetPremultiplier, modifierOffset);

            MUInt15 quantized[3];
            if (isFakeBT709)
            {
                MUInt15 targets[3];
                for (int ch = 0; ch < 3; ch++)
                {
                    //quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));
                    MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch]) + modifierAddend));
                    MUInt15 divided = ParallelMath::MakeUInt15(0);
                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
                    {
                        int divisor = lineDivisors[block];
                        if (divisor == 0)
                            ParallelMath::PutUInt15(divided, block, 0);
                        else
                            ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
                    }
                    quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
                    targets[ch] = numerator;
                }

                ResolveTHFakeBT709Rounding(quantized, targets, numPixelsLine);
            }
            else
            {
                for (int ch = 0; ch < 3; ch++)
                {
                    //quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + numDAIILine * 17 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));
                    MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch] + lineAddend) + modifierAddend));
                    MUInt15 divided = ParallelMath::MakeUInt15(0);
                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
                    {
                        int divisor = lineDivisors[block];
                        if (divisor == 0)
                            ParallelMath::PutUInt15(divided, block, 0);
                        else
                            ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
                    }
                    quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
                }
            }

            // Pack as channel 0 in bits 10..14, channel 1 in bits 5..9, channel 2 in bits 0..4.
            MUInt15 packedColor = (quantized[0] << 10) | (quantized[1] << 5) | quantized[2];

            // Append per block, dropping consecutive duplicates only.
            for (int block = 0; block < ParallelMath::ParallelSize; block++)
            {
                uint16_t blockPackedColor = ParallelMath::Extract(packedColor, block);
                if (numUniqueColors[block] == 0 || blockPackedColor != ParallelMath::Extract(uniqueQuantizedColors[numUniqueColors[block] - 1], block))
                    ParallelMath::PutUInt15(uniqueQuantizedColors[numUniqueColors[block]++], block, blockPackedColor);
            }
        }

        // Stripe unfilled unique colors
        int maxUniqueColors = 0;
        for (int block = 0; block < ParallelMath::ParallelSize; block++)
        {
            if (numUniqueColors[block] > maxUniqueColors)
                maxUniqueColors = numUniqueColors[block];
        }

        for (int block = 0; block < ParallelMath::ParallelSize; block++)
        {
            uint16_t fillColor = ParallelMath::Extract(uniqueQuantizedColors[0], block);

            // Start at the first unused slot. Starting one later would leave that slot unwritten for this
            // block even though the candidate loop below reads every slot up to maxUniqueColors.
            int numUnique = numUniqueColors[block];
            for (int fill = numUnique; fill < maxUniqueColors; fill++)
                ParallelMath::PutUInt15(uniqueQuantizedColors[fill], block, fillColor);
        }

        // Error of representing each pixel with the H-mode form of the isolated color (color 2 - modifier).
        MFloat hModeErrors[16];
        MUInt15 hModeUnquantizedColor[3];
        for (int ch = 0; ch < 3; ch++)
        {
            MUInt15 quantizedChannel = hModeIsolatedQuantized[table][ch];

            MUInt15 unquantizedCh = (quantizedChannel << 4) | quantizedChannel;
            hModeUnquantizedColor[ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedCh) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));
        }

        for (int px = 0; px < 16; px++)
        {
            // Same metric selection as isolatedError, since the two are compared against the same line errors.
            if (isFakeBT709)
                hModeErrors[px] = ComputeErrorFakeBT709(hModeUnquantizedColor, preWeightedPixels[px]);
            else if (isUniform)
                hModeErrors[px] = ComputeErrorUniform(hModeUnquantizedColor, pixels[px]);
            else
                hModeErrors[px] = ComputeErrorWeighted(hModeUnquantizedColor, preWeightedPixels[px], options);

            ParallelMath::ConditionalSet(hModeErrors[px], isTransparentF[px], ParallelMath::MakeFloatZero());
        }

        MUInt15 packedHModeColor2 = (hModeIsolatedQuantized[table][0] << 10) | (hModeIsolatedQuantized[table][1] << 5) | hModeIsolatedQuantized[table][2];
        ParallelMath::Int16CompFlag tableLowBitIsZero = ((table & 1) == 0) ? ParallelMath::MakeBoolInt16(true) : ParallelMath::MakeBoolInt16(false);

        for (int ci = 0; ci < maxUniqueColors; ci++)
        {
            MUInt15 lineColors[2][3];
            for (int ch = 0; ch < 3; ch++)
            {
                MUInt15 quantizedChannel = (ParallelMath::RightShift(uniqueQuantizedColors[ci], 10 - (ch * 5)) & ParallelMath::MakeUInt15(15));

                MUInt15 unquantizedColor = (quantizedChannel << 4) | quantizedChannel;
                lineColors[0][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantizedColor + modifier);
                lineColors[1][ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));
            }

            MUInt15 bestLineSelector[16];
            MFloat bestLineError[16];
            for (int px = 0; px < 16; px++)
            {
                MFloat lineErrors[2];
                for (int i = 0; i < 2; i++)
                {
                    if (isFakeBT709)
                        lineErrors[i] = ComputeErrorFakeBT709(lineColors[i], preWeightedPixels[px]);
                    else if (isUniform)
                        lineErrors[i] = ComputeErrorUniform(lineColors[i], pixels[px]);
                    else
                        lineErrors[i] = ComputeErrorWeighted(lineColors[i], preWeightedPixels[px], options);
                }

                // Selector 1 is line + modifier, selector 3 is line - modifier.
                ParallelMath::Int16CompFlag firstIsBetter = ParallelMath::FloatFlagToInt16(ParallelMath::LessOrEqual(lineErrors[0], lineErrors[1]));
                bestLineSelector[px] = ParallelMath::Select(firstIsBetter, ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(3));
                bestLineError[px] = ParallelMath::Min(lineErrors[0], lineErrors[1]);

                ParallelMath::ConditionalSet(bestLineError[px], isTransparentF[px], ParallelMath::MakeFloatZero());
            }

            // One case considered here was if it was possible to force H mode to be valid when the line color is unused.
            // That case isn't actually useful because it's equivalent to the isolated color being unused at maximum offset,
            // which is always checked after a swap.
            MFloat tModeError = ParallelMath::MakeFloatZero();
            MFloat hModeError = ParallelMath::MakeFloatZero();
            for (int px = 0; px < 16; px++)
            {
                tModeError = tModeError + ParallelMath::Min(bestLineError[px], isolatedError[px]);
                hModeError = hModeError + ParallelMath::Min(bestLineError[px], hModeErrors[px]);
            }

            ParallelMath::FloatCompFlag hLessError = ParallelMath::Less(hModeError, tModeError);

            MUInt15 packedHModeColor1 = uniqueQuantizedColors[ci];

            // H mode is only usable when the color ordering matches the low bit of this table index.
            ParallelMath::Int16CompFlag hModeTableLowBitMustBeZero = ParallelMath::Less(packedHModeColor1, packedHModeColor2);

            ParallelMath::Int16CompFlag hModeIsLegal = ParallelMath::Equal(hModeTableLowBitMustBeZero, tableLowBitIsZero);
            ParallelMath::Int16CompFlag useHMode = ParallelMath::FloatFlagToInt16(hLessError) & hModeIsLegal;

            MFloat roundBestError = tModeError;
            ParallelMath::ConditionalSet(roundBestError, ParallelMath::Int16FlagToFloat(useHMode), hModeError);

            ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(roundBestError, bestError));
            ParallelMath::FloatCompFlag useHModeF = ParallelMath::Int16FlagToFloat(useHMode);

            if (ParallelMath::AnySet(errorBetter))
            {
                // Selectors use the T-mode numbering: 0 isolated, 1 line + M, 2 transparent, 3 line - M.
                MSInt32 selectors = ParallelMath::MakeSInt32(0);
                for (int px = 0; px < 16; px++)
                {
                    MUInt15 selector = bestLineSelector[px];

                    MFloat isolatedPixelError = ParallelMath::Select(useHModeF, hModeErrors[px], isolatedError[px]);
                    ParallelMath::Int16CompFlag isolatedBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(isolatedPixelError, bestLineError[px]));

                    ParallelMath::ConditionalSet(selector, isolatedBetter, ParallelMath::MakeUInt15(0));
                    ParallelMath::ConditionalSet(selector, isTransparent[px], ParallelMath::MakeUInt15(2));
                    selectors = selectors | (ParallelMath::ToInt32(selector) << (px * 2));
                }

                bestError = ParallelMath::Min(bestError, roundBestError);
                ParallelMath::ConditionalSet(bestLineColor, errorBetter, uniqueQuantizedColors[ci]);
                ParallelMath::ConditionalSet(bestSelectors, errorBetter, selectors);
                ParallelMath::ConditionalSet(bestTable, errorBetter, ParallelMath::MakeUInt15(table));
                ParallelMath::ConditionalSet(bestIsHMode, errorBetter, useHMode);
                ParallelMath::ConditionalSet(bestHModeColor2, errorBetter, packedHModeColor2);

                bestIsThisMode = bestIsThisMode | errorBetter;
            }
        }
    }

    // Emit only the blocks for which this mode produced a new best result.
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
    {
        if (ParallelMath::Extract(bestIsThisMode, block))
        {
            uint16_t blockBestLineColor = ParallelMath::Extract(bestLineColor, block);
            ParallelMath::ScalarUInt16 blockIsolatedAverageQuantized[3];

            for (int ch = 0; ch < 3; ch++)
                blockIsolatedAverageQuantized[ch] = ParallelMath::Extract(isolatedAverageQuantized[ch], block);

            uint16_t blockBestTable = ParallelMath::Extract(bestTable, block);
            int32_t blockBestSelectors = ParallelMath::Extract(bestSelectors, block);

            ParallelMath::ScalarUInt16 lineColor[3];
            for (int ch = 0; ch < 3; ch++)
                lineColor[ch] = (blockBestLineColor >> (10 - (ch * 5))) & 15;

            if (ParallelMath::Extract(bestIsHMode, block))
            {
                // T mode: C1, C2+M, Transparent, C2-M
                // H mode: C1+M, C1-M, Transparent, C2-M
                static const ParallelMath::ScalarUInt16 selectorRemapSector[4] = { 1, 0, 1, 0 };
                static const ParallelMath::ScalarUInt16 selectorRemapSign[4] = { 1, 0, 0, 1 };

                // Remap selectors
                ParallelMath::ScalarUInt16 signBits = 0;
                ParallelMath::ScalarUInt16 sectorBits = 0;
                for (int px = 0; px < 16; px++)
                {
                    int32_t selector = (blockBestSelectors >> (px * 2)) & 3;
                    sectorBits |= (selectorRemapSector[selector] << px);
                    signBits |= (selectorRemapSign[selector] << px);
                }

                // The line color becomes H-mode color 1; the offset isolated color becomes color 2.
                ParallelMath::ScalarUInt16 blockColors[2] = { blockBestLineColor, ParallelMath::Extract(bestHModeColor2, block) };

                EmitHModeBlock(outputBuffer + block * 8, blockColors, sectorBits, signBits, blockBestTable, false);
            }
            else
                EmitTModeBlock(outputBuffer + block * 8, lineColor, blockIsolatedAverageQuantized, blockBestSelectors, blockBestTable, false);
        }
    }
}
1264

1265

1266
// Expands a quantized planar coefficient by shifting it left and copying its top bits into the vacated
// low bits. Channel 1 is shifted by one bit; every other channel is shifted by two.
// (Presumably channel 1 is stored with 7 bits and the others with 6, giving an 8-bit result - confirm
// against the planar block layout.)
cvtt::ParallelMath::UInt15 cvtt::Internal::ETCComputer::DecodePlanarCoeff(const MUInt15 &coeff, int ch)
{
    if (ch != 1)
        return (coeff << 2) | (ParallelMath::RightShift(coeff, 4));

    return (coeff << 1) | (ParallelMath::RightShift(coeff, 6));
}
1273

1274
void cvtt::Internal::ETCComputer::EncodePlanar(uint8_t *outputBuffer, MFloat &bestError, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const Options &options)
1275
{
1276
    // NOTE: If it's desired to do this in another color space, the best way to do it would probably be
1277
    // to do everything in that color space and then transform it back to RGB.
1278

1279
    // We compute H = (H-O)/4 and V= (V-O)/4 to simplify the math
1280

1281
    // error = (x*H + y*V + O - C)^2
1282
    MFloat h[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
1283
    MFloat v[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
1284
    MFloat o[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
1285

1286
    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
1287
    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
1288

1289
    MFloat totalError = ParallelMath::MakeFloatZero();
1290
    MUInt15 bestCoeffs[3][3];	// [Channel][Coeff]
1291
    for (int ch = 0; ch < 3; ch++)
1292
    {
1293
        float fhh = 0.f;
1294
        float fho = 0.f;
1295
        float fhv = 0.f;
1296
        float foo = 0.f;
1297
        float fov = 0.f;
1298
        float fvv = 0.f;
1299
        MFloat fc = ParallelMath::MakeFloatZero();
1300
        MFloat fh = ParallelMath::MakeFloatZero();
1301
        MFloat fv = ParallelMath::MakeFloatZero();
1302
        MFloat fo = ParallelMath::MakeFloatZero();
1303

1304
        float &foh = fho;
1305
        float &fvh = fhv;
1306
        float &fvo = fov;
1307

1308
        for (int px = 0; px < 16; px++)
1309
        {
1310
            float x = static_cast<float>(px % 4);
1311
            float y = static_cast<float>(px / 4);
1312
            MFloat c = isFakeBT709 ? preWeightedPixels[px][ch] : ParallelMath::ToFloat(pixels[px][ch]);
1313

1314
            // (x*H + y*V + O - C)^2
1315
            fhh += x * x;
1316
            fhv += x * y;
1317
            fho += x;
1318
            fh = fh - c * x;
1319

1320
            fvh += y * x;
1321
            fvv += y * y;
1322
            fvo += y;
1323
            fv = fv - c * y;
1324

1325
            foh += x;
1326
            fov += y;
1327
            foo += 1;
1328
            fo = fo - c;
1329

1330
            fh = fh - c * x;
1331
            fv = fv - c * y;
1332
            fo = fo - c;
1333
            fc = fc + c * c;
1334
        }
1335

1336
        //float totalError = fhh * h * h + fho * h*o + fhv * h*v + foo * o * o + fov * o*v + fvv * v * v + fh * h + fv * v + fo * o + fc;
1337

1338
        // error = fhh*h^2 + fho*h*o + fhv*h*v + foo*o^2 + fov*o*v + fvv*v^2 + fh*h + fv*v + fo*o + fc
1339
        // derror/dh = 2*fhh*h + fho*o + fhv*v + fh
1340
        // derror/dv = fhv*h + fov*o + 2*fvv*v + fv
1341
        // derror/do = fho*h + 2*foo*o + fov*v + fo
1342

1343
        // Solve system of equations
1344
        // h o v 1 = 0
1345
        // -------
1346
        // d e f g  R0
1347
        // i j k l  R1
1348
        // m n p q  R2
1349

1350
        float d = 2.0f * fhh;
1351
        float e = fho;
1352
        float f = fhv;
1353
        MFloat gD = fh;
1354

1355
        float i = fhv;
1356
        float j = fov;
1357
        float k = 2.0f * fvv;
1358
        MFloat lD = fv;
1359

1360
        float m = fho;
1361
        float n = 2.0f * foo;
1362
        float p = fov;
1363
        MFloat qD = fo;
1364

1365
        {
1366
            // Factor out first column from R1 and R2
1367
            float r0to1 = -i / d;
1368
            float r0to2 = -m / d;
1369

1370
            // 0 j1 k1 l1D
1371
            float j1 = j + r0to1 * e;
1372
            float k1 = k + r0to1 * f;
1373
            MFloat l1D = lD + gD * r0to1;
1374

1375
            // 0 n1 p1 q1D
1376
            float n1 = n + r0to2 * e;
1377
            float p1 = p + r0to2 * f;
1378
            MFloat q1D = qD + gD * r0to2;
1379

1380
            // Factor out third column from R2
1381
            float r1to2 = -p1 / k1;
1382

1383
            // 0 n2 0 q2D
1384
            float n2 = n1 + r1to2 * j1;
1385
            MFloat q2D = q1D + l1D * r1to2;
1386

1387
            o[ch] = -q2D / n2;
1388

1389
            // Factor out second column from R1
1390
            // 0 n2 0 q2D
1391

1392
            float r2to1 = -j1 / n2;
1393

1394
            // 0 0 k1 l2D
1395
            // 0 n2 0 q2D
1396
            MFloat l2D = l1D + q2D * r2to1;
1397

1398
            float elim2 = -f / k1;
1399
            float elim1 = -e / n2;
1400

1401
            // d 0 0 g2D
1402
            MFloat g2D = gD + l2D * elim2 + q2D * elim1;
1403

1404
            // n2*o + q2 = 0
1405
            // o = -q2 / n2
1406
            h[ch] = -g2D / d;
1407
            v[ch] = -l2D / k1;
1408
        }
1409

1410
        // Undo the local transformation
1411
        h[ch] = h[ch] * 4.0f + o[ch];
1412
        v[ch] = v[ch] * 4.0f + o[ch];
1413
    }
1414

1415
    if (isFakeBT709)
1416
    {
1417
        MFloat oRGB[3];
1418
        MFloat hRGB[3];
1419
        MFloat vRGB[3];
1420

1421
        ConvertFromFakeBT709(oRGB, o);
1422
        ConvertFromFakeBT709(hRGB, h);
1423
        ConvertFromFakeBT709(vRGB, v);
1424

1425
        // Twiddling in fake BT.607 is a mess, just round off for now (the precision is pretty good anyway)
1426
        {
1427
            ParallelMath::RoundTowardNearestForScope rtn;
1428

1429
            for (int ch = 0; ch < 3; ch++)
1430
            {
1431
                MFloat fcoeffs[3] = { oRGB[ch], hRGB[ch], vRGB[ch] };
1432

1433
                for (int c = 0; c < 3; c++)
1434
                {
1435
                    MFloat coeff = ParallelMath::Max(ParallelMath::MakeFloatZero(), fcoeffs[c]);
1436
                    if (ch == 1)
1437
                        coeff = ParallelMath::Min(ParallelMath::MakeFloat(127.0f), coeff * (127.0f / 255.0f));
1438
                    else
1439
                        coeff = ParallelMath::Min(ParallelMath::MakeFloat(63.0f), coeff * (63.0f / 255.0f));
1440
                    fcoeffs[c] = coeff;
1441
                }
1442

1443
                for (int c = 0; c < 3; c++)
1444
                    bestCoeffs[ch][c] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &rtn);
1445
            }
1446
        }
1447

1448
        MUInt15 reconstructed[16][3];
1449
        for (int ch = 0; ch < 3; ch++)
1450
        {
1451
            MUInt15 dO = DecodePlanarCoeff(bestCoeffs[ch][0], ch);
1452
            MUInt15 dH = DecodePlanarCoeff(bestCoeffs[ch][1], ch);
1453
            MUInt15 dV = DecodePlanarCoeff(bestCoeffs[ch][2], ch);
1454

1455
            MSInt16 hMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dH) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
1456
            MSInt16 vMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dV) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
1457

1458
            MFloat error = ParallelMath::MakeFloatZero();
1459

1460
            MSInt16 addend = ParallelMath::LosslessCast<MSInt16>::Cast(dO << 2) + 2;
1461

1462
            for (int px = 0; px < 16; px++)
1463
            {
1464
                MUInt15 pxv = ParallelMath::MakeUInt15(px);
1465
                MSInt16 x = ParallelMath::LosslessCast<MSInt16>::Cast(pxv & ParallelMath::MakeUInt15(3));
1466
                MSInt16 y = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RightShift(pxv, 2));
1467

1468
                MSInt16 interpolated = ParallelMath::RightShift(ParallelMath::CompactMultiply(x, hMinusO) + ParallelMath::CompactMultiply(y, vMinusO) + addend, 2);
1469
                MUInt15 clampedLow = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), interpolated));
1470
                reconstructed[px][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(255), clampedLow);
1471
            }
1472
        }
1473

1474
        totalError = ParallelMath::MakeFloatZero();
1475
        for (int px = 0; px < 16; px++)
1476
            totalError = totalError + ComputeErrorFakeBT709(reconstructed[px], preWeightedPixels[px]);
1477
    }
1478
    else
1479
    {
1480
        for (int ch = 0; ch < 3; ch++)
1481
        {
1482
            MFloat fcoeffs[3] = { o[ch], h[ch], v[ch] };
1483
            MUInt15 coeffRanges[3][2];
1484

1485
            for (int c = 0; c < 3; c++)
1486
            {
1487
                MFloat coeff = ParallelMath::Max(ParallelMath::MakeFloatZero(), fcoeffs[c]);
1488
                if (ch == 1)
1489
                    coeff = ParallelMath::Min(ParallelMath::MakeFloat(127.0f), coeff * (127.0f / 255.0f));
1490
                else
1491
                    coeff = ParallelMath::Min(ParallelMath::MakeFloat(63.0f), coeff * (63.0f / 255.0f));
1492
                fcoeffs[c] = coeff;
1493
            }
1494

1495
            {
1496
                ParallelMath::RoundDownForScope rd;
1497
                for (int c = 0; c < 3; c++)
1498
                    coeffRanges[c][0] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &rd);
1499
            }
1500

1501
            {
1502
                ParallelMath::RoundUpForScope ru;
1503
                for (int c = 0; c < 3; c++)
1504
                    coeffRanges[c][1] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &ru);
1505
            }
1506

1507
            MFloat bestChannelError = ParallelMath::MakeFloat(FLT_MAX);
1508
            for (int io = 0; io < 2; io++)
1509
            {
1510
                MUInt15 dO = DecodePlanarCoeff(coeffRanges[0][io], ch);
1511

1512
                for (int ih = 0; ih < 2; ih++)
1513
                {
1514
                    MUInt15 dH = DecodePlanarCoeff(coeffRanges[1][ih], ch);
1515
                    MSInt16 hMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dH) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
1516

1517
                    for (int iv = 0; iv < 2; iv++)
1518
                    {
1519
                        MUInt15 dV = DecodePlanarCoeff(coeffRanges[2][iv], ch);
1520
                        MSInt16 vMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dV) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
1521

1522
                        MFloat error = ParallelMath::MakeFloatZero();
1523

1524
                        MSInt16 addend = ParallelMath::LosslessCast<MSInt16>::Cast(dO << 2) + 2;
1525

1526
                        for (int px = 0; px < 16; px++)
1527
                        {
1528
                            MUInt15 pxv = ParallelMath::MakeUInt15(px);
1529
                            MSInt16 x = ParallelMath::LosslessCast<MSInt16>::Cast(pxv & ParallelMath::MakeUInt15(3));
1530
                            MSInt16 y = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RightShift(pxv, 2));
1531

1532
                            MSInt16 interpolated = ParallelMath::RightShift(ParallelMath::CompactMultiply(x, hMinusO) + ParallelMath::CompactMultiply(y, vMinusO) + addend, 2);
1533
                            MUInt15 clampedLow = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), interpolated));
1534
                            MUInt15 dec = ParallelMath::Min(ParallelMath::MakeUInt15(255), clampedLow);
1535

1536
                            MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(dec);
1537

1538
                            MFloat deltaF = ParallelMath::ToFloat(delta);
1539
                            error = error + deltaF * deltaF;
1540
                        }
1541

1542
                        ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestChannelError));
1543
                        if (ParallelMath::AnySet(errorBetter))
1544
                        {
1545
                            bestChannelError = ParallelMath::Min(error, bestChannelError);
1546
                            ParallelMath::ConditionalSet(bestCoeffs[ch][0], errorBetter, coeffRanges[0][io]);
1547
                            ParallelMath::ConditionalSet(bestCoeffs[ch][1], errorBetter, coeffRanges[1][ih]);
1548
                            ParallelMath::ConditionalSet(bestCoeffs[ch][2], errorBetter, coeffRanges[2][iv]);
1549
                        }
1550
                    }
1551
                }
1552
            }
1553

1554
            if (!isUniform)
1555
            {
1556
                switch (ch)
1557
                {
1558
                case 0:
1559
                    bestChannelError = bestChannelError * (options.redWeight * options.redWeight);
1560
                    break;
1561
                case 1:
1562
                    bestChannelError = bestChannelError * (options.greenWeight * options.greenWeight);
1563
                    break;
1564
                case 2:
1565
                    bestChannelError = bestChannelError * (options.blueWeight * options.blueWeight);
1566
                    break;
1567
                default:
1568
                    break;
1569
                }
1570
            }
1571

1572
            totalError = totalError + bestChannelError;
1573
        }
1574
    }
1575

1576
    ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(totalError, bestError));
1577
    if (ParallelMath::AnySet(errorBetter))
1578
    {
1579
        bestError = ParallelMath::Min(bestError, totalError);
1580

1581
        for (int block = 0; block < ParallelMath::ParallelSize; block++)
1582
        {
1583
            if (!ParallelMath::Extract(errorBetter, block))
1584
                continue;
1585

1586
            int ro = ParallelMath::Extract(bestCoeffs[0][0], block);
1587
            int rh = ParallelMath::Extract(bestCoeffs[0][1], block);
1588
            int rv = ParallelMath::Extract(bestCoeffs[0][2], block);
1589

1590
            int go = ParallelMath::Extract(bestCoeffs[1][0], block);
1591
            int gh = ParallelMath::Extract(bestCoeffs[1][1], block);
1592
            int gv = ParallelMath::Extract(bestCoeffs[1][2], block);
1593

1594
            int bo = ParallelMath::Extract(bestCoeffs[2][0], block);
1595
            int bh = ParallelMath::Extract(bestCoeffs[2][1], block);
1596
            int bv = ParallelMath::Extract(bestCoeffs[2][2], block);
1597

1598
            int go1 = go >> 6;
1599
            int go2 = go & 63;
1600

1601
            int bo1 = bo >> 5;
1602
            int bo2 = (bo >> 3) & 3;
1603
            int bo3 = bo & 7;
1604

1605
            int rh1 = (rh >> 1);
1606
            int rh2 = rh & 1;
1607

1608
            int fakeR = ro >> 2;
1609
            int fakeDR = go1 | ((ro & 3) << 1);
1610

1611
            int fakeG = (go2 >> 2);
1612
            int fakeDG = ((go2 & 3) << 1) | bo1;
1613

1614
            int fakeB = bo2;
1615
            int fakeDB = bo3 >> 1;
1616

1617
            uint32_t highBits = 0;
1618
            uint32_t lowBits = 0;
1619

1620
            // Avoid overflowing R
1621
            if ((fakeDR & 4) != 0 && fakeR + fakeDR < 8)
1622
                highBits |= 1 << (63 - 32);
1623

1624
            // Avoid overflowing G
1625
            if ((fakeDG & 4) != 0 && fakeG + fakeDG < 8)
1626
                highBits |= 1 << (55 - 32);
1627

1628
            // Overflow B
1629
            if (fakeB + fakeDB < 4)
1630
            {
1631
                // Overflow low
1632
                highBits |= 1 << (42 - 32);
1633
            }
1634
            else
1635
            {
1636
                // Overflow high
1637
                highBits |= 7 << (45 - 32);
1638
            }
1639

1640
            highBits |= ro << (57 - 32);
1641
            highBits |= go1 << (56 - 32);
1642
            highBits |= go2 << (49 - 32);
1643
            highBits |= bo1 << (48 - 32);
1644
            highBits |= bo2 << (43 - 32);
1645
            highBits |= bo3 << (39 - 32);
1646
            highBits |= rh1 << (34 - 32);
1647
            highBits |= 1 << (33 - 32);
1648
            highBits |= rh2 << (32 - 32);
1649

1650
            lowBits |= gh << 25;
1651
            lowBits |= bh << 19;
1652
            lowBits |= rv << 13;
1653
            lowBits |= gv << 6;
1654
            lowBits |= bv << 0;
1655

1656
            for (int i = 0; i < 4; i++)
1657
                outputBuffer[block * 8 + i] = (highBits >> (24 - i * 8)) & 0xff;
1658
            for (int i = 0; i < 4; i++)
1659
                outputBuffer[block * 8 + i + 4] = (lowBits >> (24 - i * 8)) & 0xff;
1660
        }
1661
    }
1662
}
1663

1664
void cvtt::Internal::ETCComputer::CompressETC2Block(uint8_t *outputBuffer, const PixelBlockU8 *pixelBlocks, ETC2CompressionData *compressionData, const Options &options, bool punchthroughAlpha)
1665
{
1666
    ParallelMath::Int16CompFlag pixelIsTransparent[16];
1667
    ParallelMath::Int16CompFlag anyTransparent = ParallelMath::MakeBoolInt16(false);
1668
    ParallelMath::Int16CompFlag allTransparent = ParallelMath::MakeBoolInt16(true);
1669

1670
    if (punchthroughAlpha)
1671
    {
1672
        const float fThreshold = std::max<float>(std::min<float>(1.0f, options.threshold), 0.0f) * 255.0f;
1673

1674
        // +1.0f is intentional, we want to take the next valid integer (even if it's 256) since everything else lower is transparent
1675
        MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(std::floor(fThreshold + 1.0f)));
1676

1677
        for (int px = 0; px < 16; px++)
1678
        {
1679
            MUInt15 alpha;
1680
            for (int block = 0; block < ParallelMath::ParallelSize; block++)
1681
                ParallelMath::PutUInt15(alpha, block, pixelBlocks[block].m_pixels[px][3]);
1682

1683
            ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(alpha, threshold);
1684
            anyTransparent = (anyTransparent | isTransparent);
1685
            allTransparent = (allTransparent & isTransparent);
1686
            pixelIsTransparent[px] = isTransparent;
1687
        }
1688
    }
1689
    else
1690
    {
1691
        for (int px = 0; px < 16; px++)
1692
            pixelIsTransparent[px] = ParallelMath::MakeBoolInt16(false);
1693

1694
        allTransparent = anyTransparent = ParallelMath::MakeBoolInt16(false);
1695
    }
1696

1697
    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
1698

1699
    ETC2CompressionDataInternal* internalData = static_cast<ETC2CompressionDataInternal*>(compressionData);
1700

1701
    MUInt15 pixels[16][3];
1702
    MFloat preWeightedPixels[16][3];
1703
    ExtractBlocks(pixels, preWeightedPixels, pixelBlocks, options);
1704

1705
    if (ParallelMath::AnySet(anyTransparent))
1706
    {
1707
        for (int px = 0; px < 16; px++)
1708
        {
1709
            ParallelMath::Int16CompFlag flag = pixelIsTransparent[px];
1710
            ParallelMath::FloatCompFlag fflag = ParallelMath::Int16FlagToFloat(flag);
1711

1712
            for (int ch = 0; ch < 3; ch++)
1713
            {
1714
                ParallelMath::ConditionalSet(pixels[px][ch], flag, ParallelMath::MakeUInt15(0));
1715
                ParallelMath::ConditionalSet(preWeightedPixels[px][ch], fflag, ParallelMath::MakeFloat(0.0f));
1716
            }
1717
        }
1718
    }
1719

1720
    if (!ParallelMath::AllSet(allTransparent))
1721
        EncodePlanar(outputBuffer, bestError, pixels, preWeightedPixels, options);
1722

1723
    MFloat chromaDelta[16][2];
1724

1725
    MUInt15 numOpaque = ParallelMath::MakeUInt15(16);
1726
    for (int px = 0; px < 16; px++)
1727
        numOpaque = numOpaque - ParallelMath::SelectOrZero(pixelIsTransparent[px], ParallelMath::MakeUInt15(1));
1728

1729
    if (options.flags & cvtt::Flags::Uniform)
1730
    {
1731
        MSInt16 chromaCoordinates3[16][2];
1732
        for (int px = 0; px < 16; px++)
1733
        {
1734
            chromaCoordinates3[px][0] = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][0]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][2]);
1735
            chromaCoordinates3[px][1] = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][0]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][1] << 1) + ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][2]);
1736
        }
1737

1738
        MSInt16 chromaCoordinateCentroid[2] = { ParallelMath::MakeSInt16(0), ParallelMath::MakeSInt16(0) };
1739
        for (int px = 0; px < 16; px++)
1740
        {
1741
            for (int ch = 0; ch < 2; ch++)
1742
                chromaCoordinateCentroid[ch] = chromaCoordinateCentroid[ch] + chromaCoordinates3[px][ch];
1743
        }
1744

1745
        if (punchthroughAlpha)
1746
        {
1747
            for (int px = 0; px < 16; px++)
1748
            {
1749
                for (int ch = 0; ch < 2; ch++)
1750
                {
1751
                    MUInt15 chromaCoordinateMultiplied = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(chromaCoordinates3[px][ch], numOpaque));
1752
                    MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(chromaCoordinateMultiplied) - chromaCoordinateCentroid[ch];
1753
                    chromaDelta[px][ch] = ParallelMath::ToFloat(delta);
1754
                }
1755
            }
1756
        }
1757
        else
1758
        {
1759
            for (int px = 0; px < 16; px++)
1760
            {
1761
                for (int ch = 0; ch < 2; ch++)
1762
                    chromaDelta[px][ch] = ParallelMath::ToFloat((chromaCoordinates3[px][ch] << 4) - chromaCoordinateCentroid[ch]);
1763
            }
1764
        }
1765

1766
        const MFloat rcpSqrt3 = ParallelMath::MakeFloat(0.57735026918962576450914878050196f);
1767

1768
        for (int px = 0; px < 16; px++)
1769
            chromaDelta[px][1] = chromaDelta[px][1] * rcpSqrt3;
1770
    }
1771
    else
1772
    {
1773
        const float chromaAxis0[3] = { internalData->m_chromaSideAxis0[0], internalData->m_chromaSideAxis0[1], internalData->m_chromaSideAxis0[2] };
1774
        const float chromaAxis1[3] = { internalData->m_chromaSideAxis1[0], internalData->m_chromaSideAxis1[1], internalData->m_chromaSideAxis1[2] };
1775

1776
        MFloat chromaCoordinates3[16][2];
1777
        for (int px = 0; px < 16; px++)
1778
        {
1779
            const MFloat &px0 = preWeightedPixels[px][0];
1780
            const MFloat &px1 = preWeightedPixels[px][1];
1781
            const MFloat &px2 = preWeightedPixels[px][2];
1782

1783
            chromaCoordinates3[px][0] = px0 * chromaAxis0[0] + px1 * chromaAxis0[1] + px2 * chromaAxis0[2];
1784
            chromaCoordinates3[px][1] = px0 * chromaAxis1[0] + px1 * chromaAxis1[1] + px2 * chromaAxis1[2];
1785
        }
1786

1787
        MFloat chromaCoordinateCentroid[2] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
1788
        for (int px = 0; px < 16; px++)
1789
        {
1790
            for (int ch = 0; ch < 2; ch++)
1791
                chromaCoordinateCentroid[ch] = chromaCoordinateCentroid[ch] + chromaCoordinates3[px][ch];
1792
        }
1793

1794
        if (punchthroughAlpha)
1795
        {
1796
            const MFloat numOpaqueF = ParallelMath::ToFloat(numOpaque);
1797
            for (int px = 0; px < 16; px++)
1798
            {
1799
                for (int ch = 0; ch < 2; ch++)
1800
                {
1801
                    MFloat chromaCoordinateMultiplied = chromaCoordinates3[px][ch] * numOpaqueF;
1802
                    MFloat delta = chromaCoordinateMultiplied - chromaCoordinateCentroid[ch];
1803
                    chromaDelta[px][ch] = delta;
1804
                }
1805
            }
1806
        }
1807
        else
1808
        {
1809
            for (int px = 0; px < 16; px++)
1810
            {
1811
                for (int ch = 0; ch < 2; ch++)
1812
                    chromaDelta[px][ch] = chromaCoordinates3[px][ch] * 16.0f - chromaCoordinateCentroid[ch];
1813
            }
1814
        }
1815
    }
1816

1817

1818
    MFloat covXX = ParallelMath::MakeFloatZero();
1819
    MFloat covYY = ParallelMath::MakeFloatZero();
1820
    MFloat covXY = ParallelMath::MakeFloatZero();
1821

1822
    for (int px = 0; px < 16; px++)
1823
    {
1824
        MFloat nx = chromaDelta[px][0];
1825
        MFloat ny = chromaDelta[px][1];
1826

1827
        covXX = covXX + nx * nx;
1828
        covYY = covYY + ny * ny;
1829
        covXY = covXY + nx * ny;
1830
    }
1831

1832
    MFloat halfTrace = (covXX + covYY) * 0.5f;
1833
    MFloat det = covXX * covYY - covXY * covXY;
1834

1835
    MFloat mm = ParallelMath::Sqrt(ParallelMath::Max(ParallelMath::MakeFloatZero(), halfTrace * halfTrace - det));
1836

1837
    MFloat ev = halfTrace + mm;
1838

1839
    MFloat dx = (covYY - ev + covXY);
1840
    MFloat dy = -(covXX - ev + covXY);
1841

1842
    // If evenly distributed, pick an arbitrary plane
1843
    ParallelMath::FloatCompFlag allZero = ParallelMath::Equal(dx, ParallelMath::MakeFloatZero()) & ParallelMath::Equal(dy, ParallelMath::MakeFloatZero());
1844
    ParallelMath::ConditionalSet(dx, allZero, ParallelMath::MakeFloat(1.f));
1845

1846
    ParallelMath::Int16CompFlag sectorAssignments[16];
1847
    for (int px = 0; px < 16; px++)
1848
        sectorAssignments[px] = ParallelMath::FloatFlagToInt16(ParallelMath::Less(chromaDelta[px][0] * dx + chromaDelta[px][1] * dy, ParallelMath::MakeFloatZero()));
1849

1850
    if (!ParallelMath::AllSet(allTransparent))
1851
    {
1852
        EncodeTMode(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, options);
1853

1854
        // Flip sector assignments
1855
        for (int px = 0; px < 16; px++)
1856
            sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);
1857

1858
        EncodeTMode(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, options);
1859

1860
        EncodeHMode(outputBuffer, bestError, sectorAssignments, pixels, internalData->m_h, preWeightedPixels, options);
1861

1862
        CompressETC1BlockInternal(bestError, outputBuffer, pixels, preWeightedPixels, internalData->m_drs, options, true);
1863
    }
1864

1865
    if (ParallelMath::AnySet(anyTransparent))
1866
    {
1867
        if (!ParallelMath::AllSet(allTransparent))
1868
        {
1869
            // Flip sector assignments
1870
            for (int px = 0; px < 16; px++)
1871
                sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);
1872
        }
1873

1874
        // Reset the error of any transparent blocks to max and retry with punchthrough modes
1875
        ParallelMath::ConditionalSet(bestError, ParallelMath::Int16FlagToFloat(anyTransparent), ParallelMath::MakeFloat(FLT_MAX));
1876

1877
        EncodeVirtualTModePunchthrough(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, pixelIsTransparent, anyTransparent, allTransparent, options);
1878

1879
        // Flip sector assignments
1880
        for (int px = 0; px < 16; px++)
1881
            sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);
1882

1883
        EncodeVirtualTModePunchthrough(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, pixelIsTransparent, anyTransparent, allTransparent, options);
1884

1885
        CompressETC1PunchthroughBlockInternal(bestError, outputBuffer, pixels, preWeightedPixels, pixelIsTransparent, static_cast<ETC2CompressionDataInternal*>(compressionData)->m_drs, options);
1886
    }
1887
}
1888

1889
void cvtt::Internal::ETCComputer::CompressETC2AlphaBlock(uint8_t *outputBuffer, const PixelBlockU8 *pixelBlocks, const Options &options)
1890
{
1891
    MUInt15 pixels[16];
1892

1893
    for (int px = 0; px < 16; px++)
1894
    {
1895
        for (int block = 0; block < ParallelMath::ParallelSize; block++)
1896
            ParallelMath::PutUInt15(pixels[px], block, pixelBlocks[block].m_pixels[px][3]);
1897
    }
1898

1899
    CompressETC2AlphaBlockInternal(outputBuffer, pixels, false, false, options);
1900
}
1901

1902
void cvtt::Internal::ETCComputer::CompressETC2AlphaBlockInternal(uint8_t *outputBuffer, const MUInt15 pixels[16], bool is11Bit, bool isSigned, const Options &options)
1903
{
1904
    MUInt15 minAlpha = ParallelMath::MakeUInt15(is11Bit ? 2047 : 255);
1905
    MUInt15 maxAlpha = ParallelMath::MakeUInt15(0);
1906

1907
    for (int px = 0; px < 16; px++)
1908
    {
1909
        minAlpha = ParallelMath::Min(minAlpha, pixels[px]);
1910
        maxAlpha = ParallelMath::Max(maxAlpha, pixels[px]);
1911
    }
1912

1913
    MUInt15 alphaSpan = maxAlpha - minAlpha;
1914
    MUInt15 alphaSpanMidpointTimes2 = maxAlpha + minAlpha;
1915

1916
    MUInt31 bestTotalError = ParallelMath::MakeUInt31(0x7fffffff);
1917
    MUInt15 bestTableIndex = ParallelMath::MakeUInt15(0);
1918
    MUInt15 bestBaseCodeword = ParallelMath::MakeUInt15(0);
1919
    MUInt15 bestMultiplier = ParallelMath::MakeUInt15(0);
1920
    MUInt15 bestIndexes[16];
1921

1922
    for (int px = 0; px < 16; px++)
1923
        bestIndexes[px] = ParallelMath::MakeUInt15(0);
1924

1925
    const int numAlphaRanges = 10;
1926
    for (uint16_t tableIndex = 0; tableIndex < 16; tableIndex++)
1927
    {
1928
        for (int r = 0; r < numAlphaRanges; r++)
1929
        {
1930
            int subrange = r % 3;
1931
            int mainRange = r / 3;
1932

1933
            int16_t maxOffset = Tables::ETC2::g_alphaModifierTablePositive[tableIndex][3 - mainRange - (subrange & 1)];
1934
            int16_t minOffset = -Tables::ETC2::g_alphaModifierTablePositive[tableIndex][3 - mainRange - ((subrange >> 1) & 1)] - 1;
1935
            uint16_t offsetSpan = static_cast<uint16_t>(maxOffset - minOffset);
1936

1937
            MSInt16 vminOffset = ParallelMath::MakeSInt16(minOffset);
1938
            MUInt15 vmaxOffset = ParallelMath::MakeUInt15(maxOffset);
1939
            MUInt15 voffsetSpan = ParallelMath::MakeUInt15(offsetSpan);
1940

1941
            MUInt15 minMultiplier = ParallelMath::MakeUInt15(0);
1942
            for (int block = 0; block < ParallelMath::ParallelSize; block++)
1943
            {
1944
                uint16_t singleAlphaSpan = ParallelMath::Extract(alphaSpan, block);
1945

1946
                uint16_t lowMultiplier = singleAlphaSpan / offsetSpan;
1947
                ParallelMath::PutUInt15(minMultiplier, block, lowMultiplier);
1948
            }
1949

1950
            if (is11Bit)
1951
            {
1952
                // Clamps this to valid multipliers under 15 and rounds down to nearest multiple of 8
1953
                minMultiplier = ParallelMath::Min(minMultiplier, ParallelMath::MakeUInt15(112)) & ParallelMath::MakeUInt15(120);
1954
            }
1955
            else
1956
            {
1957
                // We cap at 1 and 14 so both multipliers are valid and dividable
1958
                // Cases where offset span is 0 should be caught by multiplier 1 of table 13
1959
                minMultiplier = ParallelMath::Max(ParallelMath::Min(minMultiplier, ParallelMath::MakeUInt15(14)), ParallelMath::MakeUInt15(1));
1960
            }
1961

1962
            for (uint16_t multiplierOffset = 0; multiplierOffset < 2; multiplierOffset++)
1963
            {
1964
                MUInt15 multiplier = minMultiplier;
1965

1966
                if (is11Bit)
1967
                {
1968
                    if (multiplierOffset == 1)
1969
                        multiplier = multiplier + ParallelMath::MakeUInt15(8);
1970
                    else
1971
                        multiplier = ParallelMath::Max(multiplier, ParallelMath::MakeUInt15(1));
1972
                }
1973
                else
1974
                {
1975
                    if (multiplierOffset == 1)
1976
                        multiplier = multiplier + ParallelMath::MakeUInt15(1);
1977
                }
1978

1979
                MSInt16 multipliedMinOffset = ParallelMath::CompactMultiply(ParallelMath::LosslessCast<MSInt16>::Cast(multiplier), vminOffset);
1980
                MUInt15 multipliedMaxOffset = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(multiplier, vmaxOffset));
1981

1982
                // codeword = (maxOffset + minOffset + minAlpha + maxAlpha) / 2
1983
                MSInt16 unclampedBaseAlphaTimes2 = ParallelMath::LosslessCast<MSInt16>::Cast(alphaSpanMidpointTimes2) - ParallelMath::LosslessCast<MSInt16>::Cast(multipliedMaxOffset) - multipliedMinOffset;
1984

1985
                MUInt15 baseAlpha;
1986
                if (is11Bit)
1987
                {
1988
                    // In unsigned, 4 is added to the unquantized alpha, so compensating for that cancels the 4 we have to add to do rounding.
1989
                    if (isSigned)
1990
                        unclampedBaseAlphaTimes2 = unclampedBaseAlphaTimes2 + ParallelMath::MakeSInt16(8);
1991

1992
                    // -128 is illegal for some reason
1993
                    MSInt16 minBaseAlphaTimes2 = isSigned ? ParallelMath::MakeSInt16(16) : ParallelMath::MakeSInt16(0);
1994

1995
                    MUInt15 clampedBaseAlphaTimes2 = ParallelMath::Min(ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(unclampedBaseAlphaTimes2, minBaseAlphaTimes2)), ParallelMath::MakeUInt15(4095));
1996
                    baseAlpha = ParallelMath::RightShift(clampedBaseAlphaTimes2, 1) & ParallelMath::MakeUInt15(2040);
1997

1998
                    if (!isSigned)
1999
                        baseAlpha = baseAlpha + ParallelMath::MakeUInt15(4);
2000
                }
2001
                else
2002
                {
2003
                    MUInt15 clampedBaseAlphaTimes2 = ParallelMath::Min(ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(unclampedBaseAlphaTimes2, ParallelMath::MakeSInt16(0))), ParallelMath::MakeUInt15(510));
2004
                    baseAlpha = ParallelMath::RightShift(clampedBaseAlphaTimes2 + ParallelMath::MakeUInt15(1), 1);
2005
                }
2006

2007
                MUInt15 indexes[16];
2008
                MUInt31 totalError = ParallelMath::MakeUInt31(0);
2009
                for (int px = 0; px < 16; px++)
2010
                {
2011
                    MUInt15 quantizedValues;
2012
                    QuantizeETC2Alpha(tableIndex, pixels[px], baseAlpha, multiplier, is11Bit, isSigned, indexes[px], quantizedValues);
2013

2014
                    if (is11Bit)
2015
                    {
2016
                        MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(quantizedValues) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px]);
2017
                        MSInt32 deltaSq = ParallelMath::XMultiply(delta, delta);
2018
                        totalError = totalError + ParallelMath::LosslessCast<MUInt31>::Cast(deltaSq);
2019
                    }
2020
                    else
2021
                        totalError = totalError + ParallelMath::ToUInt31(ParallelMath::SqDiffUInt8(quantizedValues, pixels[px]));
2022
                }
2023

2024
                ParallelMath::Int16CompFlag isBetter = ParallelMath::Int32FlagToInt16(ParallelMath::Less(totalError, bestTotalError));
2025
                if (ParallelMath::AnySet(isBetter))
2026
                {
2027
                    ParallelMath::ConditionalSet(bestTotalError, isBetter, totalError);
2028
                    ParallelMath::ConditionalSet(bestTableIndex, isBetter, ParallelMath::MakeUInt15(tableIndex));
2029
                    ParallelMath::ConditionalSet(bestBaseCodeword, isBetter, baseAlpha);
2030
                    ParallelMath::ConditionalSet(bestMultiplier, isBetter, multiplier);
2031

2032
                    for (int px = 0; px < 16; px++)
2033
                        ParallelMath::ConditionalSet(bestIndexes[px], isBetter, indexes[px]);
2034
                }
2035

2036
                // TODO: Do one refine pass
2037
            }
2038
        }
2039
    }
2040

2041
    if (is11Bit)
2042
    {
2043
        bestMultiplier = ParallelMath::RightShift(bestMultiplier, 3);
2044

2045
        if (isSigned)
2046
            bestBaseCodeword = bestBaseCodeword ^ ParallelMath::MakeUInt15(0x80);
2047
    }
2048

2049
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
2050
    {
2051
        uint8_t *output = outputBuffer + block * 8;
2052

2053
        output[0] = static_cast<uint8_t>(ParallelMath::Extract(bestBaseCodeword, block));
2054

2055
        ParallelMath::ScalarUInt16 multiplier = ParallelMath::Extract(bestMultiplier, block);
2056
        ParallelMath::ScalarUInt16 tableIndex = ParallelMath::Extract(bestTableIndex, block);
2057

2058
        output[1] = static_cast<uint8_t>((multiplier << 4) | tableIndex);
2059

2060
        static const int pixelSelectorOrder[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
2061

2062
        ParallelMath::ScalarUInt16 indexes[16];
2063
        for (int px = 0; px < 16; px++)
2064
            indexes[pixelSelectorOrder[px]] = ParallelMath::Extract(bestIndexes[px], block);
2065

2066
        int outputOffset = 2;
2067
        int outputBits = 0;
2068
        int numOutputBits = 0;
2069
        for (int s = 0; s < 16; s++)
2070
        {
2071
            outputBits = (outputBits << 3) | indexes[s];
2072
            numOutputBits += 3;
2073

2074
            if (numOutputBits >= 8)
2075
            {
2076
                output[outputOffset++] = static_cast<uint8_t>(outputBits >> (numOutputBits - 8));
2077
                numOutputBits -= 8;
2078

2079
                outputBits &= ((1 << numOutputBits) - 1);
2080
            }
2081
        }
2082

2083
        assert(outputOffset == 8 && numOutputBits == 0);
2084
    }
2085
}
2086

2087
void cvtt::Internal::ETCComputer::CompressEACBlock(uint8_t *outputBuffer, const PixelBlockScalarS16 *inputBlocks, bool isSigned, const Options &options)
2088
{
2089
    MUInt15 pixels[16];
2090
    for (int px = 0; px < 16; px++)
2091
    {
2092
        MSInt16 adjustedPixel;
2093
        for (int block = 0; block < ParallelMath::ParallelSize; block++)
2094
            ParallelMath::PutSInt16(adjustedPixel, block, inputBlocks[block].m_pixels[px]);
2095

2096
        // We use a slightly shifted range here so we can keep the unquantized base color in a UInt15
2097
        // That is, signed range is 1..2047, and unsigned range is 0..2047
2098
        if (isSigned)
2099
        {
2100
            adjustedPixel = ParallelMath::Min(adjustedPixel, ParallelMath::MakeSInt16(1023)) + ParallelMath::MakeSInt16(1024);
2101
            adjustedPixel = ParallelMath::Max(ParallelMath::MakeSInt16(1), adjustedPixel);
2102
        }
2103
        else
2104
        {
2105
            adjustedPixel = ParallelMath::Min(adjustedPixel, ParallelMath::MakeSInt16(2047));
2106
            adjustedPixel = ParallelMath::Max(ParallelMath::MakeSInt16(0), adjustedPixel);
2107
        }
2108

2109

2110
        pixels[px] = ParallelMath::LosslessCast<MUInt15>::Cast(adjustedPixel);
2111
    }
2112

2113
    CompressETC2AlphaBlockInternal(outputBuffer, pixels, true, isSigned, options);
2114
}
2115

2116
void cvtt::Internal::ETCComputer::CompressETC1Block(uint8_t *outputBuffer, const PixelBlockU8 *inputBlocks, ETC1CompressionData *compressionData, const Options &options)
2117
{
2118
    DifferentialResolveStorage &drs = static_cast<ETC1CompressionDataInternal*>(compressionData)->m_drs;
2119
    MFloat bestTotalError = ParallelMath::MakeFloat(FLT_MAX);
2120

2121
    MUInt15 pixels[16][3];
2122
    MFloat preWeightedPixels[16][3];
2123
    ExtractBlocks(pixels, preWeightedPixels, inputBlocks, options);
2124

2125
    CompressETC1BlockInternal(bestTotalError, outputBuffer, pixels, preWeightedPixels, drs, options, false);
2126
}
2127

2128
// Transposes the first three components of every input pixel into parallel lanes and derives
// the pre-weighted form used for error measurement: fake-BT.709 when ETC_UseFakeBT709 is set,
// the plain values when Uniform is set, otherwise each channel scaled by its weight in options.
void cvtt::Internal::ETCComputer::ExtractBlocks(MUInt15 pixels[16][3], MFloat preWeightedPixels[16][3], const PixelBlockU8 *inputBlocks, const Options &options)
{
    const bool wantsFakeBT709 = (options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0;
    const bool wantsUniform = (options.flags & cvtt::Flags::Uniform) != 0;

    for (int texel = 0; texel < 16; texel++)
    {
        MUInt15 *rgb = pixels[texel];
        MFloat *weighted = preWeightedPixels[texel];

        for (int lane = 0; lane < ParallelMath::ParallelSize; lane++)
        {
            for (int ch = 0; ch < 3; ch++)
                ParallelMath::PutUInt15(rgb[ch], lane, inputBlocks[lane].m_pixels[texel][ch]);
        }

        // The fake-BT.709 flag takes precedence over the uniform flag.
        if (wantsFakeBT709)
        {
            ConvertToFakeBT709(weighted, rgb);
            continue;
        }

        if (wantsUniform)
        {
            for (int ch = 0; ch < 3; ch++)
                weighted[ch] = ParallelMath::ToFloat(rgb[ch]);
            continue;
        }

        weighted[0] = ParallelMath::ToFloat(rgb[0]) * options.redWeight;
        weighted[1] = ParallelMath::ToFloat(rgb[1]) * options.greenWeight;
        weighted[2] = ParallelMath::ToFloat(rgb[2]) * options.blueWeight;
    }
}
2156

2157
// Quantizes the per-channel cumulative colour of a half block to 5 bits (differential) or
// 4 bits (individual), choosing the rounding direction of each channel jointly.
//
// Each channel is first rounded down.  The eight combinations ("octants") of keeping or
// incrementing each channel are then expanded back to 8 bits, converted to fake-BT.709 and
// compared against the converted source; the combination with the smallest squared distance
// decides which channels of quantized are incremented.
// NOTE(review): sectorCumulative appears to be a sum over 8 pixels (0..2040), which is why
// the reconstructed candidates are scaled by 8 before comparison - confirm against callers.
void cvtt::Internal::ETCComputer::ResolveHalfBlockFakeBT709RoundingAccurate(MUInt15 quantized[3], const MUInt15 sectorCumulative[3], bool isDifferential)
{
    // Round-down quantization of every channel.
    for (int ch = 0; ch < 3; ch++)
    {
        const MUInt15& cu15 = sectorCumulative[ch];

        if (isDifferential)
        {
            //quantized[ch] = (cu * 31 + (cu >> 3)) >> 11;
            quantized[ch] = ParallelMath::ToUInt15(
                ParallelMath::RightShift(
                (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3))
                    , 11)
            );
        }
        else
        {
            //quantized[ch] = (cu * 30 + (cu >> 3)) >> 12;
            quantized[ch] = ParallelMath::ToUInt15(
                ParallelMath::RightShift(
                (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15 << 1) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3))
                    , 12)
            );
        }
    }

    MFloat lowOctantRGBFloat[3];
    MFloat highOctantRGBFloat[3];

    // Reconstruct the rounded-down value and its successor (clamped to the top of the range),
    // both on the same scale as sectorCumulative.
    for (int ch = 0; ch < 3; ch++)
    {
        MUInt15 unquantized;
        MUInt15 unquantizedNext;
        if (isDifferential)
        {
            unquantized = (quantized[ch] << 3) | ParallelMath::RightShift(quantized[ch], 2);
            MUInt15 quantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(31), quantized[ch] + ParallelMath::MakeUInt15(1));
            unquantizedNext = (quantizedNext << 3) | ParallelMath::RightShift(quantizedNext, 2);
        }
        else
        {
            unquantized = (quantized[ch] << 4) | quantized[ch];
            unquantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantized + ParallelMath::MakeUInt15(17));
        }
        lowOctantRGBFloat[ch] = ParallelMath::ToFloat(unquantized << 3);
        highOctantRGBFloat[ch] = ParallelMath::ToFloat(unquantizedNext << 3);
    }

    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
    MUInt15 bestOctant = ParallelMath::MakeUInt15(0);

    MFloat cumulativeYUV[3];
    ConvertToFakeBT709(cumulativeYUV, sectorCumulative);

    // Bit ch of octant selects the incremented candidate for channel ch.
    for (uint16_t octant = 0; octant < 8; octant++)
    {
        const MFloat &r = (octant & 1) ? highOctantRGBFloat[0] : lowOctantRGBFloat[0];
        const MFloat &g = (octant & 2) ? highOctantRGBFloat[1] : lowOctantRGBFloat[1];
        const MFloat &b = (octant & 4) ? highOctantRGBFloat[2] : lowOctantRGBFloat[2];

        MFloat octantYUV[3];
        ConvertToFakeBT709(octantYUV, r, g, b);

        MFloat delta[3];
        for (int ch = 0; ch < 3; ch++)
            delta[ch] = octantYUV[ch] - cumulativeYUV[ch];

        // Squared distance over all three components.
        MFloat error = delta[0] * delta[0] + delta[1] * delta[1] + delta[2] * delta[2];
        ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));
        ParallelMath::ConditionalSet(bestOctant, errorBetter, ParallelMath::MakeUInt15(octant));
        bestError = ParallelMath::Min(error, bestError);
    }

    // Apply the winning increments.  A channel already at its maximum has identical low and
    // high candidates, so its incremented octants only tie and never replace an earlier one.
    for (int ch = 0; ch < 3; ch++)
        quantized[ch] = quantized[ch] + (ParallelMath::RightShift(bestOctant, ch) & ParallelMath::MakeUInt15(1));
}
2233

2234
// Table-driven counterpart of ResolveHalfBlockFakeBT709RoundingAccurate.
//
// Each channel of sectorCumulative is split into an integer part (the rounded-down 5-bit or
// 4-bit quantized value) and a fraction.  The top four fraction bits of R, G and B are
// concatenated into a 12-bit index (R in bits 8..11, G in bits 4..7, B in bits 0..3) into
// Tables::FakeBT709::g_rounding16, whose low three bits say which channels to round up.
// The result is clamped to the largest representable quantized value.
void cvtt::Internal::ETCComputer::ResolveHalfBlockFakeBT709RoundingFast(MUInt15 quantized[3], const MUInt15 sectorCumulative[3], bool isDifferential)
{
    // sectorCumulative range is 0..2040 (11 bits)
    MUInt15 rOffset;
    MUInt15 gOffset;
    MUInt15 bOffset;
    MUInt15 quantizedBase[3];
    MUInt15 upperBound;

    // Adding x >> 8 stretches 0..2040 onto the full 0..2047 range.
    MUInt15 sectorCumulativeFillIn[3];
    for (int ch = 0; ch < 3; ch++)
        sectorCumulativeFillIn[ch] = sectorCumulative[ch] + ParallelMath::RightShift(sectorCumulative[ch], 8);

    if (isDifferential)
    {
        // Integer part is bits 6..10, so the top four fraction bits are bits 2..5 of every channel.
        rOffset = (sectorCumulativeFillIn[0] << 6) & ParallelMath::MakeUInt15(0xf00);
        gOffset = (sectorCumulativeFillIn[1] << 2) & ParallelMath::MakeUInt15(0x0f0);
        bOffset = ParallelMath::RightShift(sectorCumulativeFillIn[2], 2) & ParallelMath::MakeUInt15(0x00f);

        for (int ch = 0; ch < 3; ch++)
            quantizedBase[ch] = ParallelMath::RightShift(sectorCumulativeFillIn[ch], 6);

        upperBound = ParallelMath::MakeUInt15(31);
    }
    else
    {
        // Integer part is bits 7..10, so the top four fraction bits are bits 3..6 of every channel.
        rOffset = (sectorCumulativeFillIn[0] << 5) & ParallelMath::MakeUInt15(0xf00);
        gOffset = (sectorCumulativeFillIn[1] << 1) & ParallelMath::MakeUInt15(0x0f0);
        bOffset = ParallelMath::RightShift(sectorCumulativeFillIn[2], 3) & ParallelMath::MakeUInt15(0x00f);

        for (int ch = 0; ch < 3; ch++)
            quantizedBase[ch] = ParallelMath::RightShift(sectorCumulativeFillIn[ch], 7);

        upperBound = ParallelMath::MakeUInt15(15);
    }

    MUInt15 lookupIndex = (rOffset | gOffset | bOffset);

    // The table lookup is done lane by lane.
    MUInt15 octant;
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
        ParallelMath::PutUInt15(octant, block, Tables::FakeBT709::g_rounding16[ParallelMath::Extract(lookupIndex, block)]);

    // Bit 0 rounds R up, bit 1 rounds G up, bit 2 rounds B up.
    quantizedBase[0] = quantizedBase[0] + (octant & ParallelMath::MakeUInt15(1));
    quantizedBase[1] = quantizedBase[1] + (ParallelMath::RightShift(octant, 1) & ParallelMath::MakeUInt15(1));
    quantizedBase[2] = quantizedBase[2] + (ParallelMath::RightShift(octant, 2) & ParallelMath::MakeUInt15(1));

    for (int ch = 0; ch < 3; ch++)
        quantized[ch] = ParallelMath::Min(quantizedBase[ch], upperBound);
}
2285

2286
// Chooses the rounding direction of a 4-bit-per-channel T/H mode colour under the
// fake-BT.709 metric.
//
// On entry quantized holds the rounded-down 4-bit value of each channel.  The eight
// combinations of keeping or incrementing each channel are expanded to 8 bits, scaled by
// granularity * 2, converted to fake-BT.709 and compared against the converted targets; the
// combination with the smallest squared distance decides which channels are incremented.
// NOTE(review): targets is presumably on a granularity * 2 scale as well - confirm against callers.
void cvtt::Internal::ETCComputer::ResolveTHFakeBT709Rounding(MUInt15 quantized[3], const MUInt15 targets[3], const MUInt15 &granularity)
{
    MFloat lowOctantRGBFloat[3];
    MFloat highOctantRGBFloat[3];

    // Reconstruct the rounded-down value and its successor (clamped to 255).
    for (int ch = 0; ch < 3; ch++)
    {
        MUInt15 unquantized = (quantized[ch] << 4) | quantized[ch];
        MUInt15 unquantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantized + ParallelMath::MakeUInt15(17));

        lowOctantRGBFloat[ch] = ParallelMath::ToFloat(ParallelMath::CompactMultiply(unquantized, granularity) << 1);
        highOctantRGBFloat[ch] = ParallelMath::ToFloat(ParallelMath::CompactMultiply(unquantizedNext, granularity) << 1);
    }

    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
    MUInt15 bestOctant = ParallelMath::MakeUInt15(0);

    MFloat cumulativeYUV[3];
    ConvertToFakeBT709(cumulativeYUV, ParallelMath::ToFloat(targets[0]), ParallelMath::ToFloat(targets[1]), ParallelMath::ToFloat(targets[2]));

    // Bit ch of octant selects the incremented candidate for channel ch.
    for (uint16_t octant = 0; octant < 8; octant++)
    {
        const MFloat &r = (octant & 1) ? highOctantRGBFloat[0] : lowOctantRGBFloat[0];
        const MFloat &g = (octant & 2) ? highOctantRGBFloat[1] : lowOctantRGBFloat[1];
        const MFloat &b = (octant & 4) ? highOctantRGBFloat[2] : lowOctantRGBFloat[2];

        MFloat octantYUV[3];
        ConvertToFakeBT709(octantYUV, r, g, b);

        MFloat delta[3];
        for (int ch = 0; ch < 3; ch++)
            delta[ch] = octantYUV[ch] - cumulativeYUV[ch];

        // Squared distance over all three components.
        MFloat error = delta[0] * delta[0] + delta[1] * delta[1] + delta[2] * delta[2];
        ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));
        ParallelMath::ConditionalSet(bestOctant, errorBetter, ParallelMath::MakeUInt15(octant));
        bestError = ParallelMath::Min(error, bestError);
    }

    // Apply the winning increments.  A channel already at 15 has identical low and high
    // candidates, so its incremented octants only tie and never replace an earlier one.
    for (int ch = 0; ch < 3; ch++)
        quantized[ch] = quantized[ch] + (ParallelMath::RightShift(bestOctant, ch) & ParallelMath::MakeUInt15(1));
}
2328

2329
// Integer-input overload: converts each channel to float and forwards to the
// per-channel conversion.
void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[3], const MUInt15 color[3])
{
    ConvertToFakeBT709(yuv, ParallelMath::ToFloat(color[0]), ParallelMath::ToFloat(color[1]), ParallelMath::ToFloat(color[2]));
}
2337

2338
// Array-input overload: unpacks the three channels and forwards to the per-channel conversion.
void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[3], const MFloat color[3])
{
    const MFloat &red = color[0];
    const MFloat &green = color[1];
    const MFloat &blue = color[2];

    ConvertToFakeBT709(yuv, red, green, blue);
}
2342

2343
// Applies the fixed 3x3 fake-BT.709 matrix to one RGB triple and stores the three resulting
// components in yuv.
void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[3], const MFloat &pr, const MFloat &pg, const MFloat &pb)
{
    // Work on copies so the result stays correct even if an input refers to an element of yuv.
    MFloat red = pr;
    MFloat green = pg;
    MFloat blue = pb;

    MFloat component0 = red * 0.368233989135369f + green * 1.23876274963149f + blue * 0.125054068802017f;
    MFloat component1 = red * 0.5f - green * 0.4541529f - blue * 0.04584709f;
    MFloat component2 = red * -0.081014709086133f - green * 0.272538676238785f + blue * 0.353553390593274f;

    yuv[0] = component0;
    yuv[1] = component1;
    yuv[2] = component2;
}
2353

2354
// Maps a fake-BT.709 triple back to RGB: the first component is rescaled, then the two
// remaining components are mixed back into the red, green and blue channels.
void cvtt::Internal::ETCComputer::ConvertFromFakeBT709(MFloat rgb[3], const MFloat yuv[3])
{
    // Read every input before writing any output.
    MFloat scaledLuma = yuv[0] * 0.57735026466774571071f;
    MFloat chromaU = yuv[1];
    MFloat chromaV = yuv[2];

    MFloat red = scaledLuma + chromaU * 1.5748000207960953486f;
    MFloat green = scaledLuma - chromaU * 0.46812425854364753669f - chromaV * 0.26491652528157560861f;
    MFloat blue = scaledLuma + chromaV * 2.6242146882856944069f;

    rgb[0] = red;
    rgb[1] = green;
    rgb[2] = blue;
}
2364

2365

2366
void cvtt::Internal::ETCComputer::QuantizeETC2Alpha(int tableIndex, const MUInt15& value, const MUInt15& baseValue, const MUInt15& multiplier, bool is11Bit, bool isSigned, MUInt15& outIndexes, MUInt15& outQuantizedValues)
2367
{
2368
    MSInt16 offset = ParallelMath::LosslessCast<MSInt16>::Cast(value) - ParallelMath::LosslessCast<MSInt16>::Cast(baseValue);
2369
    MSInt16 offsetTimes2 = offset + offset;
2370

2371
    // ETC2's offset tables all have a reflect about 0.5*multiplier
2372
    MSInt16 offsetAboutReflectorTimes2 = offsetTimes2 + ParallelMath::LosslessCast<MSInt16>::Cast(multiplier);
2373

2374
    MUInt15 absOffsetAboutReflectorTimes2 = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Abs(offsetAboutReflectorTimes2));
2375
    MUInt15 lookupIndex = ParallelMath::RightShift(absOffsetAboutReflectorTimes2, 1);
2376

2377
    MUInt15 positiveIndex;
2378
    MUInt15 positiveOffsetUnmultiplied;
2379
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
2380
    {
2381
        uint16_t blockLookupIndex = ParallelMath::Extract(lookupIndex, block) / ParallelMath::Extract(multiplier, block);
2382
        if (blockLookupIndex >= Tables::ETC2::g_alphaRoundingTableWidth)
2383
            blockLookupIndex = Tables::ETC2::g_alphaRoundingTableWidth - 1;
2384
        uint16_t index = Tables::ETC2::g_alphaRoundingTables[tableIndex][blockLookupIndex];
2385
        ParallelMath::PutUInt15(positiveIndex, block, index);
2386
        ParallelMath::PutUInt15(positiveOffsetUnmultiplied, block, Tables::ETC2::g_alphaModifierTablePositive[tableIndex][index]);
2387

2388
        // TODO: This is suboptimal when the offset is capped.  We should detect 0 and 255 values and always map them to the maximum offsets.
2389
        // Doing that will also affect refinement though.
2390
    }
2391

2392
    MSInt16 signBits = ParallelMath::RightShift(offsetAboutReflectorTimes2, 15);
2393
    MSInt16 offsetUnmultiplied = ParallelMath::LosslessCast<MSInt16>::Cast(positiveOffsetUnmultiplied) ^ signBits;
2394
    MSInt16 quantizedOffset = ParallelMath::CompactMultiply(offsetUnmultiplied, multiplier);
2395

2396
    MSInt16 offsetValue = ParallelMath::LosslessCast<MSInt16>::Cast(baseValue) + quantizedOffset;
2397

2398
    if (is11Bit)
2399
    {
2400
        if (isSigned)
2401
            outQuantizedValues = ParallelMath::Min(ParallelMath::MakeUInt15(2047), ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(1), offsetValue)));
2402
        else
2403
            outQuantizedValues = ParallelMath::Min(ParallelMath::MakeUInt15(2047), ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), offsetValue)));
2404
    }
2405
    else
2406
        outQuantizedValues = ParallelMath::Min(ParallelMath::MakeUInt15(255), ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), offsetValue)));
2407

2408
    MUInt15 indexSub = ParallelMath::LosslessCast<MUInt15>::Cast(signBits) & ParallelMath::MakeUInt15(4);
2409

2410
    outIndexes = positiveIndex + ParallelMath::MakeUInt15(4) - indexSub;
2411
}
2412

2413

2414
// Writes one 8-byte ETC2 T-mode block, big-endian, to outputBuffer.
//
// isolatedColor and lineColor are 4-bit-per-channel colours, table is the 3-bit distance
// index, packedSelectors holds one 2-bit selector per pixel (pixel px in bits 2*px..2*px+1)
// and opaque sets bit 33 of the block.  T mode is signalled by making the red field
// (bits 59..63) plus its 3-bit signed delta (bits 56..58) leave the 0..31 range, so the
// padding bits around the split red value are chosen to force that overflow.
// All packing is done in unsigned arithmetic so no shift reaches the sign bit of int.
void cvtt::Internal::ETCComputer::EmitTModeBlock(uint8_t *outputBuffer, const ParallelMath::ScalarUInt16 lineColor[3], const ParallelMath::ScalarUInt16 isolatedColor[3], int32_t packedSelectors, ParallelMath::ScalarUInt16 table, bool opaque)
{
    // Maps the block's output bit position to the caller's pixel index (transposed order).
    static const int selectorOrder[] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };

    uint32_t lowBits = 0;
    uint32_t highBits = 0;

    // Red of the isolated colour is stored as two 2-bit halves.
    const uint32_t rh = ((isolatedColor[0] >> 2) & 3);
    const uint32_t rl = (isolatedColor[0] & 3);

    if (rh + rl < 4)
    {
        // Overflow low
        highBits |= 1u << (58 - 32);
    }
    else
    {
        // Overflow high
        highBits |= 7u << (61 - 32);
    }

    highBits |= rh << (59 - 32);
    highBits |= rl << (56 - 32);
    highBits |= static_cast<uint32_t>(isolatedColor[1]) << (52 - 32);
    highBits |= static_cast<uint32_t>(isolatedColor[2]) << (48 - 32);
    highBits |= static_cast<uint32_t>(lineColor[0]) << (44 - 32);
    highBits |= static_cast<uint32_t>(lineColor[1]) << (40 - 32);
    highBits |= static_cast<uint32_t>(lineColor[2]) << (36 - 32);
    highBits |= static_cast<uint32_t>((table >> 1) & 3) << (34 - 32);
    if (opaque)
        highBits |= 1u << (33 - 32);
    highBits |= static_cast<uint32_t>(table & 1) << (32 - 32);

    // Selector LSBs go to bits 0..15, MSBs to bits 16..31.
    const uint32_t selectorBits = static_cast<uint32_t>(packedSelectors);
    for (int px = 0; px < 16; px++)
    {
        const uint32_t sel = (selectorBits >> (2 * selectorOrder[px])) & 3;
        if ((sel & 0x1) != 0)
            lowBits |= (1u << px);
        if ((sel & 0x2) != 0)
            lowBits |= (1u << (16 + px));
    }

    for (int i = 0; i < 4; i++)
        outputBuffer[i] = static_cast<uint8_t>((highBits >> (24 - i * 8)) & 0xff);
    for (int i = 0; i < 4; i++)
        outputBuffer[i + 4] = static_cast<uint8_t>((lowBits >> (24 - i * 8)) & 0xff);
}
2461

2462
// Writes one 8-byte ETC2 H-mode block, big-endian, to outputBuffer.
//
// blockColors holds the two base colours packed as 4-bit channels on a 5-bit stride
// (R at bit 10, G at bit 5, B at bit 0).  sectorBits selects the base colour per pixel,
// signBits the direction of the modifier, table is the 3-bit distance index and opaque sets
// bit 33 of the block.  The lowest table bit is not stored; it is implied by the order of the
// two base colours, so they are swapped (and sectorBits inverted) when needed.
// All packing is done in unsigned arithmetic so no shift reaches the sign bit of int.
void cvtt::Internal::ETCComputer::EmitHModeBlock(uint8_t *outputBuffer, const ParallelMath::ScalarUInt16 blockColors[2], ParallelMath::ScalarUInt16 sectorBits, ParallelMath::ScalarUInt16 signBits, ParallelMath::ScalarUInt16 table, bool opaque)
{
    if (blockColors[0] == blockColors[1])
    {
        // Base colors are the same.
        // If the table low bit isn't 1, then we can't encode this, because swapping the block colors will have no effect
        // on their order.
        // Instead, we encode this as T mode where all of the indexes are on the line.

        ParallelMath::ScalarUInt16 lineColor[3];
        ParallelMath::ScalarUInt16 isolatedColor[3];

        lineColor[0] = isolatedColor[0] = (blockColors[0] >> 10) & 0x1f;
        lineColor[1] = isolatedColor[1] = (blockColors[0] >> 5) & 0x1f;
        lineColor[2] = isolatedColor[2] = (blockColors[0] >> 0) & 0x1f;

        // Every selector has its low bit set; the sign bit becomes the selector's high bit.
        uint32_t packedSelectors = 0x55555555u;
        for (int px = 0; px < 16; px++)
            packedSelectors |= static_cast<uint32_t>((signBits >> px) & 1) << ((px * 2) + 1);

        EmitTModeBlock(outputBuffer, lineColor, isolatedColor, static_cast<int32_t>(packedSelectors), table, opaque);
        return;
    }

    // Maps the block's output bit position to the caller's pixel index (transposed order).
    static const int selectorOrder[] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };

    int16_t colors[2][3];
    for (int sector = 0; sector < 2; sector++)
    {
        for (int ch = 0; ch < 3; ch++)
            colors[sector][ch] = (blockColors[sector] >> ((2 - ch) * 5)) & 15;
    }

    uint32_t lowBits = 0;
    uint32_t highBits = 0;

    // The colour order must agree with the low bit of the table index.
    if (((table & 1) == 1) != (blockColors[0] > blockColors[1]))
    {
        for (int ch = 0; ch < 3; ch++)
            std::swap(colors[0][ch], colors[1][ch]);
        sectorBits ^= 0xffff;
    }

    // The first colour is split around the padding bits of the block layout.
    const uint32_t r1 = static_cast<uint32_t>(colors[0][0]);
    const uint32_t g1a = static_cast<uint32_t>(colors[0][1] >> 1);
    const uint32_t g1b = static_cast<uint32_t>(colors[0][1] & 1);
    const uint32_t b1a = static_cast<uint32_t>(colors[0][2] >> 3);
    const uint32_t b1b = static_cast<uint32_t>(colors[0][2] & 7);
    const uint32_t r2 = static_cast<uint32_t>(colors[1][0]);
    const uint32_t g2 = static_cast<uint32_t>(colors[1][1]);
    const uint32_t b2 = static_cast<uint32_t>(colors[1][2]);

    // Avoid overflowing R
    if ((g1a & 4) != 0 && r1 + g1a < 8)
        highBits |= 1u << (63 - 32);

    // The bits that land in the green field and its delta must overflow to signal H mode.
    const uint32_t fakeDG = b1b >> 1;
    const uint32_t fakeG = b1a | (g1b << 1);

    if (fakeG + fakeDG < 4)
    {
        // Overflow low
        highBits |= 1u << (50 - 32);
    }
    else
    {
        // Overflow high
        highBits |= 7u << (53 - 32);
    }

    const uint32_t da = (table >> 2) & 1;
    const uint32_t db = (table >> 1) & 1;

    highBits |= r1 << (59 - 32);
    highBits |= g1a << (56 - 32);
    highBits |= g1b << (52 - 32);
    highBits |= b1a << (51 - 32);
    highBits |= b1b << (47 - 32);
    highBits |= r2 << (43 - 32);
    highBits |= g2 << (39 - 32);
    highBits |= b2 << (35 - 32);
    highBits |= da << (34 - 32);
    if (opaque)
        highBits |= 1u << (33 - 32);
    highBits |= db << (32 - 32);

    // Sign bits go to bits 0..15, sector bits to bits 16..31.
    for (int px = 0; px < 16; px++)
    {
        const uint32_t sectorBit = (sectorBits >> selectorOrder[px]) & 1;
        const uint32_t signBit = (signBits >> selectorOrder[px]) & 1;

        lowBits |= (signBit << px);
        lowBits |= (sectorBit << (16 + px));
    }

    for (int i = 0; i < 4; i++)
        outputBuffer[i] = static_cast<uint8_t>((highBits >> (24 - i * 8)) & 0xff);
    for (int i = 0; i < 4; i++)
        outputBuffer[i + 4] = static_cast<uint8_t>((lowBits >> (24 - i * 8)) & 0xff);
}
2564

2565
// Packs one block into the 8-byte ETC1 layout and writes it to outputBuffer.
//
// outputBuffer       - receives 8 bytes: the high 32-bit word followed by the low
//                      32-bit word, each stored most-significant byte first.
// blockBestFlip      - stored in bit 0 of the high word; also selects the g_flipTables
//                      layout used to map sector-local pixels to block pixel indexes.
// blockBestD         - 0 packs each channel as two 4-bit fields (sector 0, sector 1);
//                      otherwise each channel is a 5-bit field for sector 0 followed by
//                      a 3-bit (sector 1 - sector 0) delta.
// blockBestColors    - base colors indexed [sector][channel].
// blockBestTables    - per-sector table indexes, stored at bits 5 and 2 of the high word.
// blockBestSelectors - per-sector selectors, 2 bits per pixel for 8 pixels.
// transparent        - when true, blockBestD is not written to bit 1 of the high word.
void cvtt::Internal::ETCComputer::EmitETC1Block(uint8_t *outputBuffer, int blockBestFlip, int blockBestD, const int blockBestColors[2][3], const int blockBestTables[2], const ParallelMath::ScalarUInt16 blockBestSelectors[2], bool transparent)
{
    uint32_t highBits = 0;
    uint32_t lowBits = 0;

    // All shifts are performed on uint32_t: several fields reach bit 31, which a
    // signed int shift must not touch.
    if (blockBestD == 0)
    {
        // Channel ch occupies the byte starting at bit (24 - 8 * ch): sector 0 in the
        // upper nibble, sector 1 in the lower nibble.
        for (int ch = 0; ch < 3; ch++)
        {
            highBits |= static_cast<uint32_t>(blockBestColors[0][ch]) << (28 - ch * 8);
            highBits |= static_cast<uint32_t>(blockBestColors[1][ch]) << (24 - ch * 8);
        }
    }
    else
    {
        // Same byte per channel: 5-bit sector 0 value, then the delta truncated to 3 bits.
        for (int ch = 0; ch < 3; ch++)
        {
            const int delta = (blockBestColors[1][ch] - blockBestColors[0][ch]) & 7;
            highBits |= static_cast<uint32_t>(blockBestColors[0][ch]) << (27 - ch * 8);
            highBits |= static_cast<uint32_t>(delta) << (24 - ch * 8);
        }
    }

    highBits |= static_cast<uint32_t>(blockBestTables[0]) << 5;
    highBits |= static_cast<uint32_t>(blockBestTables[1]) << 2;
    if (!transparent)
        highBits |= static_cast<uint32_t>(blockBestD) << 1;
    highBits |= static_cast<uint32_t>(blockBestFlip);

    // Translation from the internal selector value (0..3) to the 2-bit code stored in the block.
    const uint8_t modifierCodes[4] = { 3, 2, 0, 1 };

    // Scatter the per-sector selectors back to block pixel positions.
    uint8_t unpackedSelectorCodes[16];
    for (int sector = 0; sector < 2; sector++)
    {
        const int blockSectorBestSelectors = blockBestSelectors[sector];

        for (int px = 0; px < 8; px++)
        {
            const int selector = (blockSectorBestSelectors >> (2 * px)) & 3;
            unpackedSelectorCodes[g_flipTables[blockBestFlip][sector][px]] = modifierCodes[selector];
        }
    }

    // Output bit position -> block pixel index.
    const int pixelSelectorOrder[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };

    // Bit 0 of every code goes to low-word bits 0..15, bit 1 to bits 16..31.
    for (int sb = 0; sb < 2; sb++)
    {
        for (int px = 0; px < 16; px++)
        {
            const uint32_t codeBit = (static_cast<uint32_t>(unpackedSelectorCodes[pixelSelectorOrder[px]]) >> sb) & 1u;
            lowBits |= codeBit << (px + sb * 16);
        }
    }

    for (int i = 0; i < 4; i++)
        outputBuffer[i] = static_cast<uint8_t>((highBits >> (24 - i * 8)) & 0xff);
    for (int i = 0; i < 4; i++)
        outputBuffer[i + 4] = static_cast<uint8_t>((lowBits >> (24 - i * 8)) & 0xff);
}
2623

2624
// Searches ETC1 encodings for ParallelMath::ParallelSize blocks at once and emits a block
// for every lane where an encoding found here beats the incoming bestTotalError.
//
// bestTotalError    - in/out: per-lane error to beat; lowered when a better encoding is found.
// outputBuffer      - 8 bytes per lane (lane * 8 offset); only written for improved lanes.
// pixels            - per-pixel channel values, gathered into sectors through g_flipTables.
// preWeightedPixels - per-pixel float values passed through to TestHalfBlock.
// drs               - scratch storage for the d == 1 attempts consumed by
//                     FindBestDifferentialCombination.
// options           - flags select the FakeBT709 rounding paths; also forwarded to TestHalfBlock.
// punchthrough      - when true the search starts at d == 1, so the d == 0 pass is skipped.
void cvtt::Internal::ETCComputer::CompressETC1BlockInternal(MFloat &bestTotalError, uint8_t *outputBuffer, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], DifferentialResolveStorage &drs, const Options &options, bool punchthrough)
{
    MUInt15 zeroU15 = ParallelMath::MakeUInt15(0);
    MUInt16 zeroU16 = ParallelMath::MakeUInt16(0);

    MUInt15 bestColors[2] = { zeroU15, zeroU15 };
    MUInt16 bestSelectors[2] = { zeroU16, zeroU16 };
    MUInt15 bestTables[2] = { zeroU15, zeroU15 };
    MUInt15 bestFlip = zeroU15;
    MUInt15 bestD = zeroU15;

    MUInt15 sectorPixels[2][2][8][3];
    MFloat sectorPreWeightedPixels[2][2][8][3];
    MUInt15 sectorCumulative[2][2][3];

    // Lanes for which this function produced the best encoding so far.
    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);

    // Gather the pixels of each sector for both flip layouts and accumulate per-channel sums.
    for (int flip = 0; flip < 2; flip++)
    {
        for (int sector = 0; sector < 2; sector++)
        {
            for (int ch = 0; ch < 3; ch++)
                sectorCumulative[flip][sector][ch] = zeroU15;

            for (int px = 0; px < 8; px++)
            {
                for (int ch = 0; ch < 3; ch++)
                {
                    MUInt15 pixelChannelValue = pixels[g_flipTables[flip][sector][px]][ch];
                    sectorPixels[flip][sector][px][ch] = pixelChannelValue;
                    sectorPreWeightedPixels[flip][sector][px][ch] = preWeightedPixels[g_flipTables[flip][sector][px]][ch];
                    sectorCumulative[flip][sector][ch] = sectorCumulative[flip][sector][ch] + pixelChannelValue;
                }
            }
        }
    }

    static const MSInt16 modifierTables[8][4] =
    {
        { ParallelMath::MakeSInt16(-8), ParallelMath::MakeSInt16(-2), ParallelMath::MakeSInt16(2), ParallelMath::MakeSInt16(8) },
        { ParallelMath::MakeSInt16(-17), ParallelMath::MakeSInt16(-5), ParallelMath::MakeSInt16(5), ParallelMath::MakeSInt16(17) },
        { ParallelMath::MakeSInt16(-29), ParallelMath::MakeSInt16(-9), ParallelMath::MakeSInt16(9), ParallelMath::MakeSInt16(29) },
        { ParallelMath::MakeSInt16(-42), ParallelMath::MakeSInt16(-13), ParallelMath::MakeSInt16(13), ParallelMath::MakeSInt16(42) },
        { ParallelMath::MakeSInt16(-60), ParallelMath::MakeSInt16(-18), ParallelMath::MakeSInt16(18), ParallelMath::MakeSInt16(60) },
        { ParallelMath::MakeSInt16(-80), ParallelMath::MakeSInt16(-24), ParallelMath::MakeSInt16(24), ParallelMath::MakeSInt16(80) },
        { ParallelMath::MakeSInt16(-106), ParallelMath::MakeSInt16(-33), ParallelMath::MakeSInt16(33), ParallelMath::MakeSInt16(106) },
        { ParallelMath::MakeSInt16(-183), ParallelMath::MakeSInt16(-47), ParallelMath::MakeSInt16(47), ParallelMath::MakeSInt16(183) },
    };

    const bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
    const bool useAccurateFakeBT709 = ((options.flags & cvtt::Flags::ETC_FakeBT709Accurate) != 0);

    const int minD = punchthrough ? 1 : 0;

    for (int flip = 0; flip < 2; flip++)
    {
        drs.diffNumAttempts[0] = drs.diffNumAttempts[1] = zeroU15;

        // Best per-sector results of the d == 0 pass, where the sectors are chosen independently.
        MFloat bestIndError[2] = { ParallelMath::MakeFloat(FLT_MAX), ParallelMath::MakeFloat(FLT_MAX) };
        MUInt16 bestIndSelectors[2] = { ParallelMath::MakeUInt16(0), ParallelMath::MakeUInt16(0) };
        MUInt15 bestIndColors[2] = { zeroU15, zeroU15 };
        MUInt15 bestIndTable[2] = { zeroU15, zeroU15 };

        for (int d = minD; d < 2; d++)
        {
            for (int sector = 0; sector < 2; sector++)
            {
                // The offset table is a packed stream: for each table, a count followed by that many offsets.
                const int16_t *potentialOffsets = cvtt::Tables::ETC1::g_potentialOffsets4;

                for (int table = 0; table < 8; table++)
                {
                    int16_t numOffsets = *potentialOffsets++;

                    MUInt15 possibleColors[cvtt::Tables::ETC1::g_maxPotentialOffsets];

                    MUInt15 quantized[3];
                    for (int oi = 0; oi < numOffsets; oi++)
                    {
                        // Offset sector sums, clamped per channel.
                        MUInt15 offsetCumulative[3];
                        for (int ch = 0; ch < 3; ch++)
                        {
                            // cu is in range 0..2040
                            offsetCumulative[ch] = ParallelMath::Min(
                                ParallelMath::MakeUInt15(2040),
                                ParallelMath::ToUInt15(
                                    ParallelMath::Max(
                                        ParallelMath::MakeSInt16(0),
                                        ParallelMath::LosslessCast<MSInt16>::Cast(sectorCumulative[flip][sector][ch]) + ParallelMath::MakeSInt16(potentialOffsets[oi])
                                    )
                                )
                            );
                        }

                        if (!isFakeBT709)
                        {
                            for (int ch = 0; ch < 3; ch++)
                            {
                                MUInt15 cu15 = offsetCumulative[ch];

                                if (d == 1)
                                {
                                    //quantized[ch] = (cu * 31 + (cu >> 3) + 1024) >> 11;
                                    quantized[ch] = ParallelMath::ToUInt15(
                                        ParallelMath::RightShift(
                                            (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3)) + ParallelMath::MakeUInt16(1024)
                                            , 11)
                                        );
                                }
                                else
                                {
                                    //quantized[ch] = (cu * 30 + (cu >> 3) + 2048) >> 12;
                                    quantized[ch] = ParallelMath::ToUInt15(
                                        ParallelMath::RightShift(
                                            (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15 << 1) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3)) + ParallelMath::MakeUInt16(2048)
                                            , 12)
                                        );
                                }
                            }
                        }
                        else
                        {
                            if (useAccurateFakeBT709)
                                ResolveHalfBlockFakeBT709RoundingAccurate(quantized, offsetCumulative, d == 1);
                            else
                                ResolveHalfBlockFakeBT709RoundingFast(quantized, offsetCumulative, d == 1);
                        }

                        // Pack the three channels into one value, 5 bits apart.
                        possibleColors[oi] = quantized[0] | (quantized[1] << 5) | (quantized[2] << 10);
                    }

                    potentialOffsets += numOffsets;

                    // Per lane, collapse runs of equal adjacent candidates in place.
                    ParallelMath::UInt15 numUniqueColors;
                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
                    {
                        uint16_t blockNumUniqueColors = 1;
                        for (int i = 1; i < numOffsets; i++)
                        {
                            uint16_t color = ParallelMath::Extract(possibleColors[i], block);
                            if (color != ParallelMath::Extract(possibleColors[blockNumUniqueColors - 1], block))
                                ParallelMath::PutUInt15(possibleColors[blockNumUniqueColors++], block, color);
                        }

                        ParallelMath::PutUInt15(numUniqueColors, block, blockNumUniqueColors);
                    }

                    int maxUniqueColors = ParallelMath::Extract(numUniqueColors, 0);
                    for (int block = 1; block < ParallelMath::ParallelSize; block++)
                        maxUniqueColors = std::max<int>(maxUniqueColors, ParallelMath::Extract(numUniqueColors, block));

                    // Lanes with fewer unique candidates are padded with their first candidate so
                    // every lane has maxUniqueColors entries to evaluate.
                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
                    {
                        uint16_t fillColor = ParallelMath::Extract(possibleColors[0], block);
                        for (int i = ParallelMath::Extract(numUniqueColors, block); i < maxUniqueColors; i++)
                            ParallelMath::PutUInt15(possibleColors[i], block, fillColor);
                    }

                    for (int i = 0; i < maxUniqueColors; i++)
                    {
                        MFloat error = ParallelMath::MakeFloatZero();
                        MUInt16 selectors = ParallelMath::MakeUInt16(0);
                        MUInt15 candidateColor = possibleColors[i];
                        TestHalfBlock(error, selectors, candidateColor, sectorPixels[flip][sector], sectorPreWeightedPixels[flip][sector], modifierTables[table], d == 1, options);

                        if (d == 0)
                        {
                            // Keep the lowest-error candidate of this sector per lane.
                            ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestIndError[sector]));
                            if (ParallelMath::AnySet(errorBetter))
                            {
                                bestIndError[sector] = ParallelMath::Min(error, bestIndError[sector]);
                                ParallelMath::ConditionalSet(bestIndSelectors[sector], errorBetter, selectors);
                                ParallelMath::ConditionalSet(bestIndColors[sector], errorBetter, candidateColor);
                                ParallelMath::ConditionalSet(bestIndTable[sector], errorBetter, ParallelMath::MakeUInt15(table));
                            }
                        }
                        else
                        {
                            // Record the attempt for the later combination step. Lanes for which
                            // index i is only padding do not advance their counter, so the slot
                            // written below is reused by that lane's next attempt.
                            ParallelMath::Int16CompFlag isInBounds = ParallelMath::Less(ParallelMath::MakeUInt15(i), numUniqueColors);

                            MUInt15 storageIndexes = drs.diffNumAttempts[sector];
                            drs.diffNumAttempts[sector] = drs.diffNumAttempts[sector] + ParallelMath::SelectOrZero(isInBounds, ParallelMath::MakeUInt15(1));

                            for (int block = 0; block < ParallelMath::ParallelSize; block++)
                            {
                                int storageIndex = ParallelMath::Extract(storageIndexes, block);

                                ParallelMath::PutFloat(drs.diffErrors[sector][storageIndex], block, ParallelMath::Extract(error, block));
                                ParallelMath::PutUInt16(drs.diffSelectors[sector][storageIndex], block, ParallelMath::Extract(selectors, block));
                                ParallelMath::PutUInt15(drs.diffColors[sector][storageIndex], block, ParallelMath::Extract(candidateColor, block));
                                ParallelMath::PutUInt15(drs.diffTables[sector][storageIndex], block, table);
                            }
                        }
                    }
                }
            }

            if (d == 0)
            {
                // The two sectors were optimized independently, so the total is just the sum.
                MFloat bestIndErrorTotal = bestIndError[0] + bestIndError[1];
                ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(bestIndErrorTotal, bestTotalError));
                if (ParallelMath::AnySet(errorBetter))
                {
                    bestIsThisMode = bestIsThisMode | errorBetter;

                    bestTotalError = ParallelMath::Min(bestTotalError, bestIndErrorTotal);
                    ParallelMath::ConditionalSet(bestFlip, errorBetter, ParallelMath::MakeUInt15(flip));
                    ParallelMath::ConditionalSet(bestD, errorBetter, ParallelMath::MakeUInt15(d));
                    for (int sector = 0; sector < 2; sector++)
                    {
                        ParallelMath::ConditionalSet(bestColors[sector], errorBetter, bestIndColors[sector]);
                        ParallelMath::ConditionalSet(bestSelectors[sector], errorBetter, bestIndSelectors[sector]);
                        ParallelMath::ConditionalSet(bestTables[sector], errorBetter, bestIndTable[sector]);
                    }
                }
            }
            else
            {
                // The stored d == 1 attempts of both sectors are resolved jointly.
                ParallelMath::Int16CompFlag canIgnoreSector[2] = { ParallelMath::MakeBoolInt16(false), ParallelMath::MakeBoolInt16(false) };
                FindBestDifferentialCombination(flip, d, canIgnoreSector, bestIsThisMode, bestTotalError, bestFlip, bestD, bestColors, bestSelectors, bestTables, drs);
            }
        }
    }

    // Emit a block for every lane that this function improved.
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
    {
        if (!ParallelMath::Extract(bestIsThisMode, block))
            continue;

        int blockBestFlip = ParallelMath::Extract(bestFlip, block);
        int blockBestD = ParallelMath::Extract(bestD, block);
        int blockBestTables[2] = { ParallelMath::Extract(bestTables[0], block), ParallelMath::Extract(bestTables[1], block) };
        ParallelMath::ScalarUInt16 blockBestSelectors[2] = { ParallelMath::Extract(bestSelectors[0], block), ParallelMath::Extract(bestSelectors[1], block) };

        // Unpack the 5-bits-per-channel packed colors.
        int colors[2][3];
        for (int sector = 0; sector < 2; sector++)
        {
            int sectorColor = ParallelMath::Extract(bestColors[sector], block);
            for (int ch = 0; ch < 3; ch++)
                colors[sector][ch] = (sectorColor >> (ch * 5)) & 31;
        }

        EmitETC1Block(outputBuffer + block * 8, blockBestFlip, blockBestD, colors, blockBestTables, blockBestSelectors, false);
    }
}
2883

2884

2885
// Punchthrough variant of the ETC1-style search: evaluates candidate base colors per sector
// with TestHalfBlockPunchthrough, resolves them with FindBestDifferentialCombination (d == 1),
// and emits a block for every lane whose error improved on the incoming bestTotalError.
//
// bestTotalError    - in/out: per-lane error to beat, passed to FindBestDifferentialCombination.
// outputBuffer      - 8 bytes per lane (lane * 8 offset); only written for improved lanes.
// pixels            - per-pixel channel values, gathered into sectors through g_flipTables.
// preWeightedPixels - per-pixel float values passed through to TestHalfBlockPunchthrough.
// isTransparent     - per-pixel flags, gathered into sectors through g_flipTables.
// drs               - scratch storage for the attempts consumed by FindBestDifferentialCombination.
// options           - forwarded to TestHalfBlockPunchthrough.
void cvtt::Internal::ETCComputer::CompressETC1PunchthroughBlockInternal(MFloat &bestTotalError, uint8_t *outputBuffer, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const ParallelMath::Int16CompFlag isTransparent[16], DifferentialResolveStorage &drs, const Options &options)
{
    MUInt15 zeroU15 = ParallelMath::MakeUInt15(0);
    MUInt16 zeroU16 = ParallelMath::MakeUInt16(0);

    MUInt15 bestColors[2] = { zeroU15, zeroU15 };
    MUInt16 bestSelectors[2] = { zeroU16, zeroU16 };
    MUInt15 bestTables[2] = { zeroU15, zeroU15 };
    MUInt15 bestFlip = zeroU15;

    MUInt15 sectorPixels[2][2][8][3];
    ParallelMath::Int16CompFlag sectorTransparent[2][2][8];
    MFloat sectorPreWeightedPixels[2][2][8][3];
    MUInt15 sectorCumulative[2][2][3];

    // Lanes for which this function produced the best encoding so far.
    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);

    // Gather pixels and transparency flags per sector for both flip layouts and
    // accumulate per-channel sums.
    for (int flip = 0; flip < 2; flip++)
    {
        for (int sector = 0; sector < 2; sector++)
        {
            for (int ch = 0; ch < 3; ch++)
                sectorCumulative[flip][sector][ch] = zeroU15;

            for (int px = 0; px < 8; px++)
            {
                for (int ch = 0; ch < 3; ch++)
                {
                    MUInt15 pixelChannelValue = pixels[g_flipTables[flip][sector][px]][ch];
                    sectorPixels[flip][sector][px][ch] = pixelChannelValue;
                    sectorPreWeightedPixels[flip][sector][px][ch] = preWeightedPixels[g_flipTables[flip][sector][px]][ch];
                    sectorCumulative[flip][sector][ch] = sectorCumulative[flip][sector][ch] + pixelChannelValue;
                }

                sectorTransparent[flip][sector][px] = isTransparent[g_flipTables[flip][sector][px]];
            }
        }
    }

    static const MUInt15 modifiers[8] =
    {
        ParallelMath::MakeUInt15(8),
        ParallelMath::MakeUInt15(17),
        ParallelMath::MakeUInt15(29),
        ParallelMath::MakeUInt15(42),
        ParallelMath::MakeUInt15(60),
        ParallelMath::MakeUInt15(80),
        ParallelMath::MakeUInt15(106),
        ParallelMath::MakeUInt15(183),
    };

    // Offset multipliers run from -8 to +8 at most (8 pixels per sector), i.e. 17 candidates.
    const int maxSectorCumulativeOffsets = 17;

    for (int flip = 0; flip < 2; flip++)
    {
        // NOTE(review): the initial values differ between the two sectors. Because the loop
        // below only ANDs flags in, canIgnoreSector[1] can never become true. Confirm whether
        // { true, true } was intended before changing; the consumer is not visible here.
        ParallelMath::Int16CompFlag canIgnoreSector[2] = { ParallelMath::MakeBoolInt16(true), ParallelMath::MakeBoolInt16(false) };

        for (int sector = 0; sector < 2; sector++)
            for (int px = 0; px < 8; px++)
                canIgnoreSector[sector] = canIgnoreSector[sector] & sectorTransparent[flip][sector][px];

        drs.diffNumAttempts[0] = drs.diffNumAttempts[1] = zeroU15;

        for (int sector = 0; sector < 2; sector++)
        {
            // NOTE(review): this adds 1 for every pixel whose sectorTransparent flag is set,
            // while the variable is named "opaque". Confirm the polarity of isTransparent
            // against the caller.
            MUInt15 sectorNumOpaque = ParallelMath::MakeUInt15(0);
            for (int px = 0; px < 8; px++)
                sectorNumOpaque = sectorNumOpaque + ParallelMath::SelectOrZero(sectorTransparent[flip][sector][px], ParallelMath::MakeUInt15(1));

            // Largest count over all lanes; bounds the shared candidate loop below.
            int sectorMaxOpaque = 0;
            for (int block = 0; block < ParallelMath::ParallelSize; block++)
                sectorMaxOpaque = std::max<int>(sectorMaxOpaque, ParallelMath::Extract(sectorNumOpaque, block));

            int sectorNumOpaqueMultipliers = sectorMaxOpaque * 2 + 1;

            // Denominator is clamped to at least 1 * 256 so the per-lane division never divides by zero.
            MUInt15 sectorNumOpaqueDenominator = ParallelMath::Max(ParallelMath::MakeUInt15(1), sectorNumOpaque) << 8;
            MUInt15 sectorNumOpaqueAddend = sectorNumOpaque << 7;

            MSInt16 sectorNumOpaqueSigned = ParallelMath::LosslessCast<MSInt16>::Cast(sectorNumOpaque);
            MSInt16 negSectorNumOpaqueSigned = ParallelMath::MakeSInt16(0) - sectorNumOpaqueSigned;

            MUInt15 sectorCumulativeMax = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(ParallelMath::MakeUInt15(255), sectorNumOpaque));

            for (int table = 0; table < 8; table++)
            {
                MUInt15 possibleColors[maxSectorCumulativeOffsets];

                MUInt15 quantized[3];
                for (int om = -sectorMaxOpaque; om <= sectorMaxOpaque; om++)
                {
                    // Each lane clamps the multiplier to its own [-count, +count] range.
                    MSInt16 clampedOffsetMult = ParallelMath::Max(ParallelMath::Min(ParallelMath::MakeSInt16(om), sectorNumOpaqueSigned), negSectorNumOpaqueSigned);
                    MSInt16 offset = ParallelMath::CompactMultiply(clampedOffsetMult, modifiers[table]);

                    for (int ch = 0; ch < 3; ch++)
                    {
                        // cu is in range 0..255*numOpaque (at most 0..2040)
                        MUInt15 cu15 = ParallelMath::Min(
                            sectorCumulativeMax,
                            ParallelMath::ToUInt15(
                                ParallelMath::Max(
                                    ParallelMath::MakeSInt16(0),
                                    ParallelMath::LosslessCast<MSInt16>::Cast(sectorCumulative[flip][sector][ch]) + offset
                                )
                            )
                        );

                        //quantized[ch] = (cu * 31 + (cu >> 3) + (numOpaque * 128)) / (numOpaque * 256)
                        MUInt16 cuTimes31 = (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15);
                        MUInt15 cuDiv8 = ParallelMath::RightShift(cu15, 3);
                        MUInt16 numerator = cuTimes31 + ParallelMath::LosslessCast<MUInt16>::Cast(cuDiv8 + sectorNumOpaqueAddend);
                        for (int block = 0; block < ParallelMath::ParallelSize; block++)
                            ParallelMath::PutUInt15(quantized[ch], block, ParallelMath::Extract(numerator, block) / ParallelMath::Extract(sectorNumOpaqueDenominator, block));
                    }

                    // Pack the three channels into one value, 5 bits apart.
                    possibleColors[om + sectorMaxOpaque] = quantized[0] | (quantized[1] << 5) | (quantized[2] << 10);
                }

                // Per lane, collapse runs of equal adjacent candidates in place.
                ParallelMath::UInt15 numUniqueColors;
                for (int block = 0; block < ParallelMath::ParallelSize; block++)
                {
                    uint16_t blockNumUniqueColors = 1;
                    for (int i = 1; i < sectorNumOpaqueMultipliers; i++)
                    {
                        uint16_t color = ParallelMath::Extract(possibleColors[i], block);
                        if (color != ParallelMath::Extract(possibleColors[blockNumUniqueColors - 1], block))
                            ParallelMath::PutUInt15(possibleColors[blockNumUniqueColors++], block, color);
                    }

                    ParallelMath::PutUInt15(numUniqueColors, block, blockNumUniqueColors);
                }

                int maxUniqueColors = ParallelMath::Extract(numUniqueColors, 0);
                for (int block = 1; block < ParallelMath::ParallelSize; block++)
                    maxUniqueColors = std::max<int>(maxUniqueColors, ParallelMath::Extract(numUniqueColors, block));

                // Lanes with fewer unique candidates are padded with their first candidate.
                for (int block = 0; block < ParallelMath::ParallelSize; block++)
                {
                    uint16_t fillColor = ParallelMath::Extract(possibleColors[0], block);
                    for (int i = ParallelMath::Extract(numUniqueColors, block); i < maxUniqueColors; i++)
                        ParallelMath::PutUInt15(possibleColors[i], block, fillColor);
                }

                for (int i = 0; i < maxUniqueColors; i++)
                {
                    MFloat error = ParallelMath::MakeFloatZero();
                    MUInt16 selectors = ParallelMath::MakeUInt16(0);
                    MUInt15 candidateColor = possibleColors[i];
                    TestHalfBlockPunchthrough(error, selectors, candidateColor, sectorPixels[flip][sector], sectorPreWeightedPixels[flip][sector], sectorTransparent[flip][sector], modifiers[table], options);

                    // Lanes for which index i is only padding do not advance their counter, so
                    // the slot written below is reused by that lane's next attempt.
                    ParallelMath::Int16CompFlag isInBounds = ParallelMath::Less(ParallelMath::MakeUInt15(i), numUniqueColors);

                    MUInt15 storageIndexes = drs.diffNumAttempts[sector];
                    drs.diffNumAttempts[sector] = drs.diffNumAttempts[sector] + ParallelMath::SelectOrZero(isInBounds, ParallelMath::MakeUInt15(1));

                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
                    {
                        int storageIndex = ParallelMath::Extract(storageIndexes, block);

                        ParallelMath::PutFloat(drs.diffErrors[sector][storageIndex], block, ParallelMath::Extract(error, block));
                        ParallelMath::PutUInt16(drs.diffSelectors[sector][storageIndex], block, ParallelMath::Extract(selectors, block));
                        ParallelMath::PutUInt15(drs.diffColors[sector][storageIndex], block, ParallelMath::Extract(candidateColor, block));
                        ParallelMath::PutUInt15(drs.diffTables[sector][storageIndex], block, table);
                    }
                }
            }
        }

        // The d output is not needed here: blocks are always emitted with d == 1 below.
        MUInt15 bestDDummy = ParallelMath::MakeUInt15(0);
        FindBestDifferentialCombination(flip, 1, canIgnoreSector, bestIsThisMode, bestTotalError, bestFlip, bestDDummy, bestColors, bestSelectors, bestTables, drs);
    }

    // Emit a block for every lane that this function improved.
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
    {
        if (!ParallelMath::Extract(bestIsThisMode, block))
            continue;

        int blockBestColors[2][3];
        int blockBestTables[2];
        ParallelMath::ScalarUInt16 blockBestSelectors[2];
        for (int sector = 0; sector < 2; sector++)
        {
            // Unpack the 5-bits-per-channel packed color.
            int sectorColor = ParallelMath::Extract(bestColors[sector], block);
            for (int ch = 0; ch < 3; ch++)
                blockBestColors[sector][ch] = (sectorColor >> (ch * 5)) & 31;

            blockBestTables[sector] = ParallelMath::Extract(bestTables[sector], block);
            blockBestSelectors[sector] = ParallelMath::Extract(bestSelectors[sector], block);
        }

        EmitETC1Block(outputBuffer + block * 8, ParallelMath::Extract(bestFlip, block), 1, blockBestColors, blockBestTables, blockBestSelectors, true);
    }
}
3081

3082

3083
// Obtains storage from allocFunc and constructs the internal ETC1 state in place.
// Returns NULL when allocFunc yields a null pointer; otherwise the constructed object.
cvtt::ETC1CompressionData *cvtt::Internal::ETCComputer::AllocETC1Data(cvtt::Kernels::allocFunc_t allocFunc, void *context)
{
    typedef cvtt::Internal::ETCComputer::ETC1CompressionDataInternal InternalData;

    void *storage = allocFunc(context, sizeof(InternalData));

    // Only construct when the allocation succeeded; a null storage pointer is returned as-is.
    if (storage != NULL)
        new (storage) InternalData(context);

    return static_cast<ETC1CompressionData*>(storage);
}
3091

3092
// Destroys the internal ETC1 state and hands its storage back through freeFunc,
// using the context pointer that was stored in the object.
void cvtt::Internal::ETCComputer::ReleaseETC1Data(ETC1CompressionData *compressionData, cvtt::Kernels::freeFunc_t freeFunc)
{
    typedef cvtt::Internal::ETCComputer::ETC1CompressionDataInternal InternalData;

    InternalData *const state = static_cast<InternalData*>(compressionData);

    // The context must be read before the destructor runs, because freeFunc needs it afterwards.
    void *const allocContext = state->m_context;
    state->~InternalData();

    freeFunc(allocContext, compressionData, sizeof(InternalData));
}
3099

3100
// Obtains storage from allocFunc and constructs the internal ETC2 state in place from
// the given options. Returns NULL when allocFunc yields a null pointer.
cvtt::ETC2CompressionData *cvtt::Internal::ETCComputer::AllocETC2Data(cvtt::Kernels::allocFunc_t allocFunc, void *context, const cvtt::Options &options)
{
    typedef cvtt::Internal::ETCComputer::ETC2CompressionDataInternal InternalData;

    void *storage = allocFunc(context, sizeof(InternalData));

    // Only construct when the allocation succeeded; a null storage pointer is returned as-is.
    if (storage != NULL)
        new (storage) InternalData(context, options);

    return static_cast<ETC2CompressionData*>(storage);
}
3108

3109
// Destroys the internal ETC2 state and hands its storage back through freeFunc,
// using the context pointer that was stored in the object.
void cvtt::Internal::ETCComputer::ReleaseETC2Data(ETC2CompressionData *compressionData, cvtt::Kernels::freeFunc_t freeFunc)
{
    typedef cvtt::Internal::ETCComputer::ETC2CompressionDataInternal InternalData;

    InternalData *const state = static_cast<InternalData*>(compressionData);

    // The context must be read before the destructor runs, because freeFunc needs it afterwards.
    void *const allocContext = state->m_context;
    state->~InternalData();

    freeFunc(allocContext, compressionData, sizeof(InternalData));
}
3116

3117
// Stores the allocation context and derives two chroma side axes from the channel weights.
//
// With cd = (redWeight, greenWeight, blueWeight):
//   m_chromaSideAxis0 = seed with its component along cd removed,
//   m_chromaSideAxis1 = cross(m_chromaSideAxis0, cd), rescaled to the length of axis 0,
// so both axes are perpendicular to cd and to each other, and have equal length.
//
// The primary seed is cd rotated by one component (g, b, r). When that seed is parallel to
// cd (for example when all three weights are equal) the projection is the zero vector and
// the rescale would evaluate sqrt(0 / 0); in that case a unit vector along the smallest
// |cd| component is used as the seed instead. If no seed yields a usable basis (for example
// all weights are zero), both axes are left as zero vectors rather than NaN.
cvtt::Internal::ETCComputer::ETC2CompressionDataInternal::ETC2CompressionDataInternal(void *context, const cvtt::Options &options)
    : m_context(context)
{
    const float cd[3] = { options.redWeight, options.greenWeight, options.blueWeight };

    // Index of the weight with the smallest magnitude; a unit vector along it is the
    // seed least aligned with cd.
    int smallest = 0;
    for (int i = 1; i < 3; i++)
    {
        if (cd[i] * cd[i] < cd[smallest] * cd[smallest])
            smallest = i;
    }

    float seeds[2][3] =
    {
        { cd[1], cd[2], cd[0] },
        { 0.0f, 0.0f, 0.0f }
    };
    seeds[1][smallest] = 1.0f;

    float chromaAxis0[3] = { 0.0f, 0.0f, 0.0f };
    float chromaAxis1[3] = { 0.0f, 0.0f, 0.0f };

    for (int attempt = 0; attempt < 2; attempt++)
    {
        const float *rotCD = seeds[attempt];

        // Remove the component of the seed that lies along cd.
        const float offs = -(rotCD[0] * cd[0] + rotCD[1] * cd[1] + rotCD[2] * cd[2]) / (cd[0] * cd[0] + cd[1] * cd[1] + cd[2] * cd[2]);

        const float candidateAxis0[3] = { rotCD[0] + cd[0] * offs, rotCD[1] + cd[1] * offs, rotCD[2] + cd[2] * offs };

        // Second axis: perpendicular to both the first axis and cd.
        const float candidateAxis1Unnormalized[3] =
        {
            candidateAxis0[1] * cd[2] - candidateAxis0[2] * cd[1],
            candidateAxis0[2] * cd[0] - candidateAxis0[0] * cd[2],
            candidateAxis0[0] * cd[1] - candidateAxis0[1] * cd[0]
        };

        const float ca0LengthSq = (candidateAxis0[0] * candidateAxis0[0] + candidateAxis0[1] * candidateAxis0[1] + candidateAxis0[2] * candidateAxis0[2]);
        const float ca1UNLengthSq = (candidateAxis1Unnormalized[0] * candidateAxis1Unnormalized[0] + candidateAxis1Unnormalized[1] * candidateAxis1Unnormalized[1] + candidateAxis1Unnormalized[2] * candidateAxis1Unnormalized[2]);

        // Written as negated "> 0" so that NaN lengths are rejected as well as zero ones.
        if (!(ca0LengthSq > 0.0f) || !(ca1UNLengthSq > 0.0f))
            continue;

        // Rescale the second axis to the length of the first.
        const float lengthRatio = static_cast<float>(std::sqrt(ca0LengthSq / ca1UNLengthSq));

        for (int i = 0; i < 3; i++)
        {
            chromaAxis0[i] = candidateAxis0[i];
            chromaAxis1[i] = candidateAxis1Unnormalized[i] * lengthRatio;
        }
        break;
    }

    for (int i = 0; i < 3; i++)
    {
        m_chromaSideAxis0[i] = chromaAxis0[i];
        m_chromaSideAxis1[i] = chromaAxis1[i];
    }
}
3146

3147
#endif
3148

3149
Product

Resources

Company