CoCalc -- ConvectionKernels

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/cvtt/ConvectionKernels_S3TC.cpp
⁹⁹⁰³ views
1
/*
2
Convection Texture Tools
3
Copyright (c) 2018-2019 Eric Lasota
4

5
Permission is hereby granted, free of charge, to any person obtaining
6
a copy of this software and associated documentation files (the
7
"Software"), to deal in the Software without restriction, including
8
without limitation the rights to use, copy, modify, merge, publish,
9
distribute, sublicense, and/or sell copies of the Software, and to
10
permit persons to whom the Software is furnished to do so, subject
11
to the following conditions:
12

13
The above copyright notice and this permission notice shall be included
14
in all copies or substantial portions of the Software.
15

16
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23

24
-------------------------------------------------------------------------------------
25

26
Portions based on DirectX Texture Library (DirectXTex)
27

28
Copyright (c) Microsoft Corporation. All rights reserved.
29
Licensed under the MIT License.
30

31
http://go.microsoft.com/fwlink/?LinkId=248926
32
*/
33
#include "ConvectionKernels_Config.h"
34

35
#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
36

37
#include "ConvectionKernels_S3TC.h"
38

39
#include "ConvectionKernels_AggregatedError.h"
40
#include "ConvectionKernels_BCCommon.h"
41
#include "ConvectionKernels_EndpointRefiner.h"
42
#include "ConvectionKernels_EndpointSelector.h"
43
#include "ConvectionKernels_IndexSelector.h"
44
#include "ConvectionKernels_UnfinishedEndpoints.h"
45
#include "ConvectionKernels_S3TC_SingleColor.h"
46

47
// Sets every lane of `error` to FLT_MAX.
void cvtt::Internal::S3TCComputer::Init(MFloat& error)
{
    const MFloat largest = ParallelMath::MakeFloat(FLT_MAX);
    error = largest;
}
51

52
// Replaces `v` in place with its 6-bit quantization, re-expanded by bit replication.
// The arithmetic is (v * 253 + 512) >> 10 followed by (q << 2) | (q >> 4).
// Presumably v is an 8-bit channel value (0..255), for which q lands in 0..63 - confirm with callers.
void cvtt::Internal::S3TCComputer::QuantizeTo6Bits(MUInt15& v)
{
    const MUInt15 scale = ParallelMath::MakeUInt15(253);

    // Fixed-point scale with a rounding bias of half the divisor (512 = 1024 / 2).
    MUInt15 sixBit = ParallelMath::LosslessCast<MUInt15>::Cast(
        ParallelMath::RightShift(ParallelMath::CompactMultiply(v, scale) + ParallelMath::MakeUInt16(512), 10));

    // Shift the code up and copy its top two bits into the vacated low bits.
    MUInt15 upperBits = sixBit << 2;
    MUInt15 lowerBits = ParallelMath::RightShift(sixBit, 4);
    v = upperBits | lowerBits;
}
57

58
// Replaces `v` in place with its 5-bit quantization, re-expanded by bit replication.
// The arithmetic is (v * 249 + 1024) >> 11 followed by (q << 3) | (q >> 2).
// Presumably v is an 8-bit channel value (0..255), for which q lands in 0..31 - confirm with callers.
void cvtt::Internal::S3TCComputer::QuantizeTo5Bits(MUInt15& v)
{
    const MUInt15 scale = ParallelMath::MakeUInt15(249);

    // Fixed-point scale with a rounding bias of half the divisor (1024 = 2048 / 2).
    MUInt15 fiveBit = ParallelMath::LosslessCast<MUInt15>::Cast(
        ParallelMath::RightShift(ParallelMath::CompactMultiply(v, scale) + ParallelMath::MakeUInt16(1024), 11));

    // Shift the code up and copy its top three bits into the vacated low bits.
    MUInt15 upperBits = fiveBit << 3;
    MUInt15 lowerBits = ParallelMath::RightShift(fiveBit, 2);
    v = upperBits | lowerBits;
}
63

64
// Quantizes a three-channel endpoint in place: channel 1 goes through the
// 6-bit quantizer, channels 0 and 2 through the 5-bit quantizer.
void cvtt::Internal::S3TCComputer::QuantizeTo565(MUInt15 endPoint[3])
{
    for (int ch = 0; ch < 3; ch++)
    {
        if (ch == 1)
            QuantizeTo6Bits(endPoint[ch]);
        else
            QuantizeTo5Bits(endPoint[ch]);
    }
}
70

71
// Returns 3% of the absolute value of `span`, as a float vector.
// Callers in this file add the result to per-channel absolute differences (see ParanoidDiff).
cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidFactorForSpan(const MSInt16& span)
{
    MFloat magnitude = ParallelMath::Abs(ParallelMath::ToFloat(span));
    return magnitude * 0.03f;
}
75

76
// Returns (|a - b| + d)^2 per lane: the absolute difference of the two values,
// padded by the extra amount `d`, then squared.
cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidDiff(const MUInt15& a, const MUInt15& b, const MFloat& d)
{
    // The subtraction is done on signed values so that a < b does not wrap.
    MSInt16 signedA = ParallelMath::LosslessCast<MSInt16>::Cast(a);
    MSInt16 signedB = ParallelMath::LosslessCast<MSInt16>::Cast(b);

    MFloat padded = ParallelMath::Abs(ParallelMath::ToFloat(signedA - signedB)) + d;
    return padded * padded;
}
82

83
void cvtt::Internal::S3TCComputer::TestSingleColor(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], int range, const float* channelWeights,
84
    MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, const ParallelMath::RoundTowardNearestForScope *rtn)
85
{
86
    float channelWeightsSq[3];
87

88
    for (int ch = 0; ch < 3; ch++)
89
        channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
90

91
    MUInt15 totals[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
92

93
    for (int px = 0; px < 16; px++)
94
    {
95
        for (int ch = 0; ch < 3; ch++)
96
            totals[ch] = totals[ch] + pixels[px][ch];
97
    }
98

99
    MUInt15 average[3];
100
    for (int ch = 0; ch < 3; ch++)
101
        average[ch] = ParallelMath::RightShift(totals[ch] + ParallelMath::MakeUInt15(8), 4);
102

103
    const Tables::S3TCSC::TableEntry* rbTable = NULL;
104
    const Tables::S3TCSC::TableEntry* gTable = NULL;
105
    if (flags & cvtt::Flags::S3TC_Paranoid)
106
    {
107
        if (range == 4)
108
        {
109
            rbTable = Tables::S3TCSC::g_singleColor5_3_p;
110
            gTable = Tables::S3TCSC::g_singleColor6_3_p;
111
        }
112
        else
113
        {
114
            assert(range == 3);
115
            rbTable = Tables::S3TCSC::g_singleColor5_2_p;
116
            gTable = Tables::S3TCSC::g_singleColor6_2_p;
117
        }
118
    }
119
    else
120
    {
121
        if (range == 4)
122
        {
123
            rbTable = Tables::S3TCSC::g_singleColor5_3;
124
            gTable = Tables::S3TCSC::g_singleColor6_3;
125
        }
126
        else
127
        {
128
            assert(range == 3);
129
            rbTable = Tables::S3TCSC::g_singleColor5_2;
130
            gTable = Tables::S3TCSC::g_singleColor6_2;
131
        }
132
    }
133

134
    MUInt15 interpolated[3];
135
    MUInt15 eps[2][3];
136
    MSInt16 spans[3];
137
    for (int i = 0; i < ParallelMath::ParallelSize; i++)
138
    {
139
        for (int ch = 0; ch < 3; ch++)
140
        {
141
            uint16_t avg = ParallelMath::Extract(average[ch], i);
142
            const Tables::S3TCSC::TableEntry& tableEntry = ((ch == 1) ? gTable[avg] : rbTable[avg]);
143
            ParallelMath::PutUInt15(eps[0][ch], i, tableEntry.m_min);
144
            ParallelMath::PutUInt15(eps[1][ch], i, tableEntry.m_max);
145
            ParallelMath::PutUInt15(interpolated[ch], i, tableEntry.m_actualColor);
146
            ParallelMath::PutSInt16(spans[ch], i, tableEntry.m_span);
147
        }
148
    }
149

150
    MFloat error = ParallelMath::MakeFloatZero();
151
    if (flags & cvtt::Flags::S3TC_Paranoid)
152
    {
153
        MFloat spanParanoidFactors[3];
154
        for (int ch = 0; ch < 3; ch++)
155
            spanParanoidFactors[ch] = ParanoidFactorForSpan(spans[ch]);
156

157
        for (int px = 0; px < 16; px++)
158
        {
159
            for (int ch = 0; ch < 3; ch++)
160
                error = error + ParanoidDiff(interpolated[ch], pixels[px][ch], spanParanoidFactors[ch]) * channelWeightsSq[ch];
161
        }
162
    }
163
    else
164
    {
165
        for (int px = 0; px < 16; px++)
166
        {
167
            for (int ch = 0; ch < 3; ch++)
168
                error = error + ParallelMath::ToFloat(ParallelMath::SqDiffUInt8(interpolated[ch], pixels[px][ch])) * channelWeightsSq[ch];
169
        }
170
    }
171

172
    ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
173
    ParallelMath::Int16CompFlag better16 = ParallelMath::FloatFlagToInt16(better);
174

175
    if (ParallelMath::AnySet(better16))
176
    {
177
        bestError = ParallelMath::Min(bestError, error);
178
        for (int epi = 0; epi < 2; epi++)
179
            for (int ch = 0; ch < 3; ch++)
180
                ParallelMath::ConditionalSet(bestEndpoints[epi][ch], better16, eps[epi][ch]);
181

182
        MUInt15 vindexes = ParallelMath::MakeUInt15(1);
183
        for (int px = 0; px < 16; px++)
184
            ParallelMath::ConditionalSet(bestIndexes[px], better16, vindexes);
185

186
        ParallelMath::ConditionalSet(bestRange, better16, ParallelMath::MakeUInt15(range));
187
    }
188
}
189

190
void cvtt::Internal::S3TCComputer::TestEndpoints(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], const MUInt15 unquantizedEndPoints[2][3], int range, const float* channelWeights,
191
    MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, EndpointRefiner<3> *refiner, const ParallelMath::RoundTowardNearestForScope *rtn)
192
{
193
    float channelWeightsSq[3];
194

195
    for (int ch = 0; ch < 3; ch++)
196
        channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
197

198
    MUInt15 endPoints[2][3];
199

200
    for (int ep = 0; ep < 2; ep++)
201
        for (int ch = 0; ch < 3; ch++)
202
            endPoints[ep][ch] = unquantizedEndPoints[ep][ch];
203

204
    QuantizeTo565(endPoints[0]);
205
    QuantizeTo565(endPoints[1]);
206

207
    IndexSelector<3> selector;
208
    selector.Init<false>(channelWeights, endPoints, range);
209

210
    MUInt15 indexes[16];
211

212
    MFloat paranoidFactors[3];
213
    for (int ch = 0; ch < 3; ch++)
214
        paranoidFactors[ch] = ParanoidFactorForSpan(ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[0][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[1][ch]));
215

216
    MFloat error = ParallelMath::MakeFloatZero();
217
    AggregatedError<3> aggError;
218
    for (int px = 0; px < 16; px++)
219
    {
220
        MUInt15 index = selector.SelectIndexLDR(floatPixels[px], rtn);
221
        indexes[px] = index;
222

223
        if (refiner)
224
            refiner->ContributeUnweightedPW(preWeightedPixels[px], index);
225

226
        MUInt15 reconstructed[3];
227
        selector.ReconstructLDRPrecise(index, reconstructed);
228

229
        if (flags & Flags::S3TC_Paranoid)
230
        {
231
            for (int ch = 0; ch < 3; ch++)
232
                error = error + ParanoidDiff(reconstructed[ch], pixels[px][ch], paranoidFactors[ch]) * channelWeightsSq[ch];
233
        }
234
        else
235
            BCCommon::ComputeErrorLDR<3>(flags, reconstructed, pixels[px], aggError);
236
    }
237

238
    if (!(flags & Flags::S3TC_Paranoid))
239
        error = aggError.Finalize(flags, channelWeightsSq);
240

241
    ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
242

243
    if (ParallelMath::AnySet(better))
244
    {
245
        ParallelMath::Int16CompFlag betterInt16 = ParallelMath::FloatFlagToInt16(better);
246

247
        ParallelMath::ConditionalSet(bestError, better, error);
248

249
        for (int ep = 0; ep < 2; ep++)
250
            for (int ch = 0; ch < 3; ch++)
251
                ParallelMath::ConditionalSet(bestEndpoints[ep][ch], betterInt16, endPoints[ep][ch]);
252

253
        for (int px = 0; px < 16; px++)
254
            ParallelMath::ConditionalSet(bestIndexes[px], betterInt16, indexes[px]);
255

256
        ParallelMath::ConditionalSet(bestRange, betterInt16, ParallelMath::MakeUInt15(static_cast<uint16_t>(range)));
257
    }
258
}
259

260
void cvtt::Internal::S3TCComputer::TestCounts(uint32_t flags, const int *counts, int nCounts, const MUInt15 &numElements, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], bool alphaTest,
261
    const MFloat floatSortedInputs[16][4], const MFloat preWeightedFloatSortedInputs[16][4], const float *channelWeights, MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange,
262
    const ParallelMath::RoundTowardNearestForScope* rtn)
263
{
264
    UNREFERENCED_PARAMETER(alphaTest);
265
    UNREFERENCED_PARAMETER(flags);
266

267
    EndpointRefiner<3> refiner;
268

269
    refiner.Init(nCounts, channelWeights);
270

271
    bool escape = false;
272
    int e = 0;
273
    for (int i = 0; i < nCounts; i++)
274
    {
275
        for (int n = 0; n < counts[i]; n++)
276
        {
277
            ParallelMath::Int16CompFlag valid = ParallelMath::Less(ParallelMath::MakeUInt15(static_cast<uint16_t>(n)), numElements);
278
            if (!ParallelMath::AnySet(valid))
279
            {
280
                escape = true;
281
                break;
282
            }
283

284
            if (ParallelMath::AllSet(valid))
285
                refiner.ContributeUnweightedPW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)));
286
            else
287
            {
288
                MFloat weight = ParallelMath::Select(ParallelMath::Int16FlagToFloat(valid), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(0.0f));
289
                refiner.ContributePW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), weight);
290
            }
291
        }
292

293
        if (escape)
294
            break;
295
    }
296

297
    MUInt15 endPoints[2][3];
298
    refiner.GetRefinedEndpointsLDR(endPoints, rtn);
299

300
    TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, nCounts, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, NULL, rtn);
301
}
302

303
void cvtt::Internal::S3TCComputer::PackExplicitAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride)
304
{
305
    UNREFERENCED_PARAMETER(flags);
306
    ParallelMath::RoundTowardNearestForScope rtn;
307

308
    float weights[1] = { 1.0f };
309

310
    MUInt15 pixels[16];
311
    MFloat floatPixels[16];
312

313
    for (int px = 0; px < 16; px++)
314
    {
315
        ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
316
        floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
317
    }
318

319
    MUInt15 ep[2][1] = { { ParallelMath::MakeUInt15(0) },{ ParallelMath::MakeUInt15(255) } };
320

321
    IndexSelector<1> selector;
322
    selector.Init<false>(weights, ep, 16);
323

324
    MUInt15 indexes[16];
325

326
    for (int px = 0; px < 16; px++)
327
        indexes[px] = selector.SelectIndexLDR(&floatPixels[px], &rtn);
328

329
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
330
    {
331
        for (int px = 0; px < 16; px += 2)
332
        {
333
            int index0 = ParallelMath::Extract(indexes[px], block);
334
            int index1 = ParallelMath::Extract(indexes[px + 1], block);
335

336
            packedBlocks[px / 2] = static_cast<uint8_t>(index0 | (index1 << 4));
337
        }
338

339
        packedBlocks += packedBlockStride;
340
    }
341
}
342

343
void cvtt::Internal::S3TCComputer::PackInterpolatedAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds)
344
{
345
    if (maxTweakRounds < 1)
346
        maxTweakRounds = 1;
347

348
    if (numRefineRounds < 1)
349
        numRefineRounds = 1;
350

351
    ParallelMath::RoundTowardNearestForScope rtn;
352

353
    float oneWeight[1] = { 1.0f };
354

355
    MUInt15 pixels[16];
356
    MFloat floatPixels[16];
357

358
    MUInt15 highTerminal = isSigned ? ParallelMath::MakeUInt15(254) : ParallelMath::MakeUInt15(255);
359
    MUInt15 highTerminalMinusOne = highTerminal - ParallelMath::MakeUInt15(1);
360

361
    for (int px = 0; px < 16; px++)
362
    {
363
        ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
364

365
        if (isSigned)
366
            pixels[px] = ParallelMath::Min(pixels[px], highTerminal);
367

368
        floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
369
    }
370

371
    MUInt15 sortedPixels[16];
372
    for (int px = 0; px < 16; px++)
373
        sortedPixels[px] = pixels[px];
374

375
    for (int sortEnd = 15; sortEnd > 0; sortEnd--)
376
    {
377
        for (int sortOffset = 0; sortOffset < sortEnd; sortOffset++)
378
        {
379
            MUInt15 a = sortedPixels[sortOffset];
380
            MUInt15 b = sortedPixels[sortOffset + 1];
381

382
            sortedPixels[sortOffset] = ParallelMath::Min(a, b);
383
            sortedPixels[sortOffset + 1] = ParallelMath::Max(a, b);
384
        }
385
    }
386

387
    MUInt15 zero = ParallelMath::MakeUInt15(0);
388
    MUInt15 one = ParallelMath::MakeUInt15(1);
389

390
    MUInt15 bestIsFullRange = zero;
391
    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
392
    MUInt15 bestEP[2] = { zero, zero };
393
    MUInt15 bestIndexes[16] = {
394
        zero, zero, zero, zero,
395
        zero, zero, zero, zero,
396
        zero, zero, zero, zero,
397
        zero, zero, zero, zero
398
    };
399

400
    // Full-precision
401
    {
402
        MUInt15 minEP = sortedPixels[0];
403
        MUInt15 maxEP = sortedPixels[15];
404

405
        MFloat base[1] = { ParallelMath::ToFloat(minEP) };
406
        MFloat offset[1] = { ParallelMath::ToFloat(maxEP - minEP) };
407

408
        UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
409

410
        int numTweakRounds = BCCommon::TweakRoundsForRange(8);
411
        if (numTweakRounds > maxTweakRounds)
412
            numTweakRounds = maxTweakRounds;
413

414
        for (int tweak = 0; tweak < numTweakRounds; tweak++)
415
        {
416
            MUInt15 ep[2][1];
417

418
            ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
419

420
            for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
421
            {
422
                EndpointRefiner<1> refiner;
423
                refiner.Init(8, oneWeight);
424

425
                if (isSigned)
426
                    for (int epi = 0; epi < 2; epi++)
427
                        ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
428

429
                IndexSelector<1> indexSelector;
430
                indexSelector.Init<false>(oneWeight, ep, 8);
431

432
                MUInt15 indexes[16];
433

434
                AggregatedError<1> aggError;
435
                for (int px = 0; px < 16; px++)
436
                {
437
                    MUInt15 index = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
438

439
                    MUInt15 reconstructedPixel;
440

441
                    indexSelector.ReconstructLDRPrecise(index, &reconstructedPixel);
442
                    BCCommon::ComputeErrorLDR<1>(flags, &reconstructedPixel, &pixels[px], aggError);
443

444
                    if (refinePass != numRefineRounds - 1)
445
                        refiner.ContributeUnweightedPW(&floatPixels[px], index);
446

447
                    indexes[px] = index;
448
                }
449
                MFloat error = aggError.Finalize(flags | Flags::Uniform, oneWeight);
450

451
                ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
452
                ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
453

454
                if (ParallelMath::AnySet(errorBetter16))
455
                {
456
                    bestError = ParallelMath::Min(error, bestError);
457
                    ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, one);
458
                    for (int px = 0; px < 16; px++)
459
                        ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
460

461
                    for (int epi = 0; epi < 2; epi++)
462
                        ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
463
                }
464

465
                if (refinePass != numRefineRounds - 1)
466
                    refiner.GetRefinedEndpointsLDR(ep, &rtn);
467
            }
468
        }
469
    }
470

471
    // Reduced precision with special endpoints
472
    {
473
        MUInt15 bestHeuristicMin = sortedPixels[0];
474
        MUInt15 bestHeuristicMax = sortedPixels[15];
475

476
        ParallelMath::Int16CompFlag canTryClipping;
477

478
        // In reduced precision, we want try putting endpoints at the reserved indexes at the ends.
479
        // The heuristic we use is to assign indexes to the end as long as they aren't off by more than half of the index range.
480
        // This will usually not find anything, but it's cheap to check.
481

482
        {
483
            MUInt15 largestPossibleRange = bestHeuristicMax - bestHeuristicMin; // Max: 255
484
            MUInt15 lowestPossibleClearance = ParallelMath::Min(bestHeuristicMin, static_cast<MUInt15>(highTerminal - bestHeuristicMax));
485

486
            MUInt15 lowestPossibleClearanceTimes10 = (lowestPossibleClearance << 2) + (lowestPossibleClearance << 4);
487
            canTryClipping = ParallelMath::LessOrEqual(lowestPossibleClearanceTimes10, largestPossibleRange);
488
        }
489

490
        if (ParallelMath::AnySet(canTryClipping))
491
        {
492
            MUInt15 lowClearances[16];
493
            MUInt15 highClearances[16];
494
            MUInt15 bestSkipCount = ParallelMath::MakeUInt15(0);
495

496
            lowClearances[0] = highClearances[0] = ParallelMath::MakeUInt15(0);
497

498
            for (int px = 1; px < 16; px++)
499
            {
500
                lowClearances[px] = sortedPixels[px - 1];
501
                highClearances[px] = highTerminal - sortedPixels[16 - px];
502
            }
503

504
            for (uint16_t firstIndex = 0; firstIndex < 16; firstIndex++)
505
            {
506
                uint16_t numSkippedLow = firstIndex;
507

508
                MUInt15 lowClearance = lowClearances[firstIndex];
509

510
                for (uint16_t lastIndex = firstIndex; lastIndex < 16; lastIndex++)
511
                {
512
                    uint16_t numSkippedHigh = 15 - lastIndex;
513
                    uint16_t numSkipped = numSkippedLow + numSkippedHigh;
514

515
                    MUInt15 numSkippedV = ParallelMath::MakeUInt15(numSkipped);
516

517
                    ParallelMath::Int16CompFlag areMoreSkipped = ParallelMath::Less(bestSkipCount, numSkippedV);
518

519
                    if (!ParallelMath::AnySet(areMoreSkipped))
520
                        continue;
521

522
                    MUInt15 clearance = ParallelMath::Max(highClearances[numSkippedHigh], lowClearance);
523
                    MUInt15 clearanceTimes10 = (clearance << 2) + (clearance << 4);
524

525
                    MUInt15 range = sortedPixels[lastIndex] - sortedPixels[firstIndex];
526

527
                    ParallelMath::Int16CompFlag isBetter = (areMoreSkipped & ParallelMath::LessOrEqual(clearanceTimes10, range));
528
                    ParallelMath::ConditionalSet(bestHeuristicMin, isBetter, sortedPixels[firstIndex]);
529
                    ParallelMath::ConditionalSet(bestHeuristicMax, isBetter, sortedPixels[lastIndex]);
530
                }
531
            }
532
        }
533

534
        MUInt15 bestSimpleMin = one;
535
        MUInt15 bestSimpleMax = highTerminalMinusOne;
536

537
        for (int px = 0; px < 16; px++)
538
        {
539
            ParallelMath::ConditionalSet(bestSimpleMin, ParallelMath::Less(zero, sortedPixels[15 - px]), sortedPixels[15 - px]);
540
            ParallelMath::ConditionalSet(bestSimpleMax, ParallelMath::Less(sortedPixels[px], highTerminal), sortedPixels[px]);
541
        }
542

543
        MUInt15 minEPs[2] = { bestSimpleMin, bestHeuristicMin };
544
        MUInt15 maxEPs[2] = { bestSimpleMax, bestHeuristicMax };
545

546
        int minEPRange = 2;
547
        if (ParallelMath::AllSet(ParallelMath::Equal(minEPs[0], minEPs[1])))
548
            minEPRange = 1;
549

550
        int maxEPRange = 2;
551
        if (ParallelMath::AllSet(ParallelMath::Equal(maxEPs[0], maxEPs[1])))
552
            maxEPRange = 1;
553

554
        for (int minEPIndex = 0; minEPIndex < minEPRange; minEPIndex++)
555
        {
556
            for (int maxEPIndex = 0; maxEPIndex < maxEPRange; maxEPIndex++)
557
            {
558
                MFloat base[1] = { ParallelMath::ToFloat(minEPs[minEPIndex]) };
559
                MFloat offset[1] = { ParallelMath::ToFloat(maxEPs[maxEPIndex] - minEPs[minEPIndex]) };
560

561
                UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
562

563
                int numTweakRounds = BCCommon::TweakRoundsForRange(6);
564
                if (numTweakRounds > maxTweakRounds)
565
                    numTweakRounds = maxTweakRounds;
566

567
                for (int tweak = 0; tweak < numTweakRounds; tweak++)
568
                {
569
                    MUInt15 ep[2][1];
570

571
                    ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
572

573
                    for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
574
                    {
575
                        EndpointRefiner<1> refiner;
576
                        refiner.Init(6, oneWeight);
577

578
                        if (isSigned)
579
                            for (int epi = 0; epi < 2; epi++)
580
                                ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
581

582
                        IndexSelector<1> indexSelector;
583
                        indexSelector.Init<false>(oneWeight, ep, 6);
584

585
                        MUInt15 indexes[16];
586
                        MFloat error = ParallelMath::MakeFloatZero();
587

588
                        for (int px = 0; px < 16; px++)
589
                        {
590
                            MUInt15 selectedIndex = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
591

592
                            MUInt15 reconstructedPixel;
593

594
                            indexSelector.ReconstructLDRPrecise(selectedIndex, &reconstructedPixel);
595

596
                            MFloat zeroError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &zero, &pixels[px], 1, oneWeight);
597
                            MFloat highTerminalError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &highTerminal, &pixels[px], 1, oneWeight);
598
                            MFloat selectedIndexError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &reconstructedPixel, &pixels[px], 1, oneWeight);
599

600
                            MFloat bestPixelError = zeroError;
601
                            MUInt15 index = ParallelMath::MakeUInt15(6);
602

603
                            ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(ParallelMath::Less(highTerminalError, bestPixelError)), ParallelMath::MakeUInt15(7));
604
                            bestPixelError = ParallelMath::Min(bestPixelError, highTerminalError);
605

606
                            ParallelMath::FloatCompFlag selectedIndexBetter = ParallelMath::Less(selectedIndexError, bestPixelError);
607

608
                            if (ParallelMath::AllSet(selectedIndexBetter))
609
                            {
610
                                if (refinePass != numRefineRounds - 1)
611
                                    refiner.ContributeUnweightedPW(&floatPixels[px], selectedIndex);
612
                            }
613
                            else
614
                            {
615
                                MFloat refineWeight = ParallelMath::Select(selectedIndexBetter, ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloatZero());
616

617
                                if (refinePass != numRefineRounds - 1)
618
                                    refiner.ContributePW(&floatPixels[px], selectedIndex, refineWeight);
619
                            }
620

621
                            ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(selectedIndexBetter), selectedIndex);
622
                            bestPixelError = ParallelMath::Min(bestPixelError, selectedIndexError);
623

624
                            error = error + bestPixelError;
625

626
                            indexes[px] = index;
627
                        }
628

629
                        ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
630
                        ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
631

632
                        if (ParallelMath::AnySet(errorBetter16))
633
                        {
634
                            bestError = ParallelMath::Min(error, bestError);
635
                            ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, zero);
636
                            for (int px = 0; px < 16; px++)
637
                                ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
638

639
                            for (int epi = 0; epi < 2; epi++)
640
                                ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
641
                        }
642

643
                        if (refinePass != numRefineRounds - 1)
644
                            refiner.GetRefinedEndpointsLDR(ep, &rtn);
645
                    }
646
                }
647
            }
648
        }
649
    }
650

651
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
652
    {
653
        int ep0 = ParallelMath::Extract(bestEP[0], block);
654
        int ep1 = ParallelMath::Extract(bestEP[1], block);
655
        int isFullRange = ParallelMath::Extract(bestIsFullRange, block);
656

657
        if (isSigned)
658
        {
659
            ep0 -= 127;
660
            ep1 -= 127;
661

662
            assert(ep0 >= -127 && ep0 <= 127);
663
            assert(ep1 >= -127 && ep1 <= 127);
664
        }
665

666

667
        bool swapEndpoints = (isFullRange != 0) != (ep0 > ep1);
668

669
        if (swapEndpoints)
670
            std::swap(ep0, ep1);
671

672
        uint16_t dumpBits = 0;
673
        int dumpBitsOffset = 0;
674
        int dumpByteOffset = 2;
675
        packedBlocks[0] = static_cast<uint8_t>(ep0 & 0xff);
676
        packedBlocks[1] = static_cast<uint8_t>(ep1 & 0xff);
677

678
        int maxValue = (isFullRange != 0) ? 7 : 5;
679

680
        for (int px = 0; px < 16; px++)
681
        {
682
            int index = ParallelMath::Extract(bestIndexes[px], block);
683

684
            if (swapEndpoints && index <= maxValue)
685
                index = maxValue - index;
686

687
            if (index != 0)
688
            {
689
                if (index == maxValue)
690
                    index = 1;
691
                else if (index < maxValue)
692
                    index++;
693
            }
694

695
            assert(index >= 0 && index < 8);
696

697
            dumpBits |= static_cast<uint16_t>(index << dumpBitsOffset);
698
            dumpBitsOffset += 3;
699

700
            if (dumpBitsOffset >= 8)
701
            {
702
                assert(dumpByteOffset < 8);
703
                packedBlocks[dumpByteOffset] = static_cast<uint8_t>(dumpBits & 0xff);
704
                dumpBits >>= 8;
705
                dumpBitsOffset -= 8;
706
                dumpByteOffset++;
707
            }
708
        }
709

710
        assert(dumpBitsOffset == 0);
711
        assert(dumpByteOffset == 8);
712

713
        packedBlocks += packedBlockStride;
714
    }
715
}
716

717
void cvtt::Internal::S3TCComputer::PackRGB(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[4], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds)
718
{
719
    ParallelMath::RoundTowardNearestForScope rtn;
720

721
    if (numRefineRounds < 1)
722
        numRefineRounds = 1;
723

724
    if (maxTweakRounds < 1)
725
        maxTweakRounds = 1;
726

727
    EndpointSelector<3, 8> endpointSelector;
728

729
    MUInt15 pixels[16][4];
730
    MFloat floatPixels[16][4];
731

732
    MFloat preWeightedPixels[16][4];
733

734
    for (int px = 0; px < 16; px++)
735
    {
736
        for (int ch = 0; ch < 4; ch++)
737
            ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
738
    }
739

740
    for (int px = 0; px < 16; px++)
741
    {
742
        for (int ch = 0; ch < 4; ch++)
743
            floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
744
    }
745

746
    if (alphaTest)
747
    {
748
        MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(floor(alphaThreshold * 255.0f + 0.5f)));
749

750
        for (int px = 0; px < 16; px++)
751
        {
752
            ParallelMath::Int16CompFlag belowThreshold = ParallelMath::Less(pixels[px][3], threshold);
753
            pixels[px][3] = ParallelMath::Select(belowThreshold, ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(255));
754
        }
755
    }
756

757
    BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);
758

759
    MUInt15 minAlpha = ParallelMath::MakeUInt15(255);
760

761
    for (int px = 0; px < 16; px++)
762
        minAlpha = ParallelMath::Min(minAlpha, pixels[px][3]);
763

764
    MFloat pixelWeights[16];
765
    for (int px = 0; px < 16; px++)
766
    {
767
        pixelWeights[px] = ParallelMath::MakeFloat(1.0f);
768
        if (alphaTest)
769
        {
770
            ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
771

772
            ParallelMath::ConditionalSet(pixelWeights[px], ParallelMath::Int16FlagToFloat(isTransparent), ParallelMath::MakeFloatZero());
773
        }
774
    }
775

776
    for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
777
    {
778
        for (int px = 0; px < 16; px++)
779
            endpointSelector.ContributePass(preWeightedPixels[px], pass, pixelWeights[px]);
780

781
        endpointSelector.FinishPass(pass);
782
    }
783

784
    UnfinishedEndpoints<3> ufep = endpointSelector.GetEndpoints(channelWeights);
785

786
    MUInt15 bestEndpoints[2][3];
787
    MUInt15 bestIndexes[16];
788
    MUInt15 bestRange = ParallelMath::MakeUInt15(0);
789
    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
790

791
    for (int px = 0; px < 16; px++)
792
        bestIndexes[px] = ParallelMath::MakeUInt15(0);
793

794
    for (int ep = 0; ep < 2; ep++)
795
        for (int ch = 0; ch < 3; ch++)
796
            bestEndpoints[ep][ch] = ParallelMath::MakeUInt15(0);
797

798
    if (exhaustive)
799
    {
800
        MSInt16 sortBins[16];
801

802
        {
803
            // Compute an 11-bit index, change it to signed, stuff it in the high bits of the sort bins,
804
            // and pack the original indexes into the low bits.
805

806
            MUInt15 sortEP[2][3];
807
            ufep.FinishLDR(0, 11, sortEP[0], sortEP[1]);
808

809
            IndexSelector<3> sortSelector;
810
            sortSelector.Init<false>(channelWeights, sortEP, 1 << 11);
811

812
            for (int16_t px = 0; px < 16; px++)
813
            {
814
                MSInt16 sortBin = ParallelMath::LosslessCast<MSInt16>::Cast(sortSelector.SelectIndexLDR(floatPixels[px], &rtn) << 4);
815

816
                if (alphaTest)
817
                {
818
                    ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
819

820
                    ParallelMath::ConditionalSet(sortBin, isTransparent, ParallelMath::MakeSInt16(-16)); // 0xfff0
821
                }
822

823
                sortBin = sortBin + ParallelMath::MakeSInt16(px);
824

825
                sortBins[px] = sortBin;
826
            }
827
        }
828

829
        // Sort bins
830
        for (int sortEnd = 1; sortEnd < 16; sortEnd++)
831
        {
832
            for (int sortLoc = sortEnd; sortLoc > 0; sortLoc--)
833
            {
834
                MSInt16 a = sortBins[sortLoc];
835
                MSInt16 b = sortBins[sortLoc - 1];
836

837
                sortBins[sortLoc] = ParallelMath::Max(a, b);
838
                sortBins[sortLoc - 1] = ParallelMath::Min(a, b);
839
            }
840
        }
841

842
        MUInt15 firstElement = ParallelMath::MakeUInt15(0);
843
        for (uint16_t e = 0; e < 16; e++)
844
        {
845
            ParallelMath::Int16CompFlag isInvalid = ParallelMath::Less(sortBins[e], ParallelMath::MakeSInt16(0));
846
            ParallelMath::ConditionalSet(firstElement, isInvalid, ParallelMath::MakeUInt15(e + 1));
847
            if (!ParallelMath::AnySet(isInvalid))
848
                break;
849
        }
850

851
        MUInt15 numElements = ParallelMath::MakeUInt15(16) - firstElement;
852

853
        MUInt15 sortedInputs[16][4];
854
        MFloat floatSortedInputs[16][4];
855
        MFloat pwFloatSortedInputs[16][4];
856

857
        for (int e = 0; e < 16; e++)
858
        {
859
            for (int ch = 0; ch < 4; ch++)
860
                sortedInputs[e][ch] = ParallelMath::MakeUInt15(0);
861
        }
862

863
        for (int block = 0; block < ParallelMath::ParallelSize; block++)
864
        {
865
            for (int e = ParallelMath::Extract(firstElement, block); e < 16; e++)
866
            {
867
                ParallelMath::ScalarUInt16 sortBin = ParallelMath::Extract(sortBins[e], block);
868
                int originalIndex = (sortBin & 15);
869

870
                for (int ch = 0; ch < 4; ch++)
871
                    ParallelMath::PutUInt15(sortedInputs[15 - e][ch], block, ParallelMath::Extract(pixels[originalIndex][ch], block));
872
            }
873
        }
874

875
        for (int e = 0; e < 16; e++)
876
        {
877
            for (int ch = 0; ch < 4; ch++)
878
            {
879
                MFloat f = ParallelMath::ToFloat(sortedInputs[e][ch]);
880
                floatSortedInputs[e][ch] = f;
881
                pwFloatSortedInputs[e][ch] = f * channelWeights[ch];
882
            }
883
        }
884

885
        for (int n0 = 0; n0 <= 15; n0++)
886
        {
887
            int remainingFor1 = 16 - n0;
888
            if (remainingFor1 == 16)
889
                remainingFor1 = 15;
890

891
            for (int n1 = 0; n1 <= remainingFor1; n1++)
892
            {
893
                int remainingFor2 = 16 - n1 - n0;
894
                if (remainingFor2 == 16)
895
                    remainingFor2 = 15;
896

897
                for (int n2 = 0; n2 <= remainingFor2; n2++)
898
                {
899
                    int n3 = 16 - n2 - n1 - n0;
900

901
                    if (n3 == 16)
902
                        continue;
903

904
                    int counts[4] = { n0, n1, n2, n3 };
905

906
                    TestCounts(flags, counts, 4, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
907
                }
908
            }
909
        }
910

911
        TestSingleColor(flags, pixels, floatPixels, 4, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
912

913
        if (alphaTest)
914
        {
915
            for (int n0 = 0; n0 <= 15; n0++)
916
            {
917
                int remainingFor1 = 16 - n0;
918
                if (remainingFor1 == 16)
919
                    remainingFor1 = 15;
920

921
                for (int n1 = 0; n1 <= remainingFor1; n1++)
922
                {
923
                    int n2 = 16 - n1 - n0;
924

925
                    if (n2 == 16)
926
                        continue;
927

928
                    int counts[3] = { n0, n1, n2 };
929

930
                    TestCounts(flags, counts, 3, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
931
                }
932
            }
933

934
            TestSingleColor(flags, pixels, floatPixels, 3, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
935
        }
936
    }
937
    else
938
    {
939
        int minRange = alphaTest ? 3 : 4;
940

941
        for (int range = minRange; range <= 4; range++)
942
        {
943
            int tweakRounds = BCCommon::TweakRoundsForRange(range);
944
            if (tweakRounds > maxTweakRounds)
945
                tweakRounds = maxTweakRounds;
946

947
            for (int tweak = 0; tweak < tweakRounds; tweak++)
948
            {
949
                MUInt15 endPoints[2][3];
950

951
                ufep.FinishLDR(tweak, range, endPoints[0], endPoints[1]);
952

953
                for (int refine = 0; refine < numRefineRounds; refine++)
954
                {
955
                    EndpointRefiner<3> refiner;
956
                    refiner.Init(range, channelWeights);
957

958
                    TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, range, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &refiner, &rtn);
959

960
                    if (refine != numRefineRounds - 1)
961
                        refiner.GetRefinedEndpointsLDR(endPoints, &rtn);
962
                }
963
            }
964
        }
965
    }
966

967
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
968
    {
969
        ParallelMath::ScalarUInt16 range = ParallelMath::Extract(bestRange, block);
970
        assert(range == 3 || range == 4);
971

972
        ParallelMath::ScalarUInt16 compressedEP[2];
973
        for (int ep = 0; ep < 2; ep++)
974
        {
975
            ParallelMath::ScalarUInt16 endPoint[3];
976
            for (int ch = 0; ch < 3; ch++)
977
                endPoint[ch] = ParallelMath::Extract(bestEndpoints[ep][ch], block);
978

979
            int compressed = (endPoint[0] & 0xf8) << 8;
980
            compressed |= (endPoint[1] & 0xfc) << 3;
981
            compressed |= (endPoint[2] & 0xf8) >> 3;
982

983
            compressedEP[ep] = static_cast<ParallelMath::ScalarUInt16>(compressed);
984
        }
985

986
        int indexOrder[4];
987

988
        if (range == 4)
989
        {
990
            if (compressedEP[0] == compressedEP[1])
991
            {
992
                indexOrder[0] = 0;
993
                indexOrder[1] = 0;
994
                indexOrder[2] = 0;
995
                indexOrder[3] = 0;
996
            }
997
            else if (compressedEP[0] < compressedEP[1])
998
            {
999
                std::swap(compressedEP[0], compressedEP[1]);
1000
                indexOrder[0] = 1;
1001
                indexOrder[1] = 3;
1002
                indexOrder[2] = 2;
1003
                indexOrder[3] = 0;
1004
            }
1005
            else
1006
            {
1007
                indexOrder[0] = 0;
1008
                indexOrder[1] = 2;
1009
                indexOrder[2] = 3;
1010
                indexOrder[3] = 1;
1011
            }
1012
        }
1013
        else
1014
        {
1015
            assert(range == 3);
1016

1017
            if (compressedEP[0] > compressedEP[1])
1018
            {
1019
                std::swap(compressedEP[0], compressedEP[1]);
1020
                indexOrder[0] = 1;
1021
                indexOrder[1] = 2;
1022
                indexOrder[2] = 0;
1023
            }
1024
            else
1025
            {
1026
                indexOrder[0] = 0;
1027
                indexOrder[1] = 2;
1028
                indexOrder[2] = 1;
1029
            }
1030
            indexOrder[3] = 3;
1031
        }
1032

1033
        packedBlocks[0] = static_cast<uint8_t>(compressedEP[0] & 0xff);
1034
        packedBlocks[1] = static_cast<uint8_t>((compressedEP[0] >> 8) & 0xff);
1035
        packedBlocks[2] = static_cast<uint8_t>(compressedEP[1] & 0xff);
1036
        packedBlocks[3] = static_cast<uint8_t>((compressedEP[1] >> 8) & 0xff);
1037

1038
        for (int i = 0; i < 16; i += 4)
1039
        {
1040
            int packedIndexes = 0;
1041
            for (int subi = 0; subi < 4; subi++)
1042
            {
1043
                ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[i + subi], block);
1044
                packedIndexes |= (indexOrder[index] << (subi * 2));
1045
            }
1046

1047
            packedBlocks[4 + i / 4] = static_cast<uint8_t>(packedIndexes);
1048
        }
1049

1050
        packedBlocks += packedBlockStride;
1051
    }
1052
}
1053

1054
#endif
1055

1056
Product

Resources

Company