Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/cvtt/ConvectionKernels_S3TC.cpp
9903 views
/*
Convection Texture Tools
Copyright (c) 2018-2019 Eric Lasota

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject
to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

-------------------------------------------------------------------------------------

Portions based on DirectX Texture Library (DirectXTex)

Copyright (c) Microsoft Corporation. All rights reserved.
Licensed under the MIT License.

http://go.microsoft.com/fwlink/?LinkId=248926
*/
33
#include "ConvectionKernels_Config.h"
34
35
#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
36
37
#include "ConvectionKernels_S3TC.h"
38
39
#include "ConvectionKernels_AggregatedError.h"
40
#include "ConvectionKernels_BCCommon.h"
41
#include "ConvectionKernels_EndpointRefiner.h"
42
#include "ConvectionKernels_EndpointSelector.h"
43
#include "ConvectionKernels_IndexSelector.h"
44
#include "ConvectionKernels_UnfinishedEndpoints.h"
45
#include "ConvectionKernels_S3TC_SingleColor.h"
46
47
// Resets an error accumulator: every lane of `error` is set to FLT_MAX.
void cvtt::Internal::S3TCComputer::Init(MFloat& error)
{
    const float initialError = FLT_MAX;
    error = ParallelMath::MakeFloat(initialError);
}
51
52
// Quantizes `v` in place.
//
// Computes q = (v * 253 + 512) >> 10 and stores (q << 2) | (q >> 4) back into
// `v`, i.e. the top bits of q are replicated into the two low bits.
// NOTE(review): presumably `v` is an 8-bit channel value and q its 6-bit
// counterpart (255 maps to 63) - confirm against callers.
void cvtt::Internal::S3TCComputer::QuantizeTo6Bits(MUInt15& v)
{
    MUInt15 q = ParallelMath::LosslessCast<MUInt15>::Cast(
        ParallelMath::RightShift(
            ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(253)) + ParallelMath::MakeUInt16(512),
            10));

    MUInt15 upperBits = (q << 2);
    MUInt15 replicatedBits = ParallelMath::RightShift(q, 4);

    v = upperBits | replicatedBits;
}
57
58
// Quantizes `v` in place.
//
// Computes q = (v * 249 + 1024) >> 11 and stores (q << 3) | (q >> 2) back into
// `v`, i.e. the top bits of q are replicated into the three low bits.
// NOTE(review): presumably `v` is an 8-bit channel value and q its 5-bit
// counterpart (255 maps to 31) - confirm against callers.
void cvtt::Internal::S3TCComputer::QuantizeTo5Bits(MUInt15& v)
{
    MUInt15 q = ParallelMath::LosslessCast<MUInt15>::Cast(
        ParallelMath::RightShift(
            ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(249)) + ParallelMath::MakeUInt16(1024),
            11));

    MUInt15 upperBits = (q << 3);
    MUInt15 replicatedBits = ParallelMath::RightShift(q, 2);

    v = upperBits | replicatedBits;
}
63
64
// Quantizes a three-channel endpoint in place: channels 0 and 2 go through
// QuantizeTo5Bits, channel 1 through QuantizeTo6Bits.
void cvtt::Internal::S3TCComputer::QuantizeTo565(MUInt15 endPoint[3])
{
    for (int ch = 0; ch < 3; ch++)
    {
        if (ch == 1)
            QuantizeTo6Bits(endPoint[ch]);
        else
            QuantizeTo5Bits(endPoint[ch]);
    }
}
70
71
// Returns |span| * 0.03 per lane. The result is used by callers in this file
// as the additive `d` term of ParanoidDiff.
cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidFactorForSpan(const MSInt16& span)
{
    MFloat spanMagnitude = ParallelMath::Abs(ParallelMath::ToFloat(span));
    return spanMagnitude * 0.03f;
}
75
76
// Returns (|a - b| + d)^2 per lane, where the difference is taken in signed
// 16-bit arithmetic before conversion to float.
cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidDiff(const MUInt15& a, const MUInt15& b, const MFloat& d)
{
    MSInt16 signedDelta = ParallelMath::LosslessCast<MSInt16>::Cast(a) - ParallelMath::LosslessCast<MSInt16>::Cast(b);

    MFloat padded = ParallelMath::Abs(ParallelMath::ToFloat(signedDelta));
    padded = padded + d;

    return padded * padded;
}
82
83
void cvtt::Internal::S3TCComputer::TestSingleColor(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], int range, const float* channelWeights,
84
MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, const ParallelMath::RoundTowardNearestForScope *rtn)
85
{
86
float channelWeightsSq[3];
87
88
for (int ch = 0; ch < 3; ch++)
89
channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
90
91
MUInt15 totals[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
92
93
for (int px = 0; px < 16; px++)
94
{
95
for (int ch = 0; ch < 3; ch++)
96
totals[ch] = totals[ch] + pixels[px][ch];
97
}
98
99
MUInt15 average[3];
100
for (int ch = 0; ch < 3; ch++)
101
average[ch] = ParallelMath::RightShift(totals[ch] + ParallelMath::MakeUInt15(8), 4);
102
103
const Tables::S3TCSC::TableEntry* rbTable = NULL;
104
const Tables::S3TCSC::TableEntry* gTable = NULL;
105
if (flags & cvtt::Flags::S3TC_Paranoid)
106
{
107
if (range == 4)
108
{
109
rbTable = Tables::S3TCSC::g_singleColor5_3_p;
110
gTable = Tables::S3TCSC::g_singleColor6_3_p;
111
}
112
else
113
{
114
assert(range == 3);
115
rbTable = Tables::S3TCSC::g_singleColor5_2_p;
116
gTable = Tables::S3TCSC::g_singleColor6_2_p;
117
}
118
}
119
else
120
{
121
if (range == 4)
122
{
123
rbTable = Tables::S3TCSC::g_singleColor5_3;
124
gTable = Tables::S3TCSC::g_singleColor6_3;
125
}
126
else
127
{
128
assert(range == 3);
129
rbTable = Tables::S3TCSC::g_singleColor5_2;
130
gTable = Tables::S3TCSC::g_singleColor6_2;
131
}
132
}
133
134
MUInt15 interpolated[3];
135
MUInt15 eps[2][3];
136
MSInt16 spans[3];
137
for (int i = 0; i < ParallelMath::ParallelSize; i++)
138
{
139
for (int ch = 0; ch < 3; ch++)
140
{
141
uint16_t avg = ParallelMath::Extract(average[ch], i);
142
const Tables::S3TCSC::TableEntry& tableEntry = ((ch == 1) ? gTable[avg] : rbTable[avg]);
143
ParallelMath::PutUInt15(eps[0][ch], i, tableEntry.m_min);
144
ParallelMath::PutUInt15(eps[1][ch], i, tableEntry.m_max);
145
ParallelMath::PutUInt15(interpolated[ch], i, tableEntry.m_actualColor);
146
ParallelMath::PutSInt16(spans[ch], i, tableEntry.m_span);
147
}
148
}
149
150
MFloat error = ParallelMath::MakeFloatZero();
151
if (flags & cvtt::Flags::S3TC_Paranoid)
152
{
153
MFloat spanParanoidFactors[3];
154
for (int ch = 0; ch < 3; ch++)
155
spanParanoidFactors[ch] = ParanoidFactorForSpan(spans[ch]);
156
157
for (int px = 0; px < 16; px++)
158
{
159
for (int ch = 0; ch < 3; ch++)
160
error = error + ParanoidDiff(interpolated[ch], pixels[px][ch], spanParanoidFactors[ch]) * channelWeightsSq[ch];
161
}
162
}
163
else
164
{
165
for (int px = 0; px < 16; px++)
166
{
167
for (int ch = 0; ch < 3; ch++)
168
error = error + ParallelMath::ToFloat(ParallelMath::SqDiffUInt8(interpolated[ch], pixels[px][ch])) * channelWeightsSq[ch];
169
}
170
}
171
172
ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
173
ParallelMath::Int16CompFlag better16 = ParallelMath::FloatFlagToInt16(better);
174
175
if (ParallelMath::AnySet(better16))
176
{
177
bestError = ParallelMath::Min(bestError, error);
178
for (int epi = 0; epi < 2; epi++)
179
for (int ch = 0; ch < 3; ch++)
180
ParallelMath::ConditionalSet(bestEndpoints[epi][ch], better16, eps[epi][ch]);
181
182
MUInt15 vindexes = ParallelMath::MakeUInt15(1);
183
for (int px = 0; px < 16; px++)
184
ParallelMath::ConditionalSet(bestIndexes[px], better16, vindexes);
185
186
ParallelMath::ConditionalSet(bestRange, better16, ParallelMath::MakeUInt15(range));
187
}
188
}
189
190
void cvtt::Internal::S3TCComputer::TestEndpoints(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], const MUInt15 unquantizedEndPoints[2][3], int range, const float* channelWeights,
191
MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, EndpointRefiner<3> *refiner, const ParallelMath::RoundTowardNearestForScope *rtn)
192
{
193
float channelWeightsSq[3];
194
195
for (int ch = 0; ch < 3; ch++)
196
channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
197
198
MUInt15 endPoints[2][3];
199
200
for (int ep = 0; ep < 2; ep++)
201
for (int ch = 0; ch < 3; ch++)
202
endPoints[ep][ch] = unquantizedEndPoints[ep][ch];
203
204
QuantizeTo565(endPoints[0]);
205
QuantizeTo565(endPoints[1]);
206
207
IndexSelector<3> selector;
208
selector.Init<false>(channelWeights, endPoints, range);
209
210
MUInt15 indexes[16];
211
212
MFloat paranoidFactors[3];
213
for (int ch = 0; ch < 3; ch++)
214
paranoidFactors[ch] = ParanoidFactorForSpan(ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[0][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[1][ch]));
215
216
MFloat error = ParallelMath::MakeFloatZero();
217
AggregatedError<3> aggError;
218
for (int px = 0; px < 16; px++)
219
{
220
MUInt15 index = selector.SelectIndexLDR(floatPixels[px], rtn);
221
indexes[px] = index;
222
223
if (refiner)
224
refiner->ContributeUnweightedPW(preWeightedPixels[px], index);
225
226
MUInt15 reconstructed[3];
227
selector.ReconstructLDRPrecise(index, reconstructed);
228
229
if (flags & Flags::S3TC_Paranoid)
230
{
231
for (int ch = 0; ch < 3; ch++)
232
error = error + ParanoidDiff(reconstructed[ch], pixels[px][ch], paranoidFactors[ch]) * channelWeightsSq[ch];
233
}
234
else
235
BCCommon::ComputeErrorLDR<3>(flags, reconstructed, pixels[px], aggError);
236
}
237
238
if (!(flags & Flags::S3TC_Paranoid))
239
error = aggError.Finalize(flags, channelWeightsSq);
240
241
ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
242
243
if (ParallelMath::AnySet(better))
244
{
245
ParallelMath::Int16CompFlag betterInt16 = ParallelMath::FloatFlagToInt16(better);
246
247
ParallelMath::ConditionalSet(bestError, better, error);
248
249
for (int ep = 0; ep < 2; ep++)
250
for (int ch = 0; ch < 3; ch++)
251
ParallelMath::ConditionalSet(bestEndpoints[ep][ch], betterInt16, endPoints[ep][ch]);
252
253
for (int px = 0; px < 16; px++)
254
ParallelMath::ConditionalSet(bestIndexes[px], betterInt16, indexes[px]);
255
256
ParallelMath::ConditionalSet(bestRange, betterInt16, ParallelMath::MakeUInt15(static_cast<uint16_t>(range)));
257
}
258
}
259
260
void cvtt::Internal::S3TCComputer::TestCounts(uint32_t flags, const int *counts, int nCounts, const MUInt15 &numElements, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], bool alphaTest,
261
const MFloat floatSortedInputs[16][4], const MFloat preWeightedFloatSortedInputs[16][4], const float *channelWeights, MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange,
262
const ParallelMath::RoundTowardNearestForScope* rtn)
263
{
264
UNREFERENCED_PARAMETER(alphaTest);
265
UNREFERENCED_PARAMETER(flags);
266
267
EndpointRefiner<3> refiner;
268
269
refiner.Init(nCounts, channelWeights);
270
271
bool escape = false;
272
int e = 0;
273
for (int i = 0; i < nCounts; i++)
274
{
275
for (int n = 0; n < counts[i]; n++)
276
{
277
ParallelMath::Int16CompFlag valid = ParallelMath::Less(ParallelMath::MakeUInt15(static_cast<uint16_t>(n)), numElements);
278
if (!ParallelMath::AnySet(valid))
279
{
280
escape = true;
281
break;
282
}
283
284
if (ParallelMath::AllSet(valid))
285
refiner.ContributeUnweightedPW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)));
286
else
287
{
288
MFloat weight = ParallelMath::Select(ParallelMath::Int16FlagToFloat(valid), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(0.0f));
289
refiner.ContributePW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), weight);
290
}
291
}
292
293
if (escape)
294
break;
295
}
296
297
MUInt15 endPoints[2][3];
298
refiner.GetRefinedEndpointsLDR(endPoints, rtn);
299
300
TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, nCounts, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, NULL, rtn);
301
}
302
303
void cvtt::Internal::S3TCComputer::PackExplicitAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride)
304
{
305
UNREFERENCED_PARAMETER(flags);
306
ParallelMath::RoundTowardNearestForScope rtn;
307
308
float weights[1] = { 1.0f };
309
310
MUInt15 pixels[16];
311
MFloat floatPixels[16];
312
313
for (int px = 0; px < 16; px++)
314
{
315
ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
316
floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
317
}
318
319
MUInt15 ep[2][1] = { { ParallelMath::MakeUInt15(0) },{ ParallelMath::MakeUInt15(255) } };
320
321
IndexSelector<1> selector;
322
selector.Init<false>(weights, ep, 16);
323
324
MUInt15 indexes[16];
325
326
for (int px = 0; px < 16; px++)
327
indexes[px] = selector.SelectIndexLDR(&floatPixels[px], &rtn);
328
329
for (int block = 0; block < ParallelMath::ParallelSize; block++)
330
{
331
for (int px = 0; px < 16; px += 2)
332
{
333
int index0 = ParallelMath::Extract(indexes[px], block);
334
int index1 = ParallelMath::Extract(indexes[px + 1], block);
335
336
packedBlocks[px / 2] = static_cast<uint8_t>(index0 | (index1 << 4));
337
}
338
339
packedBlocks += packedBlockStride;
340
}
341
}
342
343
void cvtt::Internal::S3TCComputer::PackInterpolatedAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds)
344
{
345
if (maxTweakRounds < 1)
346
maxTweakRounds = 1;
347
348
if (numRefineRounds < 1)
349
numRefineRounds = 1;
350
351
ParallelMath::RoundTowardNearestForScope rtn;
352
353
float oneWeight[1] = { 1.0f };
354
355
MUInt15 pixels[16];
356
MFloat floatPixels[16];
357
358
MUInt15 highTerminal = isSigned ? ParallelMath::MakeUInt15(254) : ParallelMath::MakeUInt15(255);
359
MUInt15 highTerminalMinusOne = highTerminal - ParallelMath::MakeUInt15(1);
360
361
for (int px = 0; px < 16; px++)
362
{
363
ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
364
365
if (isSigned)
366
pixels[px] = ParallelMath::Min(pixels[px], highTerminal);
367
368
floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
369
}
370
371
MUInt15 sortedPixels[16];
372
for (int px = 0; px < 16; px++)
373
sortedPixels[px] = pixels[px];
374
375
for (int sortEnd = 15; sortEnd > 0; sortEnd--)
376
{
377
for (int sortOffset = 0; sortOffset < sortEnd; sortOffset++)
378
{
379
MUInt15 a = sortedPixels[sortOffset];
380
MUInt15 b = sortedPixels[sortOffset + 1];
381
382
sortedPixels[sortOffset] = ParallelMath::Min(a, b);
383
sortedPixels[sortOffset + 1] = ParallelMath::Max(a, b);
384
}
385
}
386
387
MUInt15 zero = ParallelMath::MakeUInt15(0);
388
MUInt15 one = ParallelMath::MakeUInt15(1);
389
390
MUInt15 bestIsFullRange = zero;
391
MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
392
MUInt15 bestEP[2] = { zero, zero };
393
MUInt15 bestIndexes[16] = {
394
zero, zero, zero, zero,
395
zero, zero, zero, zero,
396
zero, zero, zero, zero,
397
zero, zero, zero, zero
398
};
399
400
// Full-precision
401
{
402
MUInt15 minEP = sortedPixels[0];
403
MUInt15 maxEP = sortedPixels[15];
404
405
MFloat base[1] = { ParallelMath::ToFloat(minEP) };
406
MFloat offset[1] = { ParallelMath::ToFloat(maxEP - minEP) };
407
408
UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
409
410
int numTweakRounds = BCCommon::TweakRoundsForRange(8);
411
if (numTweakRounds > maxTweakRounds)
412
numTweakRounds = maxTweakRounds;
413
414
for (int tweak = 0; tweak < numTweakRounds; tweak++)
415
{
416
MUInt15 ep[2][1];
417
418
ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
419
420
for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
421
{
422
EndpointRefiner<1> refiner;
423
refiner.Init(8, oneWeight);
424
425
if (isSigned)
426
for (int epi = 0; epi < 2; epi++)
427
ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
428
429
IndexSelector<1> indexSelector;
430
indexSelector.Init<false>(oneWeight, ep, 8);
431
432
MUInt15 indexes[16];
433
434
AggregatedError<1> aggError;
435
for (int px = 0; px < 16; px++)
436
{
437
MUInt15 index = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
438
439
MUInt15 reconstructedPixel;
440
441
indexSelector.ReconstructLDRPrecise(index, &reconstructedPixel);
442
BCCommon::ComputeErrorLDR<1>(flags, &reconstructedPixel, &pixels[px], aggError);
443
444
if (refinePass != numRefineRounds - 1)
445
refiner.ContributeUnweightedPW(&floatPixels[px], index);
446
447
indexes[px] = index;
448
}
449
MFloat error = aggError.Finalize(flags | Flags::Uniform, oneWeight);
450
451
ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
452
ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
453
454
if (ParallelMath::AnySet(errorBetter16))
455
{
456
bestError = ParallelMath::Min(error, bestError);
457
ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, one);
458
for (int px = 0; px < 16; px++)
459
ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
460
461
for (int epi = 0; epi < 2; epi++)
462
ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
463
}
464
465
if (refinePass != numRefineRounds - 1)
466
refiner.GetRefinedEndpointsLDR(ep, &rtn);
467
}
468
}
469
}
470
471
// Reduced precision with special endpoints
472
{
473
MUInt15 bestHeuristicMin = sortedPixels[0];
474
MUInt15 bestHeuristicMax = sortedPixels[15];
475
476
ParallelMath::Int16CompFlag canTryClipping;
477
478
// In reduced precision, we want try putting endpoints at the reserved indexes at the ends.
479
// The heuristic we use is to assign indexes to the end as long as they aren't off by more than half of the index range.
480
// This will usually not find anything, but it's cheap to check.
481
482
{
483
MUInt15 largestPossibleRange = bestHeuristicMax - bestHeuristicMin; // Max: 255
484
MUInt15 lowestPossibleClearance = ParallelMath::Min(bestHeuristicMin, static_cast<MUInt15>(highTerminal - bestHeuristicMax));
485
486
MUInt15 lowestPossibleClearanceTimes10 = (lowestPossibleClearance << 2) + (lowestPossibleClearance << 4);
487
canTryClipping = ParallelMath::LessOrEqual(lowestPossibleClearanceTimes10, largestPossibleRange);
488
}
489
490
if (ParallelMath::AnySet(canTryClipping))
491
{
492
MUInt15 lowClearances[16];
493
MUInt15 highClearances[16];
494
MUInt15 bestSkipCount = ParallelMath::MakeUInt15(0);
495
496
lowClearances[0] = highClearances[0] = ParallelMath::MakeUInt15(0);
497
498
for (int px = 1; px < 16; px++)
499
{
500
lowClearances[px] = sortedPixels[px - 1];
501
highClearances[px] = highTerminal - sortedPixels[16 - px];
502
}
503
504
for (uint16_t firstIndex = 0; firstIndex < 16; firstIndex++)
505
{
506
uint16_t numSkippedLow = firstIndex;
507
508
MUInt15 lowClearance = lowClearances[firstIndex];
509
510
for (uint16_t lastIndex = firstIndex; lastIndex < 16; lastIndex++)
511
{
512
uint16_t numSkippedHigh = 15 - lastIndex;
513
uint16_t numSkipped = numSkippedLow + numSkippedHigh;
514
515
MUInt15 numSkippedV = ParallelMath::MakeUInt15(numSkipped);
516
517
ParallelMath::Int16CompFlag areMoreSkipped = ParallelMath::Less(bestSkipCount, numSkippedV);
518
519
if (!ParallelMath::AnySet(areMoreSkipped))
520
continue;
521
522
MUInt15 clearance = ParallelMath::Max(highClearances[numSkippedHigh], lowClearance);
523
MUInt15 clearanceTimes10 = (clearance << 2) + (clearance << 4);
524
525
MUInt15 range = sortedPixels[lastIndex] - sortedPixels[firstIndex];
526
527
ParallelMath::Int16CompFlag isBetter = (areMoreSkipped & ParallelMath::LessOrEqual(clearanceTimes10, range));
528
ParallelMath::ConditionalSet(bestHeuristicMin, isBetter, sortedPixels[firstIndex]);
529
ParallelMath::ConditionalSet(bestHeuristicMax, isBetter, sortedPixels[lastIndex]);
530
}
531
}
532
}
533
534
MUInt15 bestSimpleMin = one;
535
MUInt15 bestSimpleMax = highTerminalMinusOne;
536
537
for (int px = 0; px < 16; px++)
538
{
539
ParallelMath::ConditionalSet(bestSimpleMin, ParallelMath::Less(zero, sortedPixels[15 - px]), sortedPixels[15 - px]);
540
ParallelMath::ConditionalSet(bestSimpleMax, ParallelMath::Less(sortedPixels[px], highTerminal), sortedPixels[px]);
541
}
542
543
MUInt15 minEPs[2] = { bestSimpleMin, bestHeuristicMin };
544
MUInt15 maxEPs[2] = { bestSimpleMax, bestHeuristicMax };
545
546
int minEPRange = 2;
547
if (ParallelMath::AllSet(ParallelMath::Equal(minEPs[0], minEPs[1])))
548
minEPRange = 1;
549
550
int maxEPRange = 2;
551
if (ParallelMath::AllSet(ParallelMath::Equal(maxEPs[0], maxEPs[1])))
552
maxEPRange = 1;
553
554
for (int minEPIndex = 0; minEPIndex < minEPRange; minEPIndex++)
555
{
556
for (int maxEPIndex = 0; maxEPIndex < maxEPRange; maxEPIndex++)
557
{
558
MFloat base[1] = { ParallelMath::ToFloat(minEPs[minEPIndex]) };
559
MFloat offset[1] = { ParallelMath::ToFloat(maxEPs[maxEPIndex] - minEPs[minEPIndex]) };
560
561
UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
562
563
int numTweakRounds = BCCommon::TweakRoundsForRange(6);
564
if (numTweakRounds > maxTweakRounds)
565
numTweakRounds = maxTweakRounds;
566
567
for (int tweak = 0; tweak < numTweakRounds; tweak++)
568
{
569
MUInt15 ep[2][1];
570
571
ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
572
573
for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
574
{
575
EndpointRefiner<1> refiner;
576
refiner.Init(6, oneWeight);
577
578
if (isSigned)
579
for (int epi = 0; epi < 2; epi++)
580
ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
581
582
IndexSelector<1> indexSelector;
583
indexSelector.Init<false>(oneWeight, ep, 6);
584
585
MUInt15 indexes[16];
586
MFloat error = ParallelMath::MakeFloatZero();
587
588
for (int px = 0; px < 16; px++)
589
{
590
MUInt15 selectedIndex = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
591
592
MUInt15 reconstructedPixel;
593
594
indexSelector.ReconstructLDRPrecise(selectedIndex, &reconstructedPixel);
595
596
MFloat zeroError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &zero, &pixels[px], 1, oneWeight);
597
MFloat highTerminalError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &highTerminal, &pixels[px], 1, oneWeight);
598
MFloat selectedIndexError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &reconstructedPixel, &pixels[px], 1, oneWeight);
599
600
MFloat bestPixelError = zeroError;
601
MUInt15 index = ParallelMath::MakeUInt15(6);
602
603
ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(ParallelMath::Less(highTerminalError, bestPixelError)), ParallelMath::MakeUInt15(7));
604
bestPixelError = ParallelMath::Min(bestPixelError, highTerminalError);
605
606
ParallelMath::FloatCompFlag selectedIndexBetter = ParallelMath::Less(selectedIndexError, bestPixelError);
607
608
if (ParallelMath::AllSet(selectedIndexBetter))
609
{
610
if (refinePass != numRefineRounds - 1)
611
refiner.ContributeUnweightedPW(&floatPixels[px], selectedIndex);
612
}
613
else
614
{
615
MFloat refineWeight = ParallelMath::Select(selectedIndexBetter, ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloatZero());
616
617
if (refinePass != numRefineRounds - 1)
618
refiner.ContributePW(&floatPixels[px], selectedIndex, refineWeight);
619
}
620
621
ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(selectedIndexBetter), selectedIndex);
622
bestPixelError = ParallelMath::Min(bestPixelError, selectedIndexError);
623
624
error = error + bestPixelError;
625
626
indexes[px] = index;
627
}
628
629
ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
630
ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
631
632
if (ParallelMath::AnySet(errorBetter16))
633
{
634
bestError = ParallelMath::Min(error, bestError);
635
ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, zero);
636
for (int px = 0; px < 16; px++)
637
ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
638
639
for (int epi = 0; epi < 2; epi++)
640
ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
641
}
642
643
if (refinePass != numRefineRounds - 1)
644
refiner.GetRefinedEndpointsLDR(ep, &rtn);
645
}
646
}
647
}
648
}
649
}
650
651
for (int block = 0; block < ParallelMath::ParallelSize; block++)
652
{
653
int ep0 = ParallelMath::Extract(bestEP[0], block);
654
int ep1 = ParallelMath::Extract(bestEP[1], block);
655
int isFullRange = ParallelMath::Extract(bestIsFullRange, block);
656
657
if (isSigned)
658
{
659
ep0 -= 127;
660
ep1 -= 127;
661
662
assert(ep0 >= -127 && ep0 <= 127);
663
assert(ep1 >= -127 && ep1 <= 127);
664
}
665
666
667
bool swapEndpoints = (isFullRange != 0) != (ep0 > ep1);
668
669
if (swapEndpoints)
670
std::swap(ep0, ep1);
671
672
uint16_t dumpBits = 0;
673
int dumpBitsOffset = 0;
674
int dumpByteOffset = 2;
675
packedBlocks[0] = static_cast<uint8_t>(ep0 & 0xff);
676
packedBlocks[1] = static_cast<uint8_t>(ep1 & 0xff);
677
678
int maxValue = (isFullRange != 0) ? 7 : 5;
679
680
for (int px = 0; px < 16; px++)
681
{
682
int index = ParallelMath::Extract(bestIndexes[px], block);
683
684
if (swapEndpoints && index <= maxValue)
685
index = maxValue - index;
686
687
if (index != 0)
688
{
689
if (index == maxValue)
690
index = 1;
691
else if (index < maxValue)
692
index++;
693
}
694
695
assert(index >= 0 && index < 8);
696
697
dumpBits |= static_cast<uint16_t>(index << dumpBitsOffset);
698
dumpBitsOffset += 3;
699
700
if (dumpBitsOffset >= 8)
701
{
702
assert(dumpByteOffset < 8);
703
packedBlocks[dumpByteOffset] = static_cast<uint8_t>(dumpBits & 0xff);
704
dumpBits >>= 8;
705
dumpBitsOffset -= 8;
706
dumpByteOffset++;
707
}
708
}
709
710
assert(dumpBitsOffset == 0);
711
assert(dumpByteOffset == 8);
712
713
packedBlocks += packedBlockStride;
714
}
715
}
716
717
void cvtt::Internal::S3TCComputer::PackRGB(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[4], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds)
718
{
719
ParallelMath::RoundTowardNearestForScope rtn;
720
721
if (numRefineRounds < 1)
722
numRefineRounds = 1;
723
724
if (maxTweakRounds < 1)
725
maxTweakRounds = 1;
726
727
EndpointSelector<3, 8> endpointSelector;
728
729
MUInt15 pixels[16][4];
730
MFloat floatPixels[16][4];
731
732
MFloat preWeightedPixels[16][4];
733
734
for (int px = 0; px < 16; px++)
735
{
736
for (int ch = 0; ch < 4; ch++)
737
ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
738
}
739
740
for (int px = 0; px < 16; px++)
741
{
742
for (int ch = 0; ch < 4; ch++)
743
floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
744
}
745
746
if (alphaTest)
747
{
748
MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(floor(alphaThreshold * 255.0f + 0.5f)));
749
750
for (int px = 0; px < 16; px++)
751
{
752
ParallelMath::Int16CompFlag belowThreshold = ParallelMath::Less(pixels[px][3], threshold);
753
pixels[px][3] = ParallelMath::Select(belowThreshold, ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(255));
754
}
755
}
756
757
BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);
758
759
MUInt15 minAlpha = ParallelMath::MakeUInt15(255);
760
761
for (int px = 0; px < 16; px++)
762
minAlpha = ParallelMath::Min(minAlpha, pixels[px][3]);
763
764
MFloat pixelWeights[16];
765
for (int px = 0; px < 16; px++)
766
{
767
pixelWeights[px] = ParallelMath::MakeFloat(1.0f);
768
if (alphaTest)
769
{
770
ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
771
772
ParallelMath::ConditionalSet(pixelWeights[px], ParallelMath::Int16FlagToFloat(isTransparent), ParallelMath::MakeFloatZero());
773
}
774
}
775
776
for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
777
{
778
for (int px = 0; px < 16; px++)
779
endpointSelector.ContributePass(preWeightedPixels[px], pass, pixelWeights[px]);
780
781
endpointSelector.FinishPass(pass);
782
}
783
784
UnfinishedEndpoints<3> ufep = endpointSelector.GetEndpoints(channelWeights);
785
786
MUInt15 bestEndpoints[2][3];
787
MUInt15 bestIndexes[16];
788
MUInt15 bestRange = ParallelMath::MakeUInt15(0);
789
MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
790
791
for (int px = 0; px < 16; px++)
792
bestIndexes[px] = ParallelMath::MakeUInt15(0);
793
794
for (int ep = 0; ep < 2; ep++)
795
for (int ch = 0; ch < 3; ch++)
796
bestEndpoints[ep][ch] = ParallelMath::MakeUInt15(0);
797
798
if (exhaustive)
799
{
800
MSInt16 sortBins[16];
801
802
{
803
// Compute an 11-bit index, change it to signed, stuff it in the high bits of the sort bins,
804
// and pack the original indexes into the low bits.
805
806
MUInt15 sortEP[2][3];
807
ufep.FinishLDR(0, 11, sortEP[0], sortEP[1]);
808
809
IndexSelector<3> sortSelector;
810
sortSelector.Init<false>(channelWeights, sortEP, 1 << 11);
811
812
for (int16_t px = 0; px < 16; px++)
813
{
814
MSInt16 sortBin = ParallelMath::LosslessCast<MSInt16>::Cast(sortSelector.SelectIndexLDR(floatPixels[px], &rtn) << 4);
815
816
if (alphaTest)
817
{
818
ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
819
820
ParallelMath::ConditionalSet(sortBin, isTransparent, ParallelMath::MakeSInt16(-16)); // 0xfff0
821
}
822
823
sortBin = sortBin + ParallelMath::MakeSInt16(px);
824
825
sortBins[px] = sortBin;
826
}
827
}
828
829
// Sort bins
830
for (int sortEnd = 1; sortEnd < 16; sortEnd++)
831
{
832
for (int sortLoc = sortEnd; sortLoc > 0; sortLoc--)
833
{
834
MSInt16 a = sortBins[sortLoc];
835
MSInt16 b = sortBins[sortLoc - 1];
836
837
sortBins[sortLoc] = ParallelMath::Max(a, b);
838
sortBins[sortLoc - 1] = ParallelMath::Min(a, b);
839
}
840
}
841
842
MUInt15 firstElement = ParallelMath::MakeUInt15(0);
843
for (uint16_t e = 0; e < 16; e++)
844
{
845
ParallelMath::Int16CompFlag isInvalid = ParallelMath::Less(sortBins[e], ParallelMath::MakeSInt16(0));
846
ParallelMath::ConditionalSet(firstElement, isInvalid, ParallelMath::MakeUInt15(e + 1));
847
if (!ParallelMath::AnySet(isInvalid))
848
break;
849
}
850
851
MUInt15 numElements = ParallelMath::MakeUInt15(16) - firstElement;
852
853
MUInt15 sortedInputs[16][4];
854
MFloat floatSortedInputs[16][4];
855
MFloat pwFloatSortedInputs[16][4];
856
857
for (int e = 0; e < 16; e++)
858
{
859
for (int ch = 0; ch < 4; ch++)
860
sortedInputs[e][ch] = ParallelMath::MakeUInt15(0);
861
}
862
863
for (int block = 0; block < ParallelMath::ParallelSize; block++)
864
{
865
for (int e = ParallelMath::Extract(firstElement, block); e < 16; e++)
866
{
867
ParallelMath::ScalarUInt16 sortBin = ParallelMath::Extract(sortBins[e], block);
868
int originalIndex = (sortBin & 15);
869
870
for (int ch = 0; ch < 4; ch++)
871
ParallelMath::PutUInt15(sortedInputs[15 - e][ch], block, ParallelMath::Extract(pixels[originalIndex][ch], block));
872
}
873
}
874
875
for (int e = 0; e < 16; e++)
876
{
877
for (int ch = 0; ch < 4; ch++)
878
{
879
MFloat f = ParallelMath::ToFloat(sortedInputs[e][ch]);
880
floatSortedInputs[e][ch] = f;
881
pwFloatSortedInputs[e][ch] = f * channelWeights[ch];
882
}
883
}
884
885
for (int n0 = 0; n0 <= 15; n0++)
886
{
887
int remainingFor1 = 16 - n0;
888
if (remainingFor1 == 16)
889
remainingFor1 = 15;
890
891
for (int n1 = 0; n1 <= remainingFor1; n1++)
892
{
893
int remainingFor2 = 16 - n1 - n0;
894
if (remainingFor2 == 16)
895
remainingFor2 = 15;
896
897
for (int n2 = 0; n2 <= remainingFor2; n2++)
898
{
899
int n3 = 16 - n2 - n1 - n0;
900
901
if (n3 == 16)
902
continue;
903
904
int counts[4] = { n0, n1, n2, n3 };
905
906
TestCounts(flags, counts, 4, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
907
}
908
}
909
}
910
911
TestSingleColor(flags, pixels, floatPixels, 4, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
912
913
if (alphaTest)
914
{
915
for (int n0 = 0; n0 <= 15; n0++)
916
{
917
int remainingFor1 = 16 - n0;
918
if (remainingFor1 == 16)
919
remainingFor1 = 15;
920
921
for (int n1 = 0; n1 <= remainingFor1; n1++)
922
{
923
int n2 = 16 - n1 - n0;
924
925
if (n2 == 16)
926
continue;
927
928
int counts[3] = { n0, n1, n2 };
929
930
TestCounts(flags, counts, 3, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
931
}
932
}
933
934
TestSingleColor(flags, pixels, floatPixels, 3, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
935
}
936
}
937
else
938
{
939
int minRange = alphaTest ? 3 : 4;
940
941
for (int range = minRange; range <= 4; range++)
942
{
943
int tweakRounds = BCCommon::TweakRoundsForRange(range);
944
if (tweakRounds > maxTweakRounds)
945
tweakRounds = maxTweakRounds;
946
947
for (int tweak = 0; tweak < tweakRounds; tweak++)
948
{
949
MUInt15 endPoints[2][3];
950
951
ufep.FinishLDR(tweak, range, endPoints[0], endPoints[1]);
952
953
for (int refine = 0; refine < numRefineRounds; refine++)
954
{
955
EndpointRefiner<3> refiner;
956
refiner.Init(range, channelWeights);
957
958
TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, range, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &refiner, &rtn);
959
960
if (refine != numRefineRounds - 1)
961
refiner.GetRefinedEndpointsLDR(endPoints, &rtn);
962
}
963
}
964
}
965
}
966
967
for (int block = 0; block < ParallelMath::ParallelSize; block++)
968
{
969
ParallelMath::ScalarUInt16 range = ParallelMath::Extract(bestRange, block);
970
assert(range == 3 || range == 4);
971
972
ParallelMath::ScalarUInt16 compressedEP[2];
973
for (int ep = 0; ep < 2; ep++)
974
{
975
ParallelMath::ScalarUInt16 endPoint[3];
976
for (int ch = 0; ch < 3; ch++)
977
endPoint[ch] = ParallelMath::Extract(bestEndpoints[ep][ch], block);
978
979
int compressed = (endPoint[0] & 0xf8) << 8;
980
compressed |= (endPoint[1] & 0xfc) << 3;
981
compressed |= (endPoint[2] & 0xf8) >> 3;
982
983
compressedEP[ep] = static_cast<ParallelMath::ScalarUInt16>(compressed);
984
}
985
986
int indexOrder[4];
987
988
if (range == 4)
989
{
990
if (compressedEP[0] == compressedEP[1])
991
{
992
indexOrder[0] = 0;
993
indexOrder[1] = 0;
994
indexOrder[2] = 0;
995
indexOrder[3] = 0;
996
}
997
else if (compressedEP[0] < compressedEP[1])
998
{
999
std::swap(compressedEP[0], compressedEP[1]);
1000
indexOrder[0] = 1;
1001
indexOrder[1] = 3;
1002
indexOrder[2] = 2;
1003
indexOrder[3] = 0;
1004
}
1005
else
1006
{
1007
indexOrder[0] = 0;
1008
indexOrder[1] = 2;
1009
indexOrder[2] = 3;
1010
indexOrder[3] = 1;
1011
}
1012
}
1013
else
1014
{
1015
assert(range == 3);
1016
1017
if (compressedEP[0] > compressedEP[1])
1018
{
1019
std::swap(compressedEP[0], compressedEP[1]);
1020
indexOrder[0] = 1;
1021
indexOrder[1] = 2;
1022
indexOrder[2] = 0;
1023
}
1024
else
1025
{
1026
indexOrder[0] = 0;
1027
indexOrder[1] = 2;
1028
indexOrder[2] = 1;
1029
}
1030
indexOrder[3] = 3;
1031
}
1032
1033
packedBlocks[0] = static_cast<uint8_t>(compressedEP[0] & 0xff);
1034
packedBlocks[1] = static_cast<uint8_t>((compressedEP[0] >> 8) & 0xff);
1035
packedBlocks[2] = static_cast<uint8_t>(compressedEP[1] & 0xff);
1036
packedBlocks[3] = static_cast<uint8_t>((compressedEP[1] >> 8) & 0xff);
1037
1038
for (int i = 0; i < 16; i += 4)
1039
{
1040
int packedIndexes = 0;
1041
for (int subi = 0; subi < 4; subi++)
1042
{
1043
ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[i + subi], block);
1044
packedIndexes |= (indexOrder[index] << (subi * 2));
1045
}
1046
1047
packedBlocks[4 + i / 4] = static_cast<uint8_t>(packedIndexes);
1048
}
1049
1050
packedBlocks += packedBlockStride;
1051
}
1052
}
1053
1054
#endif
1055
1056