Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/cvtt/ConvectionKernels_ETC.cpp
9902 views
1
/*
2
Convection Texture Tools
3
Copyright (c) 2018-2019 Eric Lasota
4
5
Permission is hereby granted, free of charge, to any person obtaining
6
a copy of this software and associated documentation files (the
7
"Software"), to deal in the Software without restriction, including
8
without limitation the rights to use, copy, modify, merge, publish,
9
distribute, sublicense, and/or sell copies of the Software, and to
10
permit persons to whom the Software is furnished to do so, subject
11
to the following conditions:
12
13
The above copyright notice and this permission notice shall be included
14
in all copies or substantial portions of the Software.
15
16
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24
-------------------------------------------------------------------------------------
25
26
Portions based on DirectX Texture Library (DirectXTex)
27
28
Copyright (c) Microsoft Corporation. All rights reserved.
29
Licensed under the MIT License.
30
31
http://go.microsoft.com/fwlink/?LinkId=248926
32
*/
33
#include "ConvectionKernels_Config.h"
34
35
#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
36
37
#include "ConvectionKernels.h"
38
#include "ConvectionKernels_ETC.h"
39
#include "ConvectionKernels_ETC1.h"
40
#include "ConvectionKernels_ETC2.h"
41
#include "ConvectionKernels_ETC2_Rounding.h"
42
#include "ConvectionKernels_ParallelMath.h"
43
#include "ConvectionKernels_FakeBT709_Rounding.h"
44
45
#include <cmath>
46
47
// Pixel-index lists for the two 8-pixel halves of a 16-pixel block, in two alternative splits.
// NOTE(review): presumably indexed as [flip][sector][pixel within sector] - confirm against the
// code that reads this table.
const int cvtt::Internal::ETCComputer::g_flipTables[2][2][8] =
{
    // Split 0: indexes { 0, 1, 4, 5, ... } versus { 2, 3, 6, 7, ... }
    { { 0, 1, 4, 5, 8, 9, 12, 13 }, { 2, 3, 6, 7, 10, 11, 14, 15 } },
    // Split 1: indexes 0..7 versus 8..15
    { { 0, 1, 2, 3, 4, 5, 6, 7 }, { 8, 9, 10, 11, 12, 13, 14, 15 } },
};
58
59
// Returns the sum over the three channels of the squared difference between pixelA and pixelB.
// The subtraction is done on signed 16-bit values so negative differences are representable.
cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorUniform(const MUInt15 pixelA[3], const MUInt15 pixelB[3])
{
    MFloat squaredDelta[3];
    for (int ch = 0; ch < 3; ch++)
    {
        MSInt16 signedDelta = ParallelMath::LosslessCast<MSInt16>::Cast(pixelA[ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixelB[ch]);
        MFloat floatDelta = ParallelMath::ToFloat(signedDelta);
        squaredDelta[ch] = floatDelta * floatDelta;
    }

    // Accumulate in channel order 0, 1, 2
    return squaredDelta[0] + squaredDelta[1] + squaredDelta[2];
}
72
73
// Returns the squared distance between a reconstructed color, scaled per channel by the
// red/green/blue weights in options, and a pixel that the caller has already weighted.
cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorWeighted(const MUInt15 reconstructed[3], const MFloat preWeightedPixel[3], const Options options)
{
    // Apply the channel weights to the reconstructed color
    MFloat weightedRed = ParallelMath::ToFloat(reconstructed[0]) * options.redWeight;
    MFloat weightedGreen = ParallelMath::ToFloat(reconstructed[1]) * options.greenWeight;
    MFloat weightedBlue = ParallelMath::ToFloat(reconstructed[2]) * options.blueWeight;

    MFloat deltaRed = weightedRed - preWeightedPixel[0];
    MFloat deltaGreen = weightedGreen - preWeightedPixel[1];
    MFloat deltaBlue = weightedBlue - preWeightedPixel[2];

    return deltaRed * deltaRed + deltaGreen * deltaGreen + deltaBlue * deltaBlue;
}
81
82
// Converts the reconstructed color with ConvertToFakeBT709 and returns its squared distance to
// preWeightedPixel, which the caller supplies already in that converted space.
cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorFakeBT709(const MUInt15 reconstructed[3], const MFloat preWeightedPixel[3])
{
    MFloat converted[3];
    ConvertToFakeBT709(converted, reconstructed);

    MFloat delta[3];
    for (int component = 0; component < 3; component++)
        delta[component] = converted[component] - preWeightedPixel[component];

    return delta[0] * delta[0] + delta[1] * delta[1] + delta[2] * delta[2];
}
93
94
// Evaluates one base color against the 8 pixels of a half block.
//
// quantizedPackedColor holds three 5-bit fields (channel ch at bit ch * 5). Each field is expanded
// to 8 bits (5-bit replication when isDifferential, otherwise 4-bit replication), the four
// modifiers are added with clamping to [0, 255], and every pixel picks the candidate with the
// lowest error. The error metric is chosen from options.flags: FakeBT709 first, then Uniform,
// otherwise the weighted metric.
//
// outError     - receives the sum of the per-pixel minimum errors
// outSelectors - receives the chosen candidate indexes, 2 bits per pixel (pixel px at bit px * 2)
void cvtt::Internal::ETCComputer::TestHalfBlock(MFloat &outError, MUInt16 &outSelectors, MUInt15 quantizedPackedColor, const MUInt15 pixels[8][3], const MFloat preWeightedPixels[8][3], const MSInt16 modifiers[4], bool isDifferential, const Options &options)
{
    const bool useFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
    const bool useUniform = ((options.flags & cvtt::Flags::Uniform) != 0);

    MUInt15 fieldMask = ParallelMath::MakeUInt15(31);
    MUInt15 channelCeiling = ParallelMath::MakeUInt15(255);
    MSInt16 channelFloor = ParallelMath::MakeSInt16(0);

    // Build the four candidate colors, one channel at a time
    MUInt15 candidates[4][3];
    for (int ch = 0; ch < 3; ch++)
    {
        MUInt15 field = (ParallelMath::RightShift(quantizedPackedColor, (ch * 5)) & fieldMask);

        MUInt15 expanded;
        if (isDifferential)
            expanded = (field << 3) | ParallelMath::RightShift(field, 2);
        else
            expanded = (field << 4) | field;

        for (unsigned int s = 0; s < 4; s++)
            candidates[s][ch] = ParallelMath::Min(ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::ToSInt16(expanded) + modifiers[s], channelFloor)), channelCeiling);
    }

    MUInt16 packedSelectors = ParallelMath::MakeUInt16(0);
    MFloat accumulatedError = ParallelMath::MakeFloatZero();

    for (int px = 0; px < 8; px++)
    {
        MFloat lowestError = ParallelMath::MakeFloat(FLT_MAX);
        MUInt16 chosenSelector = ParallelMath::MakeUInt16(0);

        for (unsigned int s = 0; s < 4; s++)
        {
            MFloat candidateError;
            if (useFakeBT709)
                candidateError = ComputeErrorFakeBT709(candidates[s], preWeightedPixels[px]);
            else if (useUniform)
                candidateError = ComputeErrorUniform(pixels[px], candidates[s]);
            else
                candidateError = ComputeErrorWeighted(candidates[s], preWeightedPixels[px], options);

            // Strict comparison: on ties the earlier selector is kept
            ParallelMath::FloatCompFlag improved = ParallelMath::Less(candidateError, lowestError);
            chosenSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(improved), ParallelMath::MakeUInt16(s), chosenSelector);
            lowestError = ParallelMath::Min(candidateError, lowestError);
        }

        accumulatedError = accumulatedError + lowestError;
        packedSelectors = packedSelectors | (chosenSelector << (px * 2));
    }

    outError = accumulatedError;
    outSelectors = packedSelectors;
}
150
151
// Punchthrough-alpha variant of TestHalfBlock: evaluates one base color against the 8 pixels of a
// half block when only three opaque colors (base - modifier, base, base + modifier) are available.
//
// quantizedPackedColor holds three 5-bit fields (channel ch at bit ch * 5), each expanded to 8 bits
// by 5-bit replication. Pixels flagged in isTransparent contribute zero error and always receive
// selector 1. The error metric is chosen from options.flags: FakeBT709 first, then Uniform,
// otherwise the weighted metric.
//
// outError     - receives the sum of the per-pixel errors
// outSelectors - receives the selectors, 2 bits per pixel (pixel px at bit px * 2)
void cvtt::Internal::ETCComputer::TestHalfBlockPunchthrough(MFloat &outError, MUInt16 &outSelectors, MUInt15 quantizedPackedColor, const MUInt15 pixels[8][3], const MFloat preWeightedPixels[8][3], const ParallelMath::Int16CompFlag isTransparent[8], const MUInt15 modifier, const Options &options)
{
    MUInt15 quantized[3];
    MUInt15 unquantized[3];

    for (int ch = 0; ch < 3; ch++)
    {
        quantized[ch] = (ParallelMath::RightShift(quantizedPackedColor, (ch * 5)) & ParallelMath::MakeUInt15(31));
        unquantized[ch] = (quantized[ch] << 3) | ParallelMath::RightShift(quantized[ch], 2);
    }

    MUInt16 selectors = ParallelMath::MakeUInt16(0);
    MFloat totalError = ParallelMath::MakeFloatZero();

    MUInt15 u15_255 = ParallelMath::MakeUInt15(255);

    MUInt15 unquantizedModified[3][3];
    for (int ch = 0; ch < 3; ch++)
    {
        // Max(x, m) - m is a subtraction that saturates at zero without leaving unsigned math
        unquantizedModified[0][ch] = ParallelMath::Max(unquantized[ch], modifier) - modifier;
        unquantizedModified[1][ch] = unquantized[ch];
        unquantizedModified[2][ch] = ParallelMath::Min(unquantized[ch] + modifier, u15_255);
    }

    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);

    for (int px = 0; px < 8; px++)
    {
        ParallelMath::FloatCompFlag isTransparentFloat = ParallelMath::Int16FlagToFloat(isTransparent[px]);

        MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
        MUInt15 bestSelector = ParallelMath::MakeUInt15(0);

        for (unsigned int s = 0; s < 3; s++)
        {
            MFloat error;
            if (isFakeBT709)
                error = ComputeErrorFakeBT709(unquantizedModified[s], preWeightedPixels[px]);
            else if (isUniform)
                error = ComputeErrorUniform(pixels[px], unquantizedModified[s]);
            else
                error = ComputeErrorWeighted(unquantizedModified[s], preWeightedPixels[px], options);

            ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
            bestSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(s), bestSelector);
            bestError = ParallelMath::Min(error, bestError);
        }

        // Annoying quirk: The ETC encoding machinery assumes that selectors are in the table order in the spec, which isn't
        // the same as their encoding bits, so the transparent index is actually 1 and the valid indexes are 0, 2, and 3.

        // Remap selector 1 to 2, and 2 to 3 (0 stays 0): doubling gives 0, 2, 4 and the Min clamps 4 to 3
        bestSelector = ParallelMath::Min(ParallelMath::MakeUInt15(3), bestSelector << 1);

        // Transparent pixels cost nothing and always use the transparent selector (1)
        ParallelMath::ConditionalSet(bestError, isTransparentFloat, ParallelMath::MakeFloatZero());
        ParallelMath::ConditionalSet(bestSelector, isTransparent[px], ParallelMath::MakeUInt15(1));

        totalError = totalError + bestError;
        selectors = selectors | (ParallelMath::LosslessCast<MUInt16>::Cast(bestSelector) << (px * 2));
    }

    outError = totalError;
    outSelectors = selectors;
}
218
219
// For every lane ("block"), finds the cheapest pair of per-sector candidates recorded in drs and
// commits it to the best* outputs if its summed error beats the lane's current bestTotalError.
//
// flip, d          - values stored into bestFlip / bestD for lanes that improve
// canIgnoreSector  - per-sector lane flags; when either is set for a lane, the flagged sector's
//                    color is replaced by the other sector's best color and the pair is accepted
//                    without the legality check
// bestIsThisMode   - lane flag set to true for every lane this call improves
// bestTotalError   - in/out: per-lane error to beat; lowered for improved lanes
// bestColors, bestSelectors, bestTables - per-sector outputs, written only for improved lanes
// drs              - candidate lists (diffNumAttempts, diffErrors, diffColors, diffSelectors,
//                    diffTables); drs.attemptSortIndexes is used as scratch and overwritten
void cvtt::Internal::ETCComputer::FindBestDifferentialCombination(int flip, int d, const ParallelMath::Int16CompFlag canIgnoreSector[2], ParallelMath::Int16CompFlag& bestIsThisMode, MFloat& bestTotalError, MUInt15& bestFlip, MUInt15& bestD, MUInt15 bestColors[2], MUInt16 bestSelectors[2], MUInt15 bestTables[2], DifferentialResolveStorage &drs)
{
    // We do this part scalar because most of the cost benefit of parallelization is in error evaluation,
    // and this code has a LOT of early-outs and disjointed index lookups that vary heavily between blocks
    // and save a lot of time.
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
    {
        bool canIgnore[2] = { ParallelMath::Extract(canIgnoreSector[0], block), ParallelMath::Extract(canIgnoreSector[1], block) };
        bool canIgnoreEither = canIgnore[0] || canIgnore[1];

        // Scalar mirror of this lane of bestTotalError; the slow path below keeps the two in sync
        float blockBestTotalError = ParallelMath::Extract(bestTotalError, block);

        // Find the lowest-error candidate of each sector independently
        float bestDiffErrors[2] = { FLT_MAX, FLT_MAX };
        uint16_t bestDiffSelectors[2] = { 0, 0 };
        uint16_t bestDiffColors[2] = { 0, 0 };
        uint16_t bestDiffTables[2] = { 0, 0 };
        for (int sector = 0; sector < 2; sector++)
        {
            unsigned int sectorNumAttempts = ParallelMath::Extract(drs.diffNumAttempts[sector], block);
            for (unsigned int i = 0; i < sectorNumAttempts; i++)
            {
                float error = ParallelMath::Extract(drs.diffErrors[sector][i], block);
                if (error < bestDiffErrors[sector])
                {
                    bestDiffErrors[sector] = error;
                    bestDiffSelectors[sector] = ParallelMath::Extract(drs.diffSelectors[sector][i], block);
                    bestDiffColors[sector] = ParallelMath::Extract(drs.diffColors[sector][i], block);
                    bestDiffTables[sector] = ParallelMath::Extract(drs.diffTables[sector][i], block);
                }
            }
        }

        if (canIgnore[0])
            bestDiffColors[0] = bestDiffColors[1];
        else if (canIgnore[1])
            bestDiffColors[1] = bestDiffColors[0];

        // The best differential possibilities must be better than the best total error
        if (!(bestDiffErrors[0] + bestDiffErrors[1] < blockBestTotalError))
            continue;

        // Fast path if the best possible case is legal
        if (canIgnoreEither || ETCDifferentialIsLegalScalar(bestDiffColors[0], bestDiffColors[1]))
        {
            ParallelMath::PutBoolInt16(bestIsThisMode, block, true);
            ParallelMath::PutFloat(bestTotalError, block, bestDiffErrors[0] + bestDiffErrors[1]);
            ParallelMath::PutUInt15(bestFlip, block, flip);
            ParallelMath::PutUInt15(bestD, block, d);
            for (int sector = 0; sector < 2; sector++)
            {
                ParallelMath::PutUInt15(bestColors[sector], block, bestDiffColors[sector]);
                ParallelMath::PutUInt16(bestSelectors[sector], block, bestDiffSelectors[sector]);
                ParallelMath::PutUInt15(bestTables[sector], block, bestDiffTables[sector]);
            }
            continue;
        }

        // Slow path: Sort the possible cases by quality, and search valid combinations
        // TODO: Pre-flatten the error lists so this is nicer to cache

        // Orders attempt indexes by ascending error for this lane, breaking ties by index
        struct SortPredicate
        {
            const MFloat *diffErrors;
            int block;

            bool operator()(uint16_t a, uint16_t b) const
            {
                float errorA = ParallelMath::Extract(diffErrors[a], block);
                float errorB = ParallelMath::Extract(diffErrors[b], block);

                if (errorA < errorB)
                    return true;
                if (errorA > errorB)
                    return false;

                return a < b;
            }
        };

        unsigned int numSortIndexes[2] = { 0, 0 };
        for (int sector = 0; sector < 2; sector++)
        {
            unsigned int sectorNumAttempts = ParallelMath::Extract(drs.diffNumAttempts[sector], block);

            // Only attempts that could beat the current best on their own are worth sorting
            for (unsigned int i = 0; i < sectorNumAttempts; i++)
            {
                if (ParallelMath::Extract(drs.diffErrors[sector][i], block) < blockBestTotalError)
                    drs.attemptSortIndexes[sector][numSortIndexes[sector]++] = i;
            }

            SortPredicate sp;
            sp.diffErrors = drs.diffErrors[sector];
            sp.block = block;

            std::sort<uint16_t*, const SortPredicate&>(drs.attemptSortIndexes[sector], drs.attemptSortIndexes[sector] + numSortIndexes[sector], sp);
        }

        for (unsigned int i = 0; i < numSortIndexes[0]; i++)
        {
            unsigned int attemptIndex0 = drs.attemptSortIndexes[0][i];
            float error0 = ParallelMath::Extract(drs.diffErrors[0][attemptIndex0], block);

            // Sorted ascending, so no later sector-0 attempt can help either
            if (error0 >= blockBestTotalError)
                break;

            // Error budget left for sector 1 if the pair is to beat the current best
            float maxError1 = blockBestTotalError - error0;
            uint16_t diffColor0 = ParallelMath::Extract(drs.diffColors[0][attemptIndex0], block);

            // Even the best sector-1 attempt does not fit in the budget
            if (maxError1 < bestDiffErrors[1])
                break;

            for (unsigned int j = 0; j < numSortIndexes[1]; j++)
            {
                unsigned int attemptIndex1 = drs.attemptSortIndexes[1][j];
                float error1 = ParallelMath::Extract(drs.diffErrors[1][attemptIndex1], block);

                if (error1 >= maxError1)
                    break;

                uint16_t diffColor1 = ParallelMath::Extract(drs.diffColors[1][attemptIndex1], block);

                if (ETCDifferentialIsLegalScalar(diffColor0, diffColor1))
                {
                    blockBestTotalError = error0 + error1;

                    ParallelMath::PutBoolInt16(bestIsThisMode, block, true);
                    ParallelMath::PutFloat(bestTotalError, block, blockBestTotalError);
                    ParallelMath::PutUInt15(bestFlip, block, flip);
                    ParallelMath::PutUInt15(bestD, block, d);
                    ParallelMath::PutUInt15(bestColors[0], block, diffColor0);
                    ParallelMath::PutUInt15(bestColors[1], block, diffColor1);
                    ParallelMath::PutUInt16(bestSelectors[0], block, ParallelMath::Extract(drs.diffSelectors[0][attemptIndex0], block));
                    ParallelMath::PutUInt16(bestSelectors[1], block, ParallelMath::Extract(drs.diffSelectors[1][attemptIndex1], block));
                    ParallelMath::PutUInt15(bestTables[0], block, ParallelMath::Extract(drs.diffTables[0][attemptIndex0], block));
                    ParallelMath::PutUInt15(bestTables[1], block, ParallelMath::Extract(drs.diffTables[1][attemptIndex1], block));

                    // Later sector-1 attempts have equal or higher error, so this is the best partner for attemptIndex0
                    break;
                }
            }
        }
    }
}
363
364
// Per-lane test that the signed difference (b - a) of one channel lies in [-4, 3].
cvtt::ParallelMath::Int16CompFlag cvtt::Internal::ETCComputer::ETCDifferentialIsLegalForChannel(const MUInt15 &a, const MUInt15 &b)
{
    MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(b) - ParallelMath::LosslessCast<MSInt16>::Cast(a);

    // Expressed with strict comparisons: -5 < delta and delta < 4
    ParallelMath::Int16CompFlag aboveLowerBound = ParallelMath::Less(ParallelMath::MakeSInt16(-5), delta);
    ParallelMath::Int16CompFlag belowUpperBound = ParallelMath::Less(delta, ParallelMath::MakeSInt16(4));

    return aboveLowerBound & belowUpperBound;
}
370
371
// Per-lane test that two packed colors (three 5-bit fields at bits 10, 5 and 0) differ by a legal
// amount in every field, as judged by ETCDifferentialIsLegalForChannel.
cvtt::ParallelMath::Int16CompFlag cvtt::Internal::ETCComputer::ETCDifferentialIsLegal(const MUInt15 &a, const MUInt15 &b)
{
    MUInt15 fieldMask = ParallelMath::MakeUInt15(31);

    // The top field is taken unmasked; the two lower fields are isolated with the 5-bit mask
    ParallelMath::Int16CompFlag highFieldLegal = ETCDifferentialIsLegalForChannel(ParallelMath::RightShift(a, 10), ParallelMath::RightShift(b, 10));
    ParallelMath::Int16CompFlag middleFieldLegal = ETCDifferentialIsLegalForChannel(ParallelMath::RightShift(a, 5) & fieldMask, ParallelMath::RightShift(b, 5) & fieldMask);
    ParallelMath::Int16CompFlag lowFieldLegal = ETCDifferentialIsLegalForChannel(a & fieldMask, b & fieldMask);

    return highFieldLegal & middleFieldLegal & lowFieldLegal;
}
379
380
// Scalar test that the signed difference (b - a) of one channel lies in [-4, 3].
bool cvtt::Internal::ETCComputer::ETCDifferentialIsLegalForChannelScalar(const uint16_t &a, const uint16_t &b)
{
    const int16_t from = static_cast<int16_t>(a);
    const int16_t to = static_cast<int16_t>(b);
    const int16_t delta = static_cast<int16_t>(to - from);

    if (delta < -4)
        return false;

    return delta <= 3;
}
386
387
bool cvtt::Internal::ETCComputer::ETCDifferentialIsLegalScalar(const uint16_t &a, const uint16_t &b)
388
{
389
MUInt15 mask = ParallelMath::MakeUInt15(31);
390
391
return ETCDifferentialIsLegalForChannelScalar((a >> 10), (b >> 10))
392
& ETCDifferentialIsLegalForChannelScalar((a >> 5) & 31, (b >> 5) & 31)
393
& ETCDifferentialIsLegalForChannelScalar(a & 31, b & 31);
394
}
395
396
// Tries to encode each lane's 16-pixel block with one "isolated" color and one "line" color that is
// used unmodified and with +/- a table modifier.
//
// outputBuffer      - lanes that improve get EmitTModeBlock called on outputBuffer + block * 8
// bestError         - in/out: per-lane error to beat; lowered for lanes this mode improves
// isIsolated        - per-pixel lane flags selecting the pixels averaged into the isolated color;
//                     the remaining pixels are averaged into the line color
// pixels            - the block's pixels, 3 channels each
// preWeightedPixels - the same pixels pre-transformed for the weighted / FakeBT709 error metrics
// options           - flags select the error metric: FakeBT709 first, then Uniform, else weighted
//
// Selector values stored per pixel (2 bits each, pixel px at bit px * 2):
//   0 = isolated color, 1 = line + modifier, 2 = line, 3 = line - modifier
void cvtt::Internal::ETCComputer::EncodeTMode(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag isIsolated[16], const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const Options &options)
{
    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);

    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);

    MUInt15 isolatedTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
    MUInt15 lineTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };

    MUInt15 numPixelsIsolated = ParallelMath::MakeUInt15(0);

    // To speed this up, we compute line total as the sum, then subtract out isolated
    for (unsigned int px = 0; px < 16; px++)
    {
        for (int ch = 0; ch < 3; ch++)
        {
            isolatedTotal[ch] = isolatedTotal[ch] + ParallelMath::SelectOrZero(isIsolated[px], pixels[px][ch]);
            lineTotal[ch] = lineTotal[ch] + pixels[px][ch];
        }
        numPixelsIsolated = numPixelsIsolated + ParallelMath::SelectOrZero(isIsolated[px], ParallelMath::MakeUInt15(1));
    }

    for (int ch = 0; ch < 3; ch++)
        lineTotal[ch] = lineTotal[ch] - isolatedTotal[ch];

    MUInt15 numPixelsLine = ParallelMath::MakeUInt15(16) - numPixelsIsolated;

    // Quantize the isolated average to 4 bits per channel
    MUInt15 isolatedAverageQuantized[3];
    MUInt15 isolatedAverageTargets[3];
    {
        int divisors[ParallelMath::ParallelSize];
        for (int block = 0; block < ParallelMath::ParallelSize; block++)
            divisors[block] = ParallelMath::Extract(numPixelsIsolated, block) * 34;

        // count * 17 for counts up to 16: the rounding term of the division below
        MUInt15 addend = (numPixelsIsolated << 4) | numPixelsIsolated;
        for (int ch = 0; ch < 3; ch++)
        {
            // isolatedAverageQuantized[ch] = (isolatedTotal[ch] * 2 + numPixelsIsolated * 17) / (numPixelsIsolated * 34);

            MUInt15 numerator = isolatedTotal[ch] + isolatedTotal[ch];
            if (!isFakeBT709)
                numerator = numerator + addend;

            // The division is done per lane in scalar code; lanes with no isolated pixels get 0
            for (int block = 0; block < ParallelMath::ParallelSize; block++)
            {
                int divisor = divisors[block];
                if (divisor == 0)
                    ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, 0);
                else
                    ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, ParallelMath::Extract(numerator, block) / divisor);
            }

            isolatedAverageTargets[ch] = numerator;
        }
    }

    if (isFakeBT709)
        ResolveTHFakeBT709Rounding(isolatedAverageQuantized, isolatedAverageTargets, numPixelsIsolated);

    // Expand the 4-bit isolated color to 8 bits by replication
    MUInt15 isolatedColor[3];
    for (int ch = 0; ch < 3; ch++)
        isolatedColor[ch] = (isolatedAverageQuantized[ch]) | (isolatedAverageQuantized[ch] << 4);

    // Error of every pixel against the isolated color; this seeds the per-pixel minimum below
    MFloat isolatedError[16];
    for (int px = 0; px < 16; px++)
    {
        if (isFakeBT709)
            isolatedError[px] = ComputeErrorFakeBT709(isolatedColor, preWeightedPixels[px]);
        else if (isUniform)
            isolatedError[px] = ComputeErrorUniform(pixels[px], isolatedColor);
        else
            isolatedError[px] = ComputeErrorWeighted(isolatedColor, preWeightedPixels[px], options);
    }

    MSInt32 bestSelectors = ParallelMath::MakeSInt32(0);
    MUInt15 bestTable = ParallelMath::MakeUInt15(0);
    MUInt15 bestLineColor = ParallelMath::MakeUInt15(0);

    MSInt16 maxLine = ParallelMath::LosslessCast<MSInt16>::Cast(numPixelsLine);
    MSInt16 minLine = ParallelMath::MakeSInt16(0) - maxLine;

    // Widest offset range needed by any lane; individual lanes clamp to their own range
    int16_t clusterMaxLine = 0;
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
    {
        int16_t blockMaxLine = ParallelMath::Extract(maxLine, block);
        if (blockMaxLine > clusterMaxLine)
            clusterMaxLine = blockMaxLine;
    }

    int16_t clusterMinLine = -clusterMaxLine;

    int lineDivisors[ParallelMath::ParallelSize];
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
        lineDivisors[block] = ParallelMath::Extract(numPixelsLine, block) * 34;

    MUInt15 lineAddend = (numPixelsLine << 4) | numPixelsLine;

    for (int table = 0; table < 8; table++)
    {
        int numUniqueColors[ParallelMath::ParallelSize];
        MUInt15 uniqueQuantizedColors[31];

        for (int block = 0; block < ParallelMath::ParallelSize; block++)
            numUniqueColors[block] = 0;

        MUInt15 modifier = ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]);
        MUInt15 modifierOffset = (modifier + modifier);

        // Enumerate candidate line colors by biasing the channel totals with multiples of the modifier
        for (int16_t offsetPremultiplier = clusterMinLine; offsetPremultiplier <= clusterMaxLine; offsetPremultiplier++)
        {
            MSInt16 clampedOffsetPremultiplier = ParallelMath::Max(minLine, ParallelMath::Min(maxLine, ParallelMath::MakeSInt16(offsetPremultiplier)));
            MSInt16 modifierAddend = ParallelMath::CompactMultiply(clampedOffsetPremultiplier, modifierOffset);

            MUInt15 quantized[3];
            MUInt15 targets[3];
            for (int ch = 0; ch < 3; ch++)
            {
                // quantized[ch] = min(15, max(0, lineTotal[ch] * 2 + rounding + modifierOffset * offsetPremultiplier) / (numPixelsLine * 34))
                // where rounding is numPixelsLine * 17, or 0 for FakeBT709 (which resolves rounding separately below)
                MUInt15 doubledTotal = lineTotal[ch] + lineTotal[ch];
                if (!isFakeBT709)
                    doubledTotal = doubledTotal + lineAddend;

                MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(doubledTotal) + modifierAddend));
                MUInt15 divided = ParallelMath::MakeUInt15(0);
                for (int block = 0; block < ParallelMath::ParallelSize; block++)
                {
                    int divisor = lineDivisors[block];
                    if (divisor == 0)
                        ParallelMath::PutUInt15(divided, block, 0);
                    else
                        ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
                }
                quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
                targets[ch] = numerator;
            }

            if (isFakeBT709)
                ResolveTHFakeBT709Rounding(quantized, targets, numPixelsLine);

            MUInt15 packedColor = quantized[0] | (quantized[1] << 5) | (quantized[2] << 10);

            // Append to the lane's list unless it repeats the previously appended color
            for (int block = 0; block < ParallelMath::ParallelSize; block++)
            {
                uint16_t blockPackedColor = ParallelMath::Extract(packedColor, block);
                if (numUniqueColors[block] == 0 || blockPackedColor != ParallelMath::Extract(uniqueQuantizedColors[numUniqueColors[block] - 1], block))
                    ParallelMath::PutUInt15(uniqueQuantizedColors[numUniqueColors[block]++], block, blockPackedColor);
            }
        }

        // Stripe unfilled unique colors
        int maxUniqueColors = 0;
        for (int block = 0; block < ParallelMath::ParallelSize; block++)
        {
            if (numUniqueColors[block] > maxUniqueColors)
                maxUniqueColors = numUniqueColors[block];
        }

        for (int block = 0; block < ParallelMath::ParallelSize; block++)
        {
            uint16_t fillColor = ParallelMath::Extract(uniqueQuantizedColors[0], block);

            // Start at the first unused slot (numUnique itself); every slot below maxUniqueColors is
            // read by the evaluation loop, so none may be left unwritten for this lane.
            int numUnique = numUniqueColors[block];
            for (int fill = numUnique; fill < maxUniqueColors; fill++)
                ParallelMath::PutUInt15(uniqueQuantizedColors[fill], block, fillColor);
        }

        for (int ci = 0; ci < maxUniqueColors; ci++)
        {
            // The three paint colors derived from this line color: +modifier, unmodified, -modifier
            MUInt15 lineColors[3][3];
            for (int ch = 0; ch < 3; ch++)
            {
                MUInt15 quantizedChannel = (ParallelMath::RightShift(uniqueQuantizedColors[ci], (ch * 5)) & ParallelMath::MakeUInt15(15));

                MUInt15 unquantizedColor = (quantizedChannel << 4) | quantizedChannel;
                lineColors[0][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantizedColor + modifier);
                lineColors[1][ch] = unquantizedColor;
                lineColors[2][ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));
            }

            MSInt32 selectors = ParallelMath::MakeSInt32(0);
            MFloat error = ParallelMath::MakeFloatZero();
            for (int px = 0; px < 16; px++)
            {
                // Selector 0 (the isolated color) is the starting point for every pixel
                MFloat pixelError = isolatedError[px];

                MUInt15 pixelBestSelector = ParallelMath::MakeUInt15(0);
                for (int i = 0; i < 3; i++)
                {
                    // Use the same metric as isolatedError above, since the two are compared directly
                    MFloat candidateError;
                    if (isFakeBT709)
                        candidateError = ComputeErrorFakeBT709(lineColors[i], preWeightedPixels[px]);
                    else if (isUniform)
                        candidateError = ComputeErrorUniform(lineColors[i], pixels[px]);
                    else
                        candidateError = ComputeErrorWeighted(lineColors[i], preWeightedPixels[px], options);

                    ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(candidateError, pixelError);
                    pixelError = ParallelMath::Min(candidateError, pixelError);
                    pixelBestSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(i + 1), pixelBestSelector);
                }

                error = error + pixelError;
                selectors = selectors | (ParallelMath::ToInt32(pixelBestSelector) << (px * 2));
            }

            ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));
            bestError = ParallelMath::Min(error, bestError);

            if (ParallelMath::AnySet(errorBetter))
            {
                ParallelMath::ConditionalSet(bestLineColor, errorBetter, uniqueQuantizedColors[ci]);
                ParallelMath::ConditionalSet(bestSelectors, errorBetter, selectors);
                ParallelMath::ConditionalSet(bestTable, errorBetter, ParallelMath::MakeUInt15(table));
                bestIsThisMode = bestIsThisMode | errorBetter;
            }
        }
    }

    // Emit only the lanes that this mode improved
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
    {
        if (ParallelMath::Extract(bestIsThisMode, block))
        {
            uint16_t blockBestLineColor = ParallelMath::Extract(bestLineColor, block);
            ParallelMath::ScalarUInt16 blockIsolatedAverageQuantized[3];

            for (int ch = 0; ch < 3; ch++)
                blockIsolatedAverageQuantized[ch] = ParallelMath::Extract(isolatedAverageQuantized[ch], block);

            uint16_t blockBestTable = ParallelMath::Extract(bestTable, block);
            int32_t blockBestSelectors = ParallelMath::Extract(bestSelectors, block);

            // Unpack the 4-bit channels from their 5-bit-spaced fields
            ParallelMath::ScalarUInt16 lineColor[3];
            for (int ch = 0; ch < 3; ch++)
                lineColor[ch] = (blockBestLineColor >> (ch * 5)) & 15;

            EmitTModeBlock(outputBuffer + block * 8, lineColor, blockIsolatedAverageQuantized, blockBestSelectors, blockBestTable, true);
        }
    }
}
648
649
void cvtt::Internal::ETCComputer::EncodeHMode(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag groupings[16], const MUInt15 pixels[16][3], HModeEval &he, const MFloat preWeightedPixels[16][3], const Options &options)
650
{
651
bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
652
bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
653
654
MUInt15 zero15 = ParallelMath::MakeUInt15(0);
655
656
MUInt15 counts[2] = { zero15, zero15 };
657
658
ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
659
660
MUInt15 totals[2][3] =
661
{
662
{ zero15, zero15, zero15 },
663
{ zero15, zero15, zero15 }
664
};
665
666
for (unsigned int px = 0; px < 16; px++)
667
{
668
for (int ch = 0; ch < 3; ch++)
669
{
670
totals[0][ch] = totals[0][ch] + pixels[px][ch];
671
totals[1][ch] = totals[1][ch] + ParallelMath::SelectOrZero(groupings[px], pixels[px][ch]);
672
}
673
counts[1] = counts[1] + ParallelMath::SelectOrZero(groupings[px], ParallelMath::MakeUInt15(1));
674
}
675
676
for (int ch = 0; ch < 3; ch++)
677
totals[0][ch] = totals[0][ch] - totals[1][ch];
678
counts[0] = ParallelMath::MakeUInt15(16) - counts[1];
679
680
MUInt16 bestSectorBits = ParallelMath::MakeUInt16(0);
681
MUInt16 bestSignBits = ParallelMath::MakeUInt16(0);
682
MUInt15 bestColors[2] = { zero15, zero15 };
683
MUInt15 bestTable = ParallelMath::MakeUInt15(0);
684
685
for (int table = 0; table < 8; table++)
686
{
687
MUInt15 numUniqueColors = zero15;
688
689
int modifier = cvtt::Tables::ETC1::g_thModifierTable[table];
690
691
for (int sector = 0; sector < 2; sector++)
692
{
693
for (int block = 0; block < ParallelMath::ParallelSize; block++)
694
{
695
int blockNumUniqueColors = 0;
696
uint16_t blockUniqueQuantizedColors[31];
697
698
int maxOffsetMultiplier = ParallelMath::Extract(counts[sector], block);
699
int minOffsetMultiplier = -maxOffsetMultiplier;
700
701
int modifierOffset = modifier * 2;
702
703
int blockSectorCounts = ParallelMath::Extract(counts[sector], block);
704
int blockSectorTotals[3];
705
for (int ch = 0; ch < 3; ch++)
706
blockSectorTotals[ch] = ParallelMath::Extract(totals[sector][ch], block);
707
708
for (int offsetPremultiplier = minOffsetMultiplier; offsetPremultiplier <= maxOffsetMultiplier; offsetPremultiplier++)
709
{
710
// TODO: This isn't ideal for FakeBT709
711
int16_t quantized[3];
712
for (int ch = 0; ch < 3; ch++)
713
{
714
if (blockSectorCounts == 0)
715
quantized[ch] = 0;
716
else
717
quantized[ch] = std::min<int16_t>(15, std::max<int16_t>(0, (blockSectorTotals[ch] * 2 + blockSectorCounts * 17 + modifierOffset * offsetPremultiplier)) / (blockSectorCounts * 34));
718
}
719
720
uint16_t packedColor = (quantized[0] << 10) | (quantized[1] << 5) | quantized[2];
721
if (blockNumUniqueColors == 0 || packedColor != blockUniqueQuantizedColors[blockNumUniqueColors - 1])
722
{
723
assert(blockNumUniqueColors < 32);
724
blockUniqueQuantizedColors[blockNumUniqueColors++] = packedColor;
725
}
726
}
727
728
ParallelMath::PutUInt15(he.numUniqueColors[sector], block, blockNumUniqueColors);
729
730
int baseIndex = 0;
731
if (sector == 1)
732
baseIndex = ParallelMath::Extract(he.numUniqueColors[0], block);
733
734
for (int i = 0; i < blockNumUniqueColors; i++)
735
ParallelMath::PutUInt15(he.uniqueQuantizedColors[baseIndex + i], block, blockUniqueQuantizedColors[i]);
736
}
737
}
738
739
MUInt15 totalColors = he.numUniqueColors[0] + he.numUniqueColors[1];
740
int maxErrorColors = 0;
741
for (int block = 0; block < ParallelMath::ParallelSize; block++)
742
maxErrorColors = std::max<int>(maxErrorColors, ParallelMath::Extract(totalColors, block));
743
744
for (int block = 0; block < ParallelMath::ParallelSize; block++)
745
{
746
int lastColor = ParallelMath::Extract(totalColors, block);
747
uint16_t stripeColor = ParallelMath::Extract(he.uniqueQuantizedColors[0], block);
748
for (int i = lastColor; i < maxErrorColors; i++)
749
ParallelMath::PutUInt15(he.uniqueQuantizedColors[i], block, stripeColor);
750
}
751
752
for (int ci = 0; ci < maxErrorColors; ci++)
753
{
754
MUInt15 fifteen = ParallelMath::MakeUInt15(15);
755
MUInt15 twoFiftyFive = ParallelMath::MakeUInt15(255);
756
MSInt16 zeroS16 = ParallelMath::MakeSInt16(0);
757
758
MUInt15 colors[2][3];
759
for (int ch = 0; ch < 3; ch++)
760
{
761
MUInt15 quantizedChannel = ParallelMath::RightShift(he.uniqueQuantizedColors[ci], ((2 - ch) * 5)) & fifteen;
762
763
MUInt15 unquantizedColor = (quantizedChannel << 4) | quantizedChannel;
764
colors[0][ch] = ParallelMath::Min(twoFiftyFive, unquantizedColor + modifier);
765
colors[1][ch] = ParallelMath::ToUInt15(ParallelMath::Max(zeroS16, ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::MakeSInt16(modifier)));
766
}
767
768
MUInt16 signBits = ParallelMath::MakeUInt16(0);
769
for (int px = 0; px < 16; px++)
770
{
771
MFloat errors[2];
772
for (int i = 0; i < 2; i++)
773
{
774
if (isFakeBT709)
775
errors[i] = ComputeErrorFakeBT709(colors[i], preWeightedPixels[px]);
776
else if (isUniform)
777
errors[i] = ComputeErrorUniform(colors[i], pixels[px]);
778
else
779
errors[i] = ComputeErrorWeighted(colors[i], preWeightedPixels[px], options);
780
}
781
782
ParallelMath::Int16CompFlag errorOneLess = ParallelMath::FloatFlagToInt16(ParallelMath::Less(errors[1], errors[0]));
783
he.errors[ci][px] = ParallelMath::Min(errors[0], errors[1]);
784
signBits = signBits | ParallelMath::SelectOrZero(errorOneLess, ParallelMath::MakeUInt16(1 << px));
785
}
786
he.signBits[ci] = signBits;
787
}
788
789
int maxUniqueColorCombos = 0;
790
for (int block = 0; block < ParallelMath::ParallelSize; block++)
791
{
792
int numUniqueColorCombos = ParallelMath::Extract(he.numUniqueColors[0], block) * ParallelMath::Extract(he.numUniqueColors[1], block);
793
if (numUniqueColorCombos > maxUniqueColorCombos)
794
maxUniqueColorCombos = numUniqueColorCombos;
795
}
796
797
MUInt15 indexes[2] = { zero15, zero15 };
798
MUInt15 maxIndex[2] = { he.numUniqueColors[0] - ParallelMath::MakeUInt15(1), he.numUniqueColors[1] - ParallelMath::MakeUInt15(1) };
799
800
int block1Starts[ParallelMath::ParallelSize];
801
for (int block = 0; block < ParallelMath::ParallelSize; block++)
802
block1Starts[block] = ParallelMath::Extract(he.numUniqueColors[0], block);
803
804
for (int combo = 0; combo < maxUniqueColorCombos; combo++)
805
{
806
MUInt15 index0 = indexes[0] + ParallelMath::MakeUInt15(1);
807
ParallelMath::Int16CompFlag index0Overflow = ParallelMath::Less(maxIndex[0], index0);
808
ParallelMath::ConditionalSet(index0, index0Overflow, ParallelMath::MakeUInt15(0));
809
810
MUInt15 index1 = ParallelMath::Min(maxIndex[1], indexes[1] + ParallelMath::SelectOrZero(index0Overflow, ParallelMath::MakeUInt15(1)));
811
indexes[0] = index0;
812
indexes[1] = index1;
813
814
int ci0[ParallelMath::ParallelSize];
815
int ci1[ParallelMath::ParallelSize];
816
MUInt15 color0;
817
MUInt15 color1;
818
819
for (int block = 0; block < ParallelMath::ParallelSize; block++)
820
{
821
ci0[block] = ParallelMath::Extract(index0, block);
822
ci1[block] = ParallelMath::Extract(index1, block) + block1Starts[block];
823
ParallelMath::PutUInt15(color0, block, ParallelMath::Extract(he.uniqueQuantizedColors[ci0[block]], block));
824
ParallelMath::PutUInt15(color1, block, ParallelMath::Extract(he.uniqueQuantizedColors[ci1[block]], block));
825
}
826
827
MFloat totalError = ParallelMath::MakeFloatZero();
828
MUInt16 sectorBits = ParallelMath::MakeUInt16(0);
829
MUInt16 signBits = ParallelMath::MakeUInt16(0);
830
for (int px = 0; px < 16; px++)
831
{
832
MFloat errorCI0;
833
MFloat errorCI1;
834
MUInt16 signBits0;
835
MUInt16 signBits1;
836
837
for (int block = 0; block < ParallelMath::ParallelSize; block++)
838
{
839
ParallelMath::PutFloat(errorCI0, block, ParallelMath::Extract(he.errors[ci0[block]][px], block));
840
ParallelMath::PutFloat(errorCI1, block, ParallelMath::Extract(he.errors[ci1[block]][px], block));
841
ParallelMath::PutUInt16(signBits0, block, ParallelMath::Extract(he.signBits[ci0[block]], block));
842
ParallelMath::PutUInt16(signBits1, block, ParallelMath::Extract(he.signBits[ci1[block]], block));
843
}
844
845
totalError = totalError + ParallelMath::Min(errorCI0, errorCI1);
846
847
MUInt16 bitPosition = ParallelMath::MakeUInt16(1 << px);
848
849
ParallelMath::Int16CompFlag error1Better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(errorCI1, errorCI0));
850
851
sectorBits = sectorBits | ParallelMath::SelectOrZero(error1Better, bitPosition);
852
signBits = signBits | (bitPosition & ParallelMath::Select(error1Better, signBits1, signBits0));
853
}
854
855
ParallelMath::FloatCompFlag totalErrorBetter = ParallelMath::Less(totalError, bestError);
856
ParallelMath::Int16CompFlag totalErrorBetter16 = ParallelMath::FloatFlagToInt16(totalErrorBetter);
857
if (ParallelMath::AnySet(totalErrorBetter16))
858
{
859
bestIsThisMode = bestIsThisMode | totalErrorBetter16;
860
ParallelMath::ConditionalSet(bestTable, totalErrorBetter16, ParallelMath::MakeUInt15(table));
861
ParallelMath::ConditionalSet(bestColors[0], totalErrorBetter16, color0);
862
ParallelMath::ConditionalSet(bestColors[1], totalErrorBetter16, color1);
863
ParallelMath::ConditionalSet(bestSectorBits, totalErrorBetter16, sectorBits);
864
ParallelMath::ConditionalSet(bestSignBits, totalErrorBetter16, signBits);
865
bestError = ParallelMath::Min(totalError, bestError);
866
}
867
}
868
}
869
870
if (ParallelMath::AnySet(bestIsThisMode))
871
{
872
for (int block = 0; block < ParallelMath::ParallelSize; block++)
873
{
874
if (!ParallelMath::Extract(bestIsThisMode, block))
875
continue;
876
877
ParallelMath::ScalarUInt16 blockBestColors[2] = { ParallelMath::Extract(bestColors[0], block), ParallelMath::Extract(bestColors[1], block) };
878
ParallelMath::ScalarUInt16 blockBestSectorBits = ParallelMath::Extract(bestSectorBits, block);
879
ParallelMath::ScalarUInt16 blockBestSignBits = ParallelMath::Extract(bestSignBits, block);
880
ParallelMath::ScalarUInt16 blockBestTable = ParallelMath::Extract(bestTable, block);
881
882
EmitHModeBlock(outputBuffer + block * 8, blockBestColors, blockBestSectorBits, blockBestSignBits, blockBestTable, true);
883
}
884
}
885
}
886
887
// Tries to encode each block of the parallel batch as a punchthrough "virtual T mode" block
// (T mode or H mode, see below) and emits it when it beats the error already in bestError.
//
// outputBuffer      - Destination; each lane's block is written at outputBuffer + block * 8.
// bestError         - In/out per-lane best error so far; lowered to the error of any better candidate found here.
// isIsolatedBase    - Per-pixel flags choosing which opaque pixels form the "isolated" group;
//                     the remaining opaque pixels form the "line" group.
// pixels            - The 16 pixels of the block, 3 channels each.
// preWeightedPixels - Pixel data handed to the weighted / fake BT.709 error functions.
// isTransparent     - Per-pixel flags; flagged pixels contribute zero error and always get selector 2.
// anyTransparent    - Not used by this function.
// allTransparent    - Not used by this function.
// options           - Supplies the Uniform and ETC_UseFakeBT709 flags and is forwarded to ComputeErrorWeighted.
void cvtt::Internal::ETCComputer::EncodeVirtualTModePunchthrough(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag isIsolatedBase[16], const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const ParallelMath::Int16CompFlag isTransparent[16], const ParallelMath::Int16CompFlag& anyTransparent, const ParallelMath::Int16CompFlag& allTransparent, const Options &options)
{
    // We treat T and H mode as the same mode ("Virtual T mode") with punchthrough, because of how the colors work:
    //
    // T mode: C1, C2+M, Transparent, C2-M
    // H mode: C1+M, C1-M, Transparent, C2-M
    //
    // So in either case, we have 2 colors +/- a modifier, and a third unique color, which is basically T mode except without the middle color.
    // The only thing that matters is whether it's better to store the isolated color as T mode color 1, or store it offset in H mode color 2.
    //
    // Sometimes it won't even be possible to store it in H mode color 2 because the table low bit derives from a numeric comparison of the colors,
    // but unlike opaque blocks, we can't flip them.
    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);

    // Float-flag copy of the transparency mask, used to zero out per-pixel errors below.
    ParallelMath::FloatCompFlag isTransparentF[16];
    for (int px = 0; px < 16; px++)
        isTransparentF[px] = ParallelMath::Int16FlagToFloat(isTransparent[px]);

    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
    ParallelMath::Int16CompFlag bestIsHMode = ParallelMath::MakeBoolInt16(false);

    MUInt15 isolatedTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
    MUInt15 lineTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };

    MUInt15 numPixelsIsolated = ParallelMath::MakeUInt15(0);
    MUInt15 numPixelsLine = ParallelMath::MakeUInt15(0);

    ParallelMath::Int16CompFlag isIsolated[16];
    ParallelMath::Int16CompFlag isLine[16];

    // Split the opaque pixels into the isolated group and the line group.
    for (unsigned int px = 0; px < 16; px++)
    {
        ParallelMath::Int16CompFlag isOpaque = ParallelMath::Not(isTransparent[px]);
        isIsolated[px] = isIsolatedBase[px] & isOpaque;
        isLine[px] = ParallelMath::Not(isIsolatedBase[px]) & isOpaque;
    }

    // Per-group channel sums and pixel counts.
    for (unsigned int px = 0; px < 16; px++)
    {
        for (int ch = 0; ch < 3; ch++)
        {
            isolatedTotal[ch] = isolatedTotal[ch] + ParallelMath::SelectOrZero(isIsolated[px], pixels[px][ch]);
            lineTotal[ch] = lineTotal[ch] + ParallelMath::SelectOrZero(isLine[px], pixels[px][ch]);
        }
        numPixelsIsolated = numPixelsIsolated + ParallelMath::SelectOrZero(isIsolated[px], ParallelMath::MakeUInt15(1));
        numPixelsLine = numPixelsLine + ParallelMath::SelectOrZero(isLine[px], ParallelMath::MakeUInt15(1));
    }

    MUInt15 isolatedAverageQuantized[3];
    MUInt15 hModeIsolatedQuantized[8][3];
    MUInt15 isolatedAverageTargets[3];
    {
        // Division is done per lane in scalar code; a zero divisor (no isolated pixels) yields 0.
        int divisors[ParallelMath::ParallelSize];
        for (int block = 0; block < ParallelMath::ParallelSize; block++)
            divisors[block] = ParallelMath::Extract(numPixelsIsolated, block) * 34;

        // numPixelsIsolated * 17
        MUInt15 addend = (numPixelsIsolated << 4) | numPixelsIsolated;
        for (int ch = 0; ch < 3; ch++)
        {
            // isolatedAverageQuantized[ch] = (isolatedTotal[ch] * 2 + numPixelsIsolated * 17) / (numPixelsIsolated * 34);

            MUInt15 numerator = isolatedTotal[ch] + isolatedTotal[ch];
            if (!isFakeBT709)
                numerator = numerator + addend;

            // For H mode the isolated color is stored offset by +modifier (it is painted as C2-M),
            // so one numerator is computed per modifier table entry.
            MUInt15 hModeIsolatedNumerators[8];
            for (int table = 0; table < 8; table++)
            {
                // FIXME: Handle fake BT.709 correctly
                MUInt15 offsetTotal = isolatedTotal[ch] + ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]), numPixelsIsolated));

                hModeIsolatedNumerators[table] = (offsetTotal + offsetTotal) + addend;
            }

            for (int block = 0; block < ParallelMath::ParallelSize; block++)
            {
                int divisor = divisors[block];
                if (divisor == 0)
                {
                    ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, 0);
                    for (int table = 0; table < 8; table++)
                        ParallelMath::PutUInt15(hModeIsolatedQuantized[table][ch], block, 0);
                }
                else
                {
                    ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, ParallelMath::Extract(numerator, block) / divisor);
                    for (int table = 0; table < 8; table++)
                        ParallelMath::PutUInt15(hModeIsolatedQuantized[table][ch], block, ParallelMath::Extract(hModeIsolatedNumerators[table], block) / divisor);
                }
            }

            isolatedAverageTargets[ch] = numerator;
        }
    }

    if (isFakeBT709)
        ResolveTHFakeBT709Rounding(isolatedAverageQuantized, isolatedAverageTargets, numPixelsIsolated);

    // The modifier offset can push the H mode quantized value past the 4-bit range.
    for (int table = 0; table < 8; table++)
        for (int ch = 0; ch < 3; ch++)
            hModeIsolatedQuantized[table][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), hModeIsolatedQuantized[table][ch]);

    // Expand the 4-bit isolated color to 8 bits by replicating the nibble.
    MUInt15 isolatedColor[3];
    for (int ch = 0; ch < 3; ch++)
        isolatedColor[ch] = (isolatedAverageQuantized[ch]) | (isolatedAverageQuantized[ch] << 4);

    // Per-pixel error of painting each pixel with the T mode isolated color.
    MFloat isolatedError[16];
    for (int px = 0; px < 16; px++)
    {
        if (isFakeBT709)
            isolatedError[px] = ComputeErrorFakeBT709(isolatedColor, preWeightedPixels[px]);
        else if (isUniform)
            isolatedError[px] = ComputeErrorUniform(pixels[px], isolatedColor);
        else
            isolatedError[px] = ComputeErrorWeighted(isolatedColor, preWeightedPixels[px], options);

        ParallelMath::ConditionalSet(isolatedError[px], isTransparentF[px], ParallelMath::MakeFloatZero());
    }

    MSInt32 bestSelectors = ParallelMath::MakeSInt32(0);
    MUInt15 bestTable = ParallelMath::MakeUInt15(0);
    MUInt15 bestLineColor = ParallelMath::MakeUInt15(0);
    MUInt15 bestHModeColor2 = ParallelMath::MakeUInt15(0);

    // Per-lane range of offset premultipliers is [-numPixelsLine, +numPixelsLine].
    MSInt16 maxLine = ParallelMath::LosslessCast<MSInt16>::Cast(numPixelsLine);
    MSInt16 minLine = ParallelMath::MakeSInt16(0) - maxLine;

    // The shared loop below runs over the widest range of any lane; narrower lanes are clamped.
    int16_t clusterMaxLine = 0;
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
    {
        int16_t blockMaxLine = ParallelMath::Extract(maxLine, block);
        if (blockMaxLine > clusterMaxLine)
            clusterMaxLine = blockMaxLine;
    }

    int16_t clusterMinLine = -clusterMaxLine;

    int lineDivisors[ParallelMath::ParallelSize];
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
        lineDivisors[block] = ParallelMath::Extract(numPixelsLine, block) * 34;

    // numPixelsLine * 17
    MUInt15 lineAddend = (numPixelsLine << 4) | numPixelsLine;

    for (int table = 0; table < 8; table++)
    {
        int numUniqueColors[ParallelMath::ParallelSize];
        MUInt15 uniqueQuantizedColors[31];

        for (int block = 0; block < ParallelMath::ParallelSize; block++)
            numUniqueColors[block] = 0;

        MUInt15 modifier = ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]);
        MUInt15 modifierOffset = (modifier + modifier);

        // Build the per-lane list of distinct quantized line colors reachable by biasing the average.
        // This loop always runs at least once, so every lane ends up with at least one entry.
        for (int16_t offsetPremultiplier = clusterMinLine; offsetPremultiplier <= clusterMaxLine; offsetPremultiplier += 2)
        {
            MSInt16 clampedOffsetPremultiplier = ParallelMath::Max(minLine, ParallelMath::Min(maxLine, ParallelMath::MakeSInt16(offsetPremultiplier)));
            MSInt16 modifierAddend = ParallelMath::CompactMultiply(clampedOffsetPremultiplier, modifierOffset);

            MUInt15 quantized[3];
            if (isFakeBT709)
            {
                MUInt15 targets[3];
                for (int ch = 0; ch < 3; ch++)
                {
                    //quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + modifierOffset * offsetPremultiplier)) / (numPixelsLine * 34));
                    MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch]) + modifierAddend));
                    MUInt15 divided = ParallelMath::MakeUInt15(0);
                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
                    {
                        int divisor = lineDivisors[block];
                        if (divisor == 0)
                            ParallelMath::PutUInt15(divided, block, 0);
                        else
                            ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
                    }
                    quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
                    targets[ch] = numerator;
                }

                ResolveTHFakeBT709Rounding(quantized, targets, numPixelsLine);
            }
            else
            {
                for (int ch = 0; ch < 3; ch++)
                {
                    //quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + numPixelsLine * 17 + modifierOffset * offsetPremultiplier)) / (numPixelsLine * 34));
                    MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch] + lineAddend) + modifierAddend));
                    MUInt15 divided = ParallelMath::MakeUInt15(0);
                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
                    {
                        int divisor = lineDivisors[block];
                        if (divisor == 0)
                            ParallelMath::PutUInt15(divided, block, 0);
                        else
                            ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
                    }
                    quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
                }
            }

            MUInt15 packedColor = (quantized[0] << 10) | (quantized[1] << 5) | quantized[2];

            // Append only when the color differs from the lane's most recently stored one.
            for (int block = 0; block < ParallelMath::ParallelSize; block++)
            {
                uint16_t blockPackedColor = ParallelMath::Extract(packedColor, block);
                if (numUniqueColors[block] == 0 || blockPackedColor != ParallelMath::Extract(uniqueQuantizedColors[numUniqueColors[block] - 1], block))
                    ParallelMath::PutUInt15(uniqueQuantizedColors[numUniqueColors[block]++], block, blockPackedColor);
            }
        }

        // Stripe unfilled unique colors
        int maxUniqueColors = 0;
        for (int block = 0; block < ParallelMath::ParallelSize; block++)
        {
            if (numUniqueColors[block] > maxUniqueColors)
                maxUniqueColors = numUniqueColors[block];
        }

        for (int block = 0; block < ParallelMath::ParallelSize; block++)
        {
            uint16_t fillColor = ParallelMath::Extract(uniqueQuantizedColors[0], block);

            // Slot numUnique is the first one this lane never wrote, so the fill must start there;
            // the candidate loop below reads every slot in [0, maxUniqueColors) for every lane.
            int numUnique = numUniqueColors[block];
            for (int fill = numUnique; fill < maxUniqueColors; fill++)
                ParallelMath::PutUInt15(uniqueQuantizedColors[fill], block, fillColor);
        }

        // Error of painting each pixel with the H mode isolated color (stored color minus modifier).
        MFloat hModeErrors[16];
        MUInt15 hModeUnquantizedColor[3];
        for (int ch = 0; ch < 3; ch++)
        {
            MUInt15 quantizedChannel = hModeIsolatedQuantized[table][ch];

            MUInt15 unquantizedCh = (quantizedChannel << 4) | quantizedChannel;
            hModeUnquantizedColor[ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedCh) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));
        }

        // NOTE(review): unlike isolatedError above, this (and lineErrors below) does not branch on
        // isFakeBT709 - confirm whether that is intended.
        for (int px = 0; px < 16; px++)
        {
            hModeErrors[px] = isUniform ? ComputeErrorUniform(hModeUnquantizedColor, pixels[px]) : ComputeErrorWeighted(hModeUnquantizedColor, preWeightedPixels[px], options);
            ParallelMath::ConditionalSet(hModeErrors[px], isTransparentF[px], ParallelMath::MakeFloatZero());
        }

        MUInt15 packedHModeColor2 = (hModeIsolatedQuantized[table][0] << 10) | (hModeIsolatedQuantized[table][1] << 5) | hModeIsolatedQuantized[table][2];
        ParallelMath::Int16CompFlag tableLowBitIsZero = ((table & 1) == 0) ? ParallelMath::MakeBoolInt16(true) : ParallelMath::MakeBoolInt16(false);

        for (int ci = 0; ci < maxUniqueColors; ci++)
        {
            // lineColors[0] = line color + modifier (selector 1), lineColors[1] = line color - modifier (selector 3).
            MUInt15 lineColors[2][3];
            for (int ch = 0; ch < 3; ch++)
            {
                MUInt15 quantizedChannel = (ParallelMath::RightShift(uniqueQuantizedColors[ci], 10 - (ch * 5)) & ParallelMath::MakeUInt15(15));

                MUInt15 unquantizedColor = (quantizedChannel << 4) | quantizedChannel;
                lineColors[0][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantizedColor + modifier);
                lineColors[1][ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));
            }

            MUInt15 bestLineSelector[16];
            MFloat bestLineError[16];
            for (int px = 0; px < 16; px++)
            {
                MFloat lineErrors[2];
                for (int i = 0; i < 2; i++)
                    lineErrors[i] = isUniform ? ComputeErrorUniform(lineColors[i], pixels[px]) : ComputeErrorWeighted(lineColors[i], preWeightedPixels[px], options);

                ParallelMath::Int16CompFlag firstIsBetter = ParallelMath::FloatFlagToInt16(ParallelMath::LessOrEqual(lineErrors[0], lineErrors[1]));
                bestLineSelector[px] = ParallelMath::Select(firstIsBetter, ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(3));
                bestLineError[px] = ParallelMath::Min(lineErrors[0], lineErrors[1]);

                ParallelMath::ConditionalSet(bestLineError[px], isTransparentF[px], ParallelMath::MakeFloatZero());
            }

            // One case considered here was if it was possible to force H mode to be valid when the line color is unused.
            // That case isn't actually useful because it's equivalent to the isolated color being unused at maximum offset,
            // which is always checked after a swap.
            MFloat tModeError = ParallelMath::MakeFloatZero();
            MFloat hModeError = ParallelMath::MakeFloatZero();
            for (int px = 0; px < 16; px++)
            {
                tModeError = tModeError + ParallelMath::Min(bestLineError[px], isolatedError[px]);
                hModeError = hModeError + ParallelMath::Min(bestLineError[px], hModeErrors[px]);
            }

            ParallelMath::FloatCompFlag hLessError = ParallelMath::Less(hModeError, tModeError);

            MUInt15 packedHModeColor1 = uniqueQuantizedColors[ci];

            // H mode is only usable when the ordering of the two packed colors matches the low bit of the table index.
            ParallelMath::Int16CompFlag hModeTableLowBitMustBeZero = ParallelMath::Less(packedHModeColor1, packedHModeColor2);

            ParallelMath::Int16CompFlag hModeIsLegal = ParallelMath::Equal(hModeTableLowBitMustBeZero, tableLowBitIsZero);
            ParallelMath::Int16CompFlag useHMode = ParallelMath::FloatFlagToInt16(hLessError) & hModeIsLegal;

            MFloat roundBestError = tModeError;
            ParallelMath::ConditionalSet(roundBestError, ParallelMath::Int16FlagToFloat(useHMode), hModeError);

            ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(roundBestError, bestError));
            ParallelMath::FloatCompFlag useHModeF = ParallelMath::Int16FlagToFloat(useHMode);

            if (ParallelMath::AnySet(errorBetter))
            {
                // Selectors use the T mode layout: 0 = isolated color, 1 = line + M, 2 = transparent, 3 = line - M.
                MSInt32 selectors = ParallelMath::MakeSInt32(0);
                for (int px = 0; px < 16; px++)
                {
                    MUInt15 selector = bestLineSelector[px];

                    MFloat isolatedPixelError = ParallelMath::Select(useHModeF, hModeErrors[px], isolatedError[px]);
                    ParallelMath::Int16CompFlag isolatedBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(isolatedPixelError, bestLineError[px]));

                    ParallelMath::ConditionalSet(selector, isolatedBetter, ParallelMath::MakeUInt15(0));
                    ParallelMath::ConditionalSet(selector, isTransparent[px], ParallelMath::MakeUInt15(2));
                    selectors = selectors | (ParallelMath::ToInt32(selector) << (px * 2));
                }

                bestError = ParallelMath::Min(bestError, roundBestError);
                ParallelMath::ConditionalSet(bestLineColor, errorBetter, uniqueQuantizedColors[ci]);
                ParallelMath::ConditionalSet(bestSelectors, errorBetter, selectors);
                ParallelMath::ConditionalSet(bestTable, errorBetter, ParallelMath::MakeUInt15(table));
                ParallelMath::ConditionalSet(bestIsHMode, errorBetter, useHMode);
                ParallelMath::ConditionalSet(bestHModeColor2, errorBetter, packedHModeColor2);

                bestIsThisMode = bestIsThisMode | errorBetter;
            }
        }
    }

    // Emit a block for every lane where this mode produced the best result.
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
    {
        if (ParallelMath::Extract(bestIsThisMode, block))
        {
            uint16_t blockBestLineColor = ParallelMath::Extract(bestLineColor, block);
            ParallelMath::ScalarUInt16 blockIsolatedAverageQuantized[3];

            for (int ch = 0; ch < 3; ch++)
                blockIsolatedAverageQuantized[ch] = ParallelMath::Extract(isolatedAverageQuantized[ch], block);

            uint16_t blockBestTable = ParallelMath::Extract(bestTable, block);
            int32_t blockBestSelectors = ParallelMath::Extract(bestSelectors, block);

            ParallelMath::ScalarUInt16 lineColor[3];
            for (int ch = 0; ch < 3; ch++)
                lineColor[ch] = (blockBestLineColor >> (10 - (ch * 5))) & 15;

            if (ParallelMath::Extract(bestIsHMode, block))
            {
                // T mode: C1, C2+M, Transparent, C2-M
                // H mode: C1+M, C1-M, Transparent, C2-M
                static const ParallelMath::ScalarUInt16 selectorRemapSector[4] = { 1, 0, 1, 0 };
                static const ParallelMath::ScalarUInt16 selectorRemapSign[4] = { 1, 0, 0, 1 };

                // Remap selectors
                ParallelMath::ScalarUInt16 signBits = 0;
                ParallelMath::ScalarUInt16 sectorBits = 0;
                for (int px = 0; px < 16; px++)
                {
                    int32_t selector = (blockBestSelectors >> (px * 2)) & 3;
                    sectorBits |= (selectorRemapSector[selector] << px);
                    signBits |= (selectorRemapSign[selector] << px);
                }

                ParallelMath::ScalarUInt16 blockColors[2] = { blockBestLineColor, ParallelMath::Extract(bestHModeColor2, block) };

                EmitHModeBlock(outputBuffer + block * 8, blockColors, sectorBits, signBits, blockBestTable, false);
            }
            else
                EmitTModeBlock(outputBuffer + block * 8, lineColor, blockIsolatedAverageQuantized, blockBestSelectors, blockBestTable, false);
        }
    }
}
1264
1265
1266
// Expands a stored planar coefficient by OR-ing a left-shifted copy with a right-shifted copy of itself.
// Channel 1 uses the shift pair (1, 6); every other channel uses (2, 4).
// NOTE(review): these pairs match bit replication of a 7-bit and a 6-bit value to 8 bits
// (presumably the RGB676 planar layout) - confirm against the format specification.
cvtt::ParallelMath::UInt15 cvtt::Internal::ETCComputer::DecodePlanarCoeff(const MUInt15 &coeff, int ch)
{
    // Common case first: channels other than 1.
    if (ch != 1)
        return (coeff << 2) | (ParallelMath::RightShift(coeff, 4));

    return (coeff << 1) | (ParallelMath::RightShift(coeff, 6));
}
1273
1274
void cvtt::Internal::ETCComputer::EncodePlanar(uint8_t *outputBuffer, MFloat &bestError, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const Options &options)
1275
{
1276
// NOTE: If it's desired to do this in another color space, the best way to do it would probably be
1277
// to do everything in that color space and then transform it back to RGB.
1278
1279
// We compute H = (H-O)/4 and V= (V-O)/4 to simplify the math
1280
1281
// error = (x*H + y*V + O - C)^2
1282
MFloat h[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
1283
MFloat v[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
1284
MFloat o[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
1285
1286
bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
1287
bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
1288
1289
MFloat totalError = ParallelMath::MakeFloatZero();
1290
MUInt15 bestCoeffs[3][3]; // [Channel][Coeff]
1291
for (int ch = 0; ch < 3; ch++)
1292
{
1293
float fhh = 0.f;
1294
float fho = 0.f;
1295
float fhv = 0.f;
1296
float foo = 0.f;
1297
float fov = 0.f;
1298
float fvv = 0.f;
1299
MFloat fc = ParallelMath::MakeFloatZero();
1300
MFloat fh = ParallelMath::MakeFloatZero();
1301
MFloat fv = ParallelMath::MakeFloatZero();
1302
MFloat fo = ParallelMath::MakeFloatZero();
1303
1304
float &foh = fho;
1305
float &fvh = fhv;
1306
float &fvo = fov;
1307
1308
for (int px = 0; px < 16; px++)
1309
{
1310
float x = static_cast<float>(px % 4);
1311
float y = static_cast<float>(px / 4);
1312
MFloat c = isFakeBT709 ? preWeightedPixels[px][ch] : ParallelMath::ToFloat(pixels[px][ch]);
1313
1314
// (x*H + y*V + O - C)^2
1315
fhh += x * x;
1316
fhv += x * y;
1317
fho += x;
1318
fh = fh - c * x;
1319
1320
fvh += y * x;
1321
fvv += y * y;
1322
fvo += y;
1323
fv = fv - c * y;
1324
1325
foh += x;
1326
fov += y;
1327
foo += 1;
1328
fo = fo - c;
1329
1330
fh = fh - c * x;
1331
fv = fv - c * y;
1332
fo = fo - c;
1333
fc = fc + c * c;
1334
}
1335
1336
//float totalError = fhh * h * h + fho * h*o + fhv * h*v + foo * o * o + fov * o*v + fvv * v * v + fh * h + fv * v + fo * o + fc;
1337
1338
// error = fhh*h^2 + fho*h*o + fhv*h*v + foo*o^2 + fov*o*v + fvv*v^2 + fh*h + fv*v + fo*o + fc
1339
// derror/dh = 2*fhh*h + fho*o + fhv*v + fh
1340
// derror/dv = fhv*h + fov*o + 2*fvv*v + fv
1341
// derror/do = fho*h + 2*foo*o + fov*v + fo
1342
1343
// Solve system of equations
1344
// h o v 1 = 0
1345
// -------
1346
// d e f g R0
1347
// i j k l R1
1348
// m n p q R2
1349
1350
float d = 2.0f * fhh;
1351
float e = fho;
1352
float f = fhv;
1353
MFloat gD = fh;
1354
1355
float i = fhv;
1356
float j = fov;
1357
float k = 2.0f * fvv;
1358
MFloat lD = fv;
1359
1360
float m = fho;
1361
float n = 2.0f * foo;
1362
float p = fov;
1363
MFloat qD = fo;
1364
1365
{
1366
// Factor out first column from R1 and R2
1367
float r0to1 = -i / d;
1368
float r0to2 = -m / d;
1369
1370
// 0 j1 k1 l1D
1371
float j1 = j + r0to1 * e;
1372
float k1 = k + r0to1 * f;
1373
MFloat l1D = lD + gD * r0to1;
1374
1375
// 0 n1 p1 q1D
1376
float n1 = n + r0to2 * e;
1377
float p1 = p + r0to2 * f;
1378
MFloat q1D = qD + gD * r0to2;
1379
1380
// Factor out third column from R2
1381
float r1to2 = -p1 / k1;
1382
1383
// 0 n2 0 q2D
1384
float n2 = n1 + r1to2 * j1;
1385
MFloat q2D = q1D + l1D * r1to2;
1386
1387
o[ch] = -q2D / n2;
1388
1389
// Factor out second column from R1
1390
// 0 n2 0 q2D
1391
1392
float r2to1 = -j1 / n2;
1393
1394
// 0 0 k1 l2D
1395
// 0 n2 0 q2D
1396
MFloat l2D = l1D + q2D * r2to1;
1397
1398
float elim2 = -f / k1;
1399
float elim1 = -e / n2;
1400
1401
// d 0 0 g2D
1402
MFloat g2D = gD + l2D * elim2 + q2D * elim1;
1403
1404
// n2*o + q2 = 0
1405
// o = -q2 / n2
1406
h[ch] = -g2D / d;
1407
v[ch] = -l2D / k1;
1408
}
1409
1410
// Undo the local transformation
1411
h[ch] = h[ch] * 4.0f + o[ch];
1412
v[ch] = v[ch] * 4.0f + o[ch];
1413
}
1414
1415
if (isFakeBT709)
1416
{
1417
MFloat oRGB[3];
1418
MFloat hRGB[3];
1419
MFloat vRGB[3];
1420
1421
ConvertFromFakeBT709(oRGB, o);
1422
ConvertFromFakeBT709(hRGB, h);
1423
ConvertFromFakeBT709(vRGB, v);
1424
1425
// Twiddling in fake BT.607 is a mess, just round off for now (the precision is pretty good anyway)
1426
{
1427
ParallelMath::RoundTowardNearestForScope rtn;
1428
1429
for (int ch = 0; ch < 3; ch++)
1430
{
1431
MFloat fcoeffs[3] = { oRGB[ch], hRGB[ch], vRGB[ch] };
1432
1433
for (int c = 0; c < 3; c++)
1434
{
1435
MFloat coeff = ParallelMath::Max(ParallelMath::MakeFloatZero(), fcoeffs[c]);
1436
if (ch == 1)
1437
coeff = ParallelMath::Min(ParallelMath::MakeFloat(127.0f), coeff * (127.0f / 255.0f));
1438
else
1439
coeff = ParallelMath::Min(ParallelMath::MakeFloat(63.0f), coeff * (63.0f / 255.0f));
1440
fcoeffs[c] = coeff;
1441
}
1442
1443
for (int c = 0; c < 3; c++)
1444
bestCoeffs[ch][c] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &rtn);
1445
}
1446
}
1447
1448
MUInt15 reconstructed[16][3];
1449
for (int ch = 0; ch < 3; ch++)
1450
{
1451
MUInt15 dO = DecodePlanarCoeff(bestCoeffs[ch][0], ch);
1452
MUInt15 dH = DecodePlanarCoeff(bestCoeffs[ch][1], ch);
1453
MUInt15 dV = DecodePlanarCoeff(bestCoeffs[ch][2], ch);
1454
1455
MSInt16 hMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dH) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
1456
MSInt16 vMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dV) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
1457
1458
MFloat error = ParallelMath::MakeFloatZero();
1459
1460
MSInt16 addend = ParallelMath::LosslessCast<MSInt16>::Cast(dO << 2) + 2;
1461
1462
for (int px = 0; px < 16; px++)
1463
{
1464
MUInt15 pxv = ParallelMath::MakeUInt15(px);
1465
MSInt16 x = ParallelMath::LosslessCast<MSInt16>::Cast(pxv & ParallelMath::MakeUInt15(3));
1466
MSInt16 y = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RightShift(pxv, 2));
1467
1468
MSInt16 interpolated = ParallelMath::RightShift(ParallelMath::CompactMultiply(x, hMinusO) + ParallelMath::CompactMultiply(y, vMinusO) + addend, 2);
1469
MUInt15 clampedLow = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), interpolated));
1470
reconstructed[px][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(255), clampedLow);
1471
}
1472
}
1473
1474
totalError = ParallelMath::MakeFloatZero();
1475
for (int px = 0; px < 16; px++)
1476
totalError = totalError + ComputeErrorFakeBT709(reconstructed[px], preWeightedPixels[px]);
1477
}
1478
else
1479
{
1480
for (int ch = 0; ch < 3; ch++)
1481
{
1482
MFloat fcoeffs[3] = { o[ch], h[ch], v[ch] };
1483
MUInt15 coeffRanges[3][2];
1484
1485
for (int c = 0; c < 3; c++)
1486
{
1487
MFloat coeff = ParallelMath::Max(ParallelMath::MakeFloatZero(), fcoeffs[c]);
1488
if (ch == 1)
1489
coeff = ParallelMath::Min(ParallelMath::MakeFloat(127.0f), coeff * (127.0f / 255.0f));
1490
else
1491
coeff = ParallelMath::Min(ParallelMath::MakeFloat(63.0f), coeff * (63.0f / 255.0f));
1492
fcoeffs[c] = coeff;
1493
}
1494
1495
{
1496
ParallelMath::RoundDownForScope rd;
1497
for (int c = 0; c < 3; c++)
1498
coeffRanges[c][0] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &rd);
1499
}
1500
1501
{
1502
ParallelMath::RoundUpForScope ru;
1503
for (int c = 0; c < 3; c++)
1504
coeffRanges[c][1] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &ru);
1505
}
1506
1507
MFloat bestChannelError = ParallelMath::MakeFloat(FLT_MAX);
1508
for (int io = 0; io < 2; io++)
1509
{
1510
MUInt15 dO = DecodePlanarCoeff(coeffRanges[0][io], ch);
1511
1512
for (int ih = 0; ih < 2; ih++)
1513
{
1514
MUInt15 dH = DecodePlanarCoeff(coeffRanges[1][ih], ch);
1515
MSInt16 hMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dH) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
1516
1517
for (int iv = 0; iv < 2; iv++)
1518
{
1519
MUInt15 dV = DecodePlanarCoeff(coeffRanges[2][iv], ch);
1520
MSInt16 vMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dV) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
1521
1522
MFloat error = ParallelMath::MakeFloatZero();
1523
1524
MSInt16 addend = ParallelMath::LosslessCast<MSInt16>::Cast(dO << 2) + 2;
1525
1526
for (int px = 0; px < 16; px++)
1527
{
1528
MUInt15 pxv = ParallelMath::MakeUInt15(px);
1529
MSInt16 x = ParallelMath::LosslessCast<MSInt16>::Cast(pxv & ParallelMath::MakeUInt15(3));
1530
MSInt16 y = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RightShift(pxv, 2));
1531
1532
MSInt16 interpolated = ParallelMath::RightShift(ParallelMath::CompactMultiply(x, hMinusO) + ParallelMath::CompactMultiply(y, vMinusO) + addend, 2);
1533
MUInt15 clampedLow = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), interpolated));
1534
MUInt15 dec = ParallelMath::Min(ParallelMath::MakeUInt15(255), clampedLow);
1535
1536
MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(dec);
1537
1538
MFloat deltaF = ParallelMath::ToFloat(delta);
1539
error = error + deltaF * deltaF;
1540
}
1541
1542
ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestChannelError));
1543
if (ParallelMath::AnySet(errorBetter))
1544
{
1545
bestChannelError = ParallelMath::Min(error, bestChannelError);
1546
ParallelMath::ConditionalSet(bestCoeffs[ch][0], errorBetter, coeffRanges[0][io]);
1547
ParallelMath::ConditionalSet(bestCoeffs[ch][1], errorBetter, coeffRanges[1][ih]);
1548
ParallelMath::ConditionalSet(bestCoeffs[ch][2], errorBetter, coeffRanges[2][iv]);
1549
}
1550
}
1551
}
1552
}
1553
1554
if (!isUniform)
1555
{
1556
switch (ch)
1557
{
1558
case 0:
1559
bestChannelError = bestChannelError * (options.redWeight * options.redWeight);
1560
break;
1561
case 1:
1562
bestChannelError = bestChannelError * (options.greenWeight * options.greenWeight);
1563
break;
1564
case 2:
1565
bestChannelError = bestChannelError * (options.blueWeight * options.blueWeight);
1566
break;
1567
default:
1568
break;
1569
}
1570
}
1571
1572
totalError = totalError + bestChannelError;
1573
}
1574
}
1575
1576
ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(totalError, bestError));
1577
if (ParallelMath::AnySet(errorBetter))
1578
{
1579
bestError = ParallelMath::Min(bestError, totalError);
1580
1581
for (int block = 0; block < ParallelMath::ParallelSize; block++)
1582
{
1583
if (!ParallelMath::Extract(errorBetter, block))
1584
continue;
1585
1586
int ro = ParallelMath::Extract(bestCoeffs[0][0], block);
1587
int rh = ParallelMath::Extract(bestCoeffs[0][1], block);
1588
int rv = ParallelMath::Extract(bestCoeffs[0][2], block);
1589
1590
int go = ParallelMath::Extract(bestCoeffs[1][0], block);
1591
int gh = ParallelMath::Extract(bestCoeffs[1][1], block);
1592
int gv = ParallelMath::Extract(bestCoeffs[1][2], block);
1593
1594
int bo = ParallelMath::Extract(bestCoeffs[2][0], block);
1595
int bh = ParallelMath::Extract(bestCoeffs[2][1], block);
1596
int bv = ParallelMath::Extract(bestCoeffs[2][2], block);
1597
1598
int go1 = go >> 6;
1599
int go2 = go & 63;
1600
1601
int bo1 = bo >> 5;
1602
int bo2 = (bo >> 3) & 3;
1603
int bo3 = bo & 7;
1604
1605
int rh1 = (rh >> 1);
1606
int rh2 = rh & 1;
1607
1608
int fakeR = ro >> 2;
1609
int fakeDR = go1 | ((ro & 3) << 1);
1610
1611
int fakeG = (go2 >> 2);
1612
int fakeDG = ((go2 & 3) << 1) | bo1;
1613
1614
int fakeB = bo2;
1615
int fakeDB = bo3 >> 1;
1616
1617
uint32_t highBits = 0;
1618
uint32_t lowBits = 0;
1619
1620
// Avoid overflowing R
1621
if ((fakeDR & 4) != 0 && fakeR + fakeDR < 8)
1622
highBits |= 1 << (63 - 32);
1623
1624
// Avoid overflowing G
1625
if ((fakeDG & 4) != 0 && fakeG + fakeDG < 8)
1626
highBits |= 1 << (55 - 32);
1627
1628
// Overflow B
1629
if (fakeB + fakeDB < 4)
1630
{
1631
// Overflow low
1632
highBits |= 1 << (42 - 32);
1633
}
1634
else
1635
{
1636
// Overflow high
1637
highBits |= 7 << (45 - 32);
1638
}
1639
1640
highBits |= ro << (57 - 32);
1641
highBits |= go1 << (56 - 32);
1642
highBits |= go2 << (49 - 32);
1643
highBits |= bo1 << (48 - 32);
1644
highBits |= bo2 << (43 - 32);
1645
highBits |= bo3 << (39 - 32);
1646
highBits |= rh1 << (34 - 32);
1647
highBits |= 1 << (33 - 32);
1648
highBits |= rh2 << (32 - 32);
1649
1650
lowBits |= gh << 25;
1651
lowBits |= bh << 19;
1652
lowBits |= rv << 13;
1653
lowBits |= gv << 6;
1654
lowBits |= bv << 0;
1655
1656
for (int i = 0; i < 4; i++)
1657
outputBuffer[block * 8 + i] = (highBits >> (24 - i * 8)) & 0xff;
1658
for (int i = 0; i < 4; i++)
1659
outputBuffer[block * 8 + i + 4] = (lowBits >> (24 - i * 8)) & 0xff;
1660
}
1661
}
1662
}
1663
1664
void cvtt::Internal::ETCComputer::CompressETC2Block(uint8_t *outputBuffer, const PixelBlockU8 *pixelBlocks, ETC2CompressionData *compressionData, const Options &options, bool punchthroughAlpha)
1665
{
1666
ParallelMath::Int16CompFlag pixelIsTransparent[16];
1667
ParallelMath::Int16CompFlag anyTransparent = ParallelMath::MakeBoolInt16(false);
1668
ParallelMath::Int16CompFlag allTransparent = ParallelMath::MakeBoolInt16(true);
1669
1670
if (punchthroughAlpha)
1671
{
1672
const float fThreshold = std::max<float>(std::min<float>(1.0f, options.threshold), 0.0f) * 255.0f;
1673
1674
// +1.0f is intentional, we want to take the next valid integer (even if it's 256) since everything else lower is transparent
1675
MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(std::floor(fThreshold + 1.0f)));
1676
1677
for (int px = 0; px < 16; px++)
1678
{
1679
MUInt15 alpha;
1680
for (int block = 0; block < ParallelMath::ParallelSize; block++)
1681
ParallelMath::PutUInt15(alpha, block, pixelBlocks[block].m_pixels[px][3]);
1682
1683
ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(alpha, threshold);
1684
anyTransparent = (anyTransparent | isTransparent);
1685
allTransparent = (allTransparent & isTransparent);
1686
pixelIsTransparent[px] = isTransparent;
1687
}
1688
}
1689
else
1690
{
1691
for (int px = 0; px < 16; px++)
1692
pixelIsTransparent[px] = ParallelMath::MakeBoolInt16(false);
1693
1694
allTransparent = anyTransparent = ParallelMath::MakeBoolInt16(false);
1695
}
1696
1697
MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
1698
1699
ETC2CompressionDataInternal* internalData = static_cast<ETC2CompressionDataInternal*>(compressionData);
1700
1701
MUInt15 pixels[16][3];
1702
MFloat preWeightedPixels[16][3];
1703
ExtractBlocks(pixels, preWeightedPixels, pixelBlocks, options);
1704
1705
if (ParallelMath::AnySet(anyTransparent))
1706
{
1707
for (int px = 0; px < 16; px++)
1708
{
1709
ParallelMath::Int16CompFlag flag = pixelIsTransparent[px];
1710
ParallelMath::FloatCompFlag fflag = ParallelMath::Int16FlagToFloat(flag);
1711
1712
for (int ch = 0; ch < 3; ch++)
1713
{
1714
ParallelMath::ConditionalSet(pixels[px][ch], flag, ParallelMath::MakeUInt15(0));
1715
ParallelMath::ConditionalSet(preWeightedPixels[px][ch], fflag, ParallelMath::MakeFloat(0.0f));
1716
}
1717
}
1718
}
1719
1720
if (!ParallelMath::AllSet(allTransparent))
1721
EncodePlanar(outputBuffer, bestError, pixels, preWeightedPixels, options);
1722
1723
MFloat chromaDelta[16][2];
1724
1725
MUInt15 numOpaque = ParallelMath::MakeUInt15(16);
1726
for (int px = 0; px < 16; px++)
1727
numOpaque = numOpaque - ParallelMath::SelectOrZero(pixelIsTransparent[px], ParallelMath::MakeUInt15(1));
1728
1729
if (options.flags & cvtt::Flags::Uniform)
1730
{
1731
MSInt16 chromaCoordinates3[16][2];
1732
for (int px = 0; px < 16; px++)
1733
{
1734
chromaCoordinates3[px][0] = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][0]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][2]);
1735
chromaCoordinates3[px][1] = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][0]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][1] << 1) + ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][2]);
1736
}
1737
1738
MSInt16 chromaCoordinateCentroid[2] = { ParallelMath::MakeSInt16(0), ParallelMath::MakeSInt16(0) };
1739
for (int px = 0; px < 16; px++)
1740
{
1741
for (int ch = 0; ch < 2; ch++)
1742
chromaCoordinateCentroid[ch] = chromaCoordinateCentroid[ch] + chromaCoordinates3[px][ch];
1743
}
1744
1745
if (punchthroughAlpha)
1746
{
1747
for (int px = 0; px < 16; px++)
1748
{
1749
for (int ch = 0; ch < 2; ch++)
1750
{
1751
MUInt15 chromaCoordinateMultiplied = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(chromaCoordinates3[px][ch], numOpaque));
1752
MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(chromaCoordinateMultiplied) - chromaCoordinateCentroid[ch];
1753
chromaDelta[px][ch] = ParallelMath::ToFloat(delta);
1754
}
1755
}
1756
}
1757
else
1758
{
1759
for (int px = 0; px < 16; px++)
1760
{
1761
for (int ch = 0; ch < 2; ch++)
1762
chromaDelta[px][ch] = ParallelMath::ToFloat((chromaCoordinates3[px][ch] << 4) - chromaCoordinateCentroid[ch]);
1763
}
1764
}
1765
1766
const MFloat rcpSqrt3 = ParallelMath::MakeFloat(0.57735026918962576450914878050196f);
1767
1768
for (int px = 0; px < 16; px++)
1769
chromaDelta[px][1] = chromaDelta[px][1] * rcpSqrt3;
1770
}
1771
else
1772
{
1773
const float chromaAxis0[3] = { internalData->m_chromaSideAxis0[0], internalData->m_chromaSideAxis0[1], internalData->m_chromaSideAxis0[2] };
1774
const float chromaAxis1[3] = { internalData->m_chromaSideAxis1[0], internalData->m_chromaSideAxis1[1], internalData->m_chromaSideAxis1[2] };
1775
1776
MFloat chromaCoordinates3[16][2];
1777
for (int px = 0; px < 16; px++)
1778
{
1779
const MFloat &px0 = preWeightedPixels[px][0];
1780
const MFloat &px1 = preWeightedPixels[px][1];
1781
const MFloat &px2 = preWeightedPixels[px][2];
1782
1783
chromaCoordinates3[px][0] = px0 * chromaAxis0[0] + px1 * chromaAxis0[1] + px2 * chromaAxis0[2];
1784
chromaCoordinates3[px][1] = px0 * chromaAxis1[0] + px1 * chromaAxis1[1] + px2 * chromaAxis1[2];
1785
}
1786
1787
MFloat chromaCoordinateCentroid[2] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
1788
for (int px = 0; px < 16; px++)
1789
{
1790
for (int ch = 0; ch < 2; ch++)
1791
chromaCoordinateCentroid[ch] = chromaCoordinateCentroid[ch] + chromaCoordinates3[px][ch];
1792
}
1793
1794
if (punchthroughAlpha)
1795
{
1796
const MFloat numOpaqueF = ParallelMath::ToFloat(numOpaque);
1797
for (int px = 0; px < 16; px++)
1798
{
1799
for (int ch = 0; ch < 2; ch++)
1800
{
1801
MFloat chromaCoordinateMultiplied = chromaCoordinates3[px][ch] * numOpaqueF;
1802
MFloat delta = chromaCoordinateMultiplied - chromaCoordinateCentroid[ch];
1803
chromaDelta[px][ch] = delta;
1804
}
1805
}
1806
}
1807
else
1808
{
1809
for (int px = 0; px < 16; px++)
1810
{
1811
for (int ch = 0; ch < 2; ch++)
1812
chromaDelta[px][ch] = chromaCoordinates3[px][ch] * 16.0f - chromaCoordinateCentroid[ch];
1813
}
1814
}
1815
}
1816
1817
1818
MFloat covXX = ParallelMath::MakeFloatZero();
1819
MFloat covYY = ParallelMath::MakeFloatZero();
1820
MFloat covXY = ParallelMath::MakeFloatZero();
1821
1822
for (int px = 0; px < 16; px++)
1823
{
1824
MFloat nx = chromaDelta[px][0];
1825
MFloat ny = chromaDelta[px][1];
1826
1827
covXX = covXX + nx * nx;
1828
covYY = covYY + ny * ny;
1829
covXY = covXY + nx * ny;
1830
}
1831
1832
MFloat halfTrace = (covXX + covYY) * 0.5f;
1833
MFloat det = covXX * covYY - covXY * covXY;
1834
1835
MFloat mm = ParallelMath::Sqrt(ParallelMath::Max(ParallelMath::MakeFloatZero(), halfTrace * halfTrace - det));
1836
1837
MFloat ev = halfTrace + mm;
1838
1839
MFloat dx = (covYY - ev + covXY);
1840
MFloat dy = -(covXX - ev + covXY);
1841
1842
// If evenly distributed, pick an arbitrary plane
1843
ParallelMath::FloatCompFlag allZero = ParallelMath::Equal(dx, ParallelMath::MakeFloatZero()) & ParallelMath::Equal(dy, ParallelMath::MakeFloatZero());
1844
ParallelMath::ConditionalSet(dx, allZero, ParallelMath::MakeFloat(1.f));
1845
1846
ParallelMath::Int16CompFlag sectorAssignments[16];
1847
for (int px = 0; px < 16; px++)
1848
sectorAssignments[px] = ParallelMath::FloatFlagToInt16(ParallelMath::Less(chromaDelta[px][0] * dx + chromaDelta[px][1] * dy, ParallelMath::MakeFloatZero()));
1849
1850
if (!ParallelMath::AllSet(allTransparent))
1851
{
1852
EncodeTMode(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, options);
1853
1854
// Flip sector assignments
1855
for (int px = 0; px < 16; px++)
1856
sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);
1857
1858
EncodeTMode(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, options);
1859
1860
EncodeHMode(outputBuffer, bestError, sectorAssignments, pixels, internalData->m_h, preWeightedPixels, options);
1861
1862
CompressETC1BlockInternal(bestError, outputBuffer, pixels, preWeightedPixels, internalData->m_drs, options, true);
1863
}
1864
1865
if (ParallelMath::AnySet(anyTransparent))
1866
{
1867
if (!ParallelMath::AllSet(allTransparent))
1868
{
1869
// Flip sector assignments
1870
for (int px = 0; px < 16; px++)
1871
sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);
1872
}
1873
1874
// Reset the error of any transparent blocks to max and retry with punchthrough modes
1875
ParallelMath::ConditionalSet(bestError, ParallelMath::Int16FlagToFloat(anyTransparent), ParallelMath::MakeFloat(FLT_MAX));
1876
1877
EncodeVirtualTModePunchthrough(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, pixelIsTransparent, anyTransparent, allTransparent, options);
1878
1879
// Flip sector assignments
1880
for (int px = 0; px < 16; px++)
1881
sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);
1882
1883
EncodeVirtualTModePunchthrough(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, pixelIsTransparent, anyTransparent, allTransparent, options);
1884
1885
CompressETC1PunchthroughBlockInternal(bestError, outputBuffer, pixels, preWeightedPixels, pixelIsTransparent, static_cast<ETC2CompressionDataInternal*>(compressionData)->m_drs, options);
1886
}
1887
}
1888
1889
void cvtt::Internal::ETCComputer::CompressETC2AlphaBlock(uint8_t *outputBuffer, const PixelBlockU8 *pixelBlocks, const Options &options)
1890
{
1891
MUInt15 pixels[16];
1892
1893
for (int px = 0; px < 16; px++)
1894
{
1895
for (int block = 0; block < ParallelMath::ParallelSize; block++)
1896
ParallelMath::PutUInt15(pixels[px], block, pixelBlocks[block].m_pixels[px][3]);
1897
}
1898
1899
CompressETC2AlphaBlockInternal(outputBuffer, pixels, false, false, options);
1900
}
1901
1902
void cvtt::Internal::ETCComputer::CompressETC2AlphaBlockInternal(uint8_t *outputBuffer, const MUInt15 pixels[16], bool is11Bit, bool isSigned, const Options &options)
1903
{
1904
MUInt15 minAlpha = ParallelMath::MakeUInt15(is11Bit ? 2047 : 255);
1905
MUInt15 maxAlpha = ParallelMath::MakeUInt15(0);
1906
1907
for (int px = 0; px < 16; px++)
1908
{
1909
minAlpha = ParallelMath::Min(minAlpha, pixels[px]);
1910
maxAlpha = ParallelMath::Max(maxAlpha, pixels[px]);
1911
}
1912
1913
MUInt15 alphaSpan = maxAlpha - minAlpha;
1914
MUInt15 alphaSpanMidpointTimes2 = maxAlpha + minAlpha;
1915
1916
MUInt31 bestTotalError = ParallelMath::MakeUInt31(0x7fffffff);
1917
MUInt15 bestTableIndex = ParallelMath::MakeUInt15(0);
1918
MUInt15 bestBaseCodeword = ParallelMath::MakeUInt15(0);
1919
MUInt15 bestMultiplier = ParallelMath::MakeUInt15(0);
1920
MUInt15 bestIndexes[16];
1921
1922
for (int px = 0; px < 16; px++)
1923
bestIndexes[px] = ParallelMath::MakeUInt15(0);
1924
1925
const int numAlphaRanges = 10;
1926
for (uint16_t tableIndex = 0; tableIndex < 16; tableIndex++)
1927
{
1928
for (int r = 0; r < numAlphaRanges; r++)
1929
{
1930
int subrange = r % 3;
1931
int mainRange = r / 3;
1932
1933
int16_t maxOffset = Tables::ETC2::g_alphaModifierTablePositive[tableIndex][3 - mainRange - (subrange & 1)];
1934
int16_t minOffset = -Tables::ETC2::g_alphaModifierTablePositive[tableIndex][3 - mainRange - ((subrange >> 1) & 1)] - 1;
1935
uint16_t offsetSpan = static_cast<uint16_t>(maxOffset - minOffset);
1936
1937
MSInt16 vminOffset = ParallelMath::MakeSInt16(minOffset);
1938
MUInt15 vmaxOffset = ParallelMath::MakeUInt15(maxOffset);
1939
MUInt15 voffsetSpan = ParallelMath::MakeUInt15(offsetSpan);
1940
1941
MUInt15 minMultiplier = ParallelMath::MakeUInt15(0);
1942
for (int block = 0; block < ParallelMath::ParallelSize; block++)
1943
{
1944
uint16_t singleAlphaSpan = ParallelMath::Extract(alphaSpan, block);
1945
1946
uint16_t lowMultiplier = singleAlphaSpan / offsetSpan;
1947
ParallelMath::PutUInt15(minMultiplier, block, lowMultiplier);
1948
}
1949
1950
if (is11Bit)
1951
{
1952
// Clamps this to valid multipliers under 15 and rounds down to nearest multiple of 8
1953
minMultiplier = ParallelMath::Min(minMultiplier, ParallelMath::MakeUInt15(112)) & ParallelMath::MakeUInt15(120);
1954
}
1955
else
1956
{
1957
// We cap at 1 and 14 so both multipliers are valid and dividable
1958
// Cases where offset span is 0 should be caught by multiplier 1 of table 13
1959
minMultiplier = ParallelMath::Max(ParallelMath::Min(minMultiplier, ParallelMath::MakeUInt15(14)), ParallelMath::MakeUInt15(1));
1960
}
1961
1962
for (uint16_t multiplierOffset = 0; multiplierOffset < 2; multiplierOffset++)
1963
{
1964
MUInt15 multiplier = minMultiplier;
1965
1966
if (is11Bit)
1967
{
1968
if (multiplierOffset == 1)
1969
multiplier = multiplier + ParallelMath::MakeUInt15(8);
1970
else
1971
multiplier = ParallelMath::Max(multiplier, ParallelMath::MakeUInt15(1));
1972
}
1973
else
1974
{
1975
if (multiplierOffset == 1)
1976
multiplier = multiplier + ParallelMath::MakeUInt15(1);
1977
}
1978
1979
MSInt16 multipliedMinOffset = ParallelMath::CompactMultiply(ParallelMath::LosslessCast<MSInt16>::Cast(multiplier), vminOffset);
1980
MUInt15 multipliedMaxOffset = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(multiplier, vmaxOffset));
1981
1982
// codeword = (maxOffset + minOffset + minAlpha + maxAlpha) / 2
1983
MSInt16 unclampedBaseAlphaTimes2 = ParallelMath::LosslessCast<MSInt16>::Cast(alphaSpanMidpointTimes2) - ParallelMath::LosslessCast<MSInt16>::Cast(multipliedMaxOffset) - multipliedMinOffset;
1984
1985
MUInt15 baseAlpha;
1986
if (is11Bit)
1987
{
1988
// In unsigned, 4 is added to the unquantized alpha, so compensating for that cancels the 4 we have to add to do rounding.
1989
if (isSigned)
1990
unclampedBaseAlphaTimes2 = unclampedBaseAlphaTimes2 + ParallelMath::MakeSInt16(8);
1991
1992
// -128 is illegal for some reason
1993
MSInt16 minBaseAlphaTimes2 = isSigned ? ParallelMath::MakeSInt16(16) : ParallelMath::MakeSInt16(0);
1994
1995
MUInt15 clampedBaseAlphaTimes2 = ParallelMath::Min(ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(unclampedBaseAlphaTimes2, minBaseAlphaTimes2)), ParallelMath::MakeUInt15(4095));
1996
baseAlpha = ParallelMath::RightShift(clampedBaseAlphaTimes2, 1) & ParallelMath::MakeUInt15(2040);
1997
1998
if (!isSigned)
1999
baseAlpha = baseAlpha + ParallelMath::MakeUInt15(4);
2000
}
2001
else
2002
{
2003
MUInt15 clampedBaseAlphaTimes2 = ParallelMath::Min(ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(unclampedBaseAlphaTimes2, ParallelMath::MakeSInt16(0))), ParallelMath::MakeUInt15(510));
2004
baseAlpha = ParallelMath::RightShift(clampedBaseAlphaTimes2 + ParallelMath::MakeUInt15(1), 1);
2005
}
2006
2007
MUInt15 indexes[16];
2008
MUInt31 totalError = ParallelMath::MakeUInt31(0);
2009
for (int px = 0; px < 16; px++)
2010
{
2011
MUInt15 quantizedValues;
2012
QuantizeETC2Alpha(tableIndex, pixels[px], baseAlpha, multiplier, is11Bit, isSigned, indexes[px], quantizedValues);
2013
2014
if (is11Bit)
2015
{
2016
MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(quantizedValues) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px]);
2017
MSInt32 deltaSq = ParallelMath::XMultiply(delta, delta);
2018
totalError = totalError + ParallelMath::LosslessCast<MUInt31>::Cast(deltaSq);
2019
}
2020
else
2021
totalError = totalError + ParallelMath::ToUInt31(ParallelMath::SqDiffUInt8(quantizedValues, pixels[px]));
2022
}
2023
2024
ParallelMath::Int16CompFlag isBetter = ParallelMath::Int32FlagToInt16(ParallelMath::Less(totalError, bestTotalError));
2025
if (ParallelMath::AnySet(isBetter))
2026
{
2027
ParallelMath::ConditionalSet(bestTotalError, isBetter, totalError);
2028
ParallelMath::ConditionalSet(bestTableIndex, isBetter, ParallelMath::MakeUInt15(tableIndex));
2029
ParallelMath::ConditionalSet(bestBaseCodeword, isBetter, baseAlpha);
2030
ParallelMath::ConditionalSet(bestMultiplier, isBetter, multiplier);
2031
2032
for (int px = 0; px < 16; px++)
2033
ParallelMath::ConditionalSet(bestIndexes[px], isBetter, indexes[px]);
2034
}
2035
2036
// TODO: Do one refine pass
2037
}
2038
}
2039
}
2040
2041
if (is11Bit)
2042
{
2043
bestMultiplier = ParallelMath::RightShift(bestMultiplier, 3);
2044
2045
if (isSigned)
2046
bestBaseCodeword = bestBaseCodeword ^ ParallelMath::MakeUInt15(0x80);
2047
}
2048
2049
for (int block = 0; block < ParallelMath::ParallelSize; block++)
2050
{
2051
uint8_t *output = outputBuffer + block * 8;
2052
2053
output[0] = static_cast<uint8_t>(ParallelMath::Extract(bestBaseCodeword, block));
2054
2055
ParallelMath::ScalarUInt16 multiplier = ParallelMath::Extract(bestMultiplier, block);
2056
ParallelMath::ScalarUInt16 tableIndex = ParallelMath::Extract(bestTableIndex, block);
2057
2058
output[1] = static_cast<uint8_t>((multiplier << 4) | tableIndex);
2059
2060
static const int pixelSelectorOrder[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
2061
2062
ParallelMath::ScalarUInt16 indexes[16];
2063
for (int px = 0; px < 16; px++)
2064
indexes[pixelSelectorOrder[px]] = ParallelMath::Extract(bestIndexes[px], block);
2065
2066
int outputOffset = 2;
2067
int outputBits = 0;
2068
int numOutputBits = 0;
2069
for (int s = 0; s < 16; s++)
2070
{
2071
outputBits = (outputBits << 3) | indexes[s];
2072
numOutputBits += 3;
2073
2074
if (numOutputBits >= 8)
2075
{
2076
output[outputOffset++] = static_cast<uint8_t>(outputBits >> (numOutputBits - 8));
2077
numOutputBits -= 8;
2078
2079
outputBits &= ((1 << numOutputBits) - 1);
2080
}
2081
}
2082
2083
assert(outputOffset == 8 && numOutputBits == 0);
2084
}
2085
}
2086
2087
void cvtt::Internal::ETCComputer::CompressEACBlock(uint8_t *outputBuffer, const PixelBlockScalarS16 *inputBlocks, bool isSigned, const Options &options)
2088
{
2089
MUInt15 pixels[16];
2090
for (int px = 0; px < 16; px++)
2091
{
2092
MSInt16 adjustedPixel;
2093
for (int block = 0; block < ParallelMath::ParallelSize; block++)
2094
ParallelMath::PutSInt16(adjustedPixel, block, inputBlocks[block].m_pixels[px]);
2095
2096
// We use a slightly shifted range here so we can keep the unquantized base color in a UInt15
2097
// That is, signed range is 1..2047, and unsigned range is 0..2047
2098
if (isSigned)
2099
{
2100
adjustedPixel = ParallelMath::Min(adjustedPixel, ParallelMath::MakeSInt16(1023)) + ParallelMath::MakeSInt16(1024);
2101
adjustedPixel = ParallelMath::Max(ParallelMath::MakeSInt16(1), adjustedPixel);
2102
}
2103
else
2104
{
2105
adjustedPixel = ParallelMath::Min(adjustedPixel, ParallelMath::MakeSInt16(2047));
2106
adjustedPixel = ParallelMath::Max(ParallelMath::MakeSInt16(0), adjustedPixel);
2107
}
2108
2109
2110
pixels[px] = ParallelMath::LosslessCast<MUInt15>::Cast(adjustedPixel);
2111
}
2112
2113
CompressETC2AlphaBlockInternal(outputBuffer, pixels, true, isSigned, options);
2114
}
2115
2116
void cvtt::Internal::ETCComputer::CompressETC1Block(uint8_t *outputBuffer, const PixelBlockU8 *inputBlocks, ETC1CompressionData *compressionData, const Options &options)
2117
{
2118
DifferentialResolveStorage &drs = static_cast<ETC1CompressionDataInternal*>(compressionData)->m_drs;
2119
MFloat bestTotalError = ParallelMath::MakeFloat(FLT_MAX);
2120
2121
MUInt15 pixels[16][3];
2122
MFloat preWeightedPixels[16][3];
2123
ExtractBlocks(pixels, preWeightedPixels, inputBlocks, options);
2124
2125
CompressETC1BlockInternal(bestTotalError, outputBuffer, pixels, preWeightedPixels, drs, options, false);
2126
}
2127
2128
// Gathers the RGB components of ParallelMath::ParallelSize input blocks into lane-parallel form.
//
// pixels            - Out: raw channel values for each of the 16 pixels.
// preWeightedPixels - Out: float version of each pixel used for error evaluation:
//                       * ETC_UseFakeBT709 flag set: result of ConvertToFakeBT709
//                       * otherwise, Uniform flag set: the raw values converted to float
//                       * otherwise: raw values scaled by options.redWeight/greenWeight/blueWeight
// inputBlocks       - One source block per parallel lane.
// options           - Supplies the flags and channel weights.
void cvtt::Internal::ETCComputer::ExtractBlocks(MUInt15 pixels[16][3], MFloat preWeightedPixels[16][3], const PixelBlockU8 *inputBlocks, const Options &options)
{
    const bool useFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
    const bool useUniformWeights = ((options.flags & cvtt::Flags::Uniform) != 0);

    for (int px = 0; px < 16; px++)
    {
        MUInt15 *rgb = pixels[px];
        MFloat *weighted = preWeightedPixels[px];

        for (int ch = 0; ch < 3; ch++)
        {
            for (int lane = 0; lane < ParallelMath::ParallelSize; lane++)
                ParallelMath::PutUInt15(rgb[ch], lane, inputBlocks[lane].m_pixels[px][ch]);
        }

        // Fake BT.709 takes priority over the uniform flag.
        if (useFakeBT709)
        {
            ConvertToFakeBT709(weighted, rgb);
            continue;
        }

        if (useUniformWeights)
        {
            for (int ch = 0; ch < 3; ch++)
                weighted[ch] = ParallelMath::ToFloat(rgb[ch]);
            continue;
        }

        weighted[0] = ParallelMath::ToFloat(rgb[0]) * options.redWeight;
        weighted[1] = ParallelMath::ToFloat(rgb[1]) * options.greenWeight;
        weighted[2] = ParallelMath::ToFloat(rgb[2]) * options.blueWeight;
    }
}
2156
2157
// Quantizes a half-block's accumulated RGB to ETC base-color precision, choosing per channel
// between rounding down and rounding up so that the result is closest to the accumulated color
// in fake BT.709 space.
//
// quantized        - Out: quantized channel values (5-bit when isDifferential, else 4-bit).
// sectorCumulative - Per-channel accumulated color of the half-block.
//                    NOTE(review): presumably the sum of 8 pixels (0..2040), as the sibling
//                    "Fast" variant documents - confirm against callers.
// isDifferential   - true for 5-bit (differential) precision, false for 4-bit (individual).
//
// The eight round-down/round-up combinations ("octants") are evaluated exhaustively; bit ch of
// the octant index selects the upper candidate for channel ch.  Ties keep the lowest octant.
void cvtt::Internal::ETCComputer::ResolveHalfBlockFakeBT709RoundingAccurate(MUInt15 quantized[3], const MUInt15 sectorCumulative[3], bool isDifferential)
{
    // Round-down quantization of each channel.
    for (int ch = 0; ch < 3; ch++)
    {
        const MUInt15& cu15 = sectorCumulative[ch];

        if (isDifferential)
        {
            //quantized[ch] = (cu * 31 + (cu >> 3)) >> 11;
            quantized[ch] = ParallelMath::ToUInt15(
                ParallelMath::RightShift(
                    (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3))
                    , 11)
                );
        }
        else
        {
            //quantized[ch] = (cu * 30 + (cu >> 3)) >> 12;
            quantized[ch] = ParallelMath::ToUInt15(
                ParallelMath::RightShift(
                    (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15 << 1) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3))
                    , 12)
                );
        }
    }

    // Expand the round-down value and its successor back to 8 bits, then scale by 8 (<< 3) so
    // they are comparable with sectorCumulative.
    MFloat lowOctantRGBFloat[3];
    MFloat highOctantRGBFloat[3];

    for (int ch = 0; ch < 3; ch++)
    {
        MUInt15 unquantized;
        MUInt15 unquantizedNext;
        if (isDifferential)
        {
            // 5-bit to 8-bit expansion by bit replication; the successor is capped at 31.
            unquantized = (quantized[ch] << 3) | ParallelMath::RightShift(quantized[ch], 2);
            MUInt15 quantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(31), quantized[ch] + ParallelMath::MakeUInt15(1));
            unquantizedNext = (quantizedNext << 3) | ParallelMath::RightShift(quantizedNext, 2);
        }
        else
        {
            // 4-bit to 8-bit expansion (q * 17); the successor is one step (17) higher, capped at 255.
            unquantized = (quantized[ch] << 4) | quantized[ch];
            unquantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantized + ParallelMath::MakeUInt15(17));
        }
        lowOctantRGBFloat[ch] = ParallelMath::ToFloat(unquantized << 3);
        highOctantRGBFloat[ch] = ParallelMath::ToFloat(unquantizedNext << 3);
    }

    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
    MUInt15 bestOctant = ParallelMath::MakeUInt15(0);

    MFloat cumulativeYUV[3];
    ConvertToFakeBT709(cumulativeYUV, sectorCumulative);

    for (uint16_t octant = 0; octant < 8; octant++)
    {
        const MFloat &r = (octant & 1) ? highOctantRGBFloat[0] : lowOctantRGBFloat[0];
        const MFloat &g = (octant & 2) ? highOctantRGBFloat[1] : lowOctantRGBFloat[1];
        const MFloat &b = (octant & 4) ? highOctantRGBFloat[2] : lowOctantRGBFloat[2];

        MFloat octantYUV[3];
        ConvertToFakeBT709(octantYUV, r, g, b);

        MFloat delta[3];
        for (int ch = 0; ch < 3; ch++)
            delta[ch] = octantYUV[ch] - cumulativeYUV[ch];

        // Squared distance in fake BT.709 space.  Every term must be squared: the middle term
        // was previously "delta[1] + delta[1]", which made the metric signed and unbounded below.
        MFloat error = delta[0] * delta[0] + delta[1] * delta[1] + delta[2] * delta[2];
        ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));
        ParallelMath::ConditionalSet(bestOctant, errorBetter, ParallelMath::MakeUInt15(octant));
        bestError = ParallelMath::Min(error, bestError);
    }

    // Apply the winning octant: bit ch bumps channel ch to its upper candidate.
    for (int ch = 0; ch < 3; ch++)
        quantized[ch] = quantized[ch] + (ParallelMath::RightShift(bestOctant, ch) & ParallelMath::MakeUInt15(1));
}
2233
2234
// Fast fake-BT.709 rounding for a half-block. Instead of testing all 8 round-up/round-down
// combinations, the combination is read from Tables::FakeBT709::g_rounding16.
//
// quantized        - receives the per-channel quantized values, clamped to 31 when
//                    isDifferential is set and to 15 otherwise.
// sectorCumulative - per-channel cumulative values; range is 0..2040 (11 bits).
// isDifferential   - true: keep bits 6 and up of each channel; false: keep bits 7 and up.
void cvtt::Internal::ETCComputer::ResolveHalfBlockFakeBT709RoundingFast(MUInt15 quantized[3], const MUInt15 sectorCumulative[3], bool isDifferential)
{
    MUInt15 rOffset;
    MUInt15 gOffset;
    MUInt15 bOffset;
    MUInt15 quantizedBase[3];
    MUInt15 upperBound;

    // x + (x >> 8) stretches the 0..2040 input range to 0..2047.
    MUInt15 sectorCumulativeFillIn[3];
    for (int ch = 0; ch < 3; ch++)
        sectorCumulativeFillIn[ch] = sectorCumulative[ch] + ParallelMath::RightShift(sectorCumulative[ch], 8);

    // The table index is 12 bits wide, one nibble per channel (R in bits 8..11, G in bits 4..7,
    // B in bits 0..3). Each nibble holds the 4 bits directly below the ones kept in quantizedBase.
    if (isDifferential)
    {
        // quantizedBase keeps bits 6 and up, so every nibble must come from bits 2..5.
        rOffset = (sectorCumulativeFillIn[0] << 6) & ParallelMath::MakeUInt15(0xf00);
        gOffset = (sectorCumulativeFillIn[1] << 2) & ParallelMath::MakeUInt15(0x0f0);
        bOffset = ParallelMath::RightShift(sectorCumulativeFillIn[2], 2) & ParallelMath::MakeUInt15(0x00f);

        for (int ch = 0; ch < 3; ch++)
            quantizedBase[ch] = ParallelMath::RightShift(sectorCumulativeFillIn[ch], 6);

        upperBound = ParallelMath::MakeUInt15(31);
    }
    else
    {
        // quantizedBase keeps bits 7 and up, so every nibble must come from bits 3..6.
        rOffset = (sectorCumulativeFillIn[0] << 5) & ParallelMath::MakeUInt15(0xf00);
        gOffset = (sectorCumulativeFillIn[1] << 1) & ParallelMath::MakeUInt15(0x0f0);
        bOffset = ParallelMath::RightShift(sectorCumulativeFillIn[2], 3) & ParallelMath::MakeUInt15(0x00f);

        for (int ch = 0; ch < 3; ch++)
            quantizedBase[ch] = ParallelMath::RightShift(sectorCumulativeFillIn[ch], 7);

        upperBound = ParallelMath::MakeUInt15(15);
    }

    MUInt15 lookupIndex = (rOffset | gOffset | bOffset);

    // The table lookup is done one lane at a time.
    MUInt15 octant;
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
        ParallelMath::PutUInt15(octant, block, Tables::FakeBT709::g_rounding16[ParallelMath::Extract(lookupIndex, block)]);

    // Octant bit 0 rounds R up, bit 1 rounds G up, bit 2 rounds B up.
    quantizedBase[0] = quantizedBase[0] + (octant & ParallelMath::MakeUInt15(1));
    quantizedBase[1] = quantizedBase[1] + (ParallelMath::RightShift(octant, 1) & ParallelMath::MakeUInt15(1));
    quantizedBase[2] = quantizedBase[2] + (ParallelMath::RightShift(octant, 2) & ParallelMath::MakeUInt15(1));

    // Rounding up may step past the largest representable value.
    for (int ch = 0; ch < 3; ch++)
        quantized[ch] = ParallelMath::Min(quantizedBase[ch], upperBound);
}
2285
2286
// Chooses, per channel, between the incoming 4-bit value and the next value up, picking the
// combination (octant) whose fake-BT.709 color has the smallest squared distance to targets.
//
// quantized   - in/out; on entry the 4-bit values to start from, on exit each channel has been
//               incremented by its bit of the winning octant (bit 0 = R, bit 1 = G, bit 2 = B).
// targets     - per-channel target values. NOTE(review): presumably in the same scale as the
//               candidates built below (expanded value * granularity * 2) - confirm with callers.
// granularity - multiplier applied to the expanded candidate values.
void cvtt::Internal::ETCComputer::ResolveTHFakeBT709Rounding(MUInt15 quantized[3], const MUInt15 targets[3], const MUInt15 &granularity)
{
    MFloat lowOctantRGBFloat[3];
    MFloat highOctantRGBFloat[3];

    for (int ch = 0; ch < 3; ch++)
    {
        // Expand the 4-bit value by nibble replication; the next 4-bit step is 17 higher, capped at 255.
        MUInt15 unquantized = (quantized[ch] << 4) | quantized[ch];
        MUInt15 unquantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantized + ParallelMath::MakeUInt15(17));

        lowOctantRGBFloat[ch] = ParallelMath::ToFloat(ParallelMath::CompactMultiply(unquantized, granularity) << 1);
        highOctantRGBFloat[ch] = ParallelMath::ToFloat(ParallelMath::CompactMultiply(unquantizedNext, granularity) << 1);
    }

    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
    MUInt15 bestOctant = ParallelMath::MakeUInt15(0);

    MFloat cumulativeYUV[3];
    ConvertToFakeBT709(cumulativeYUV, ParallelMath::ToFloat(targets[0]), ParallelMath::ToFloat(targets[1]), ParallelMath::ToFloat(targets[2]));

    for (uint16_t octant = 0; octant < 8; octant++)
    {
        const MFloat &r = (octant & 1) ? highOctantRGBFloat[0] : lowOctantRGBFloat[0];
        const MFloat &g = (octant & 2) ? highOctantRGBFloat[1] : lowOctantRGBFloat[1];
        const MFloat &b = (octant & 4) ? highOctantRGBFloat[2] : lowOctantRGBFloat[2];

        MFloat octantYUV[3];
        ConvertToFakeBT709(octantYUV, r, g, b);

        MFloat delta[3];
        for (int ch = 0; ch < 3; ch++)
            delta[ch] = octantYUV[ch] - cumulativeYUV[ch];

        // Squared distance over all three components.
        MFloat error = delta[0] * delta[0] + delta[1] * delta[1] + delta[2] * delta[2];

        // Strict "less than" keeps the lowest-numbered octant on ties, so a channel whose high and
        // low candidates are identical (already at the cap) is never incremented.
        ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));
        ParallelMath::ConditionalSet(bestOctant, errorBetter, ParallelMath::MakeUInt15(octant));
        bestError = ParallelMath::Min(error, bestError);
    }

    for (int ch = 0; ch < 3; ch++)
        quantized[ch] = quantized[ch] + (ParallelMath::RightShift(bestOctant, ch) & ParallelMath::MakeUInt15(1));
}
2328
2329
// Integer-input overload: converts each of the three channels to floating point and forwards
// them to the MFloat-array overload.
void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[3], const MUInt15 color[3])
{
    MFloat widened[3];

    int channel = 0;
    while (channel < 3)
    {
        widened[channel] = ParallelMath::ToFloat(color[channel]);
        ++channel;
    }

    ConvertToFakeBT709(yuv, widened);
}
2337
2338
// Array-input overload: unpacks the three channels (index 0, 1, 2 = R, G, B) and forwards them
// to the per-channel overload.
void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[3], const MFloat color[3])
{
    const MFloat &red = color[0];
    const MFloat &green = color[1];
    const MFloat &blue = color[2];

    ConvertToFakeBT709(yuv, red, green, blue);
}
2342
2343
// Per-channel overload: applies a fixed 3x3 weighting to (pr, pg, pb) and stores the three
// resulting components in yuv[0..2].
//
// All three results are computed before anything is stored, so the outcome does not change
// if the input references alias elements of yuv.
void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[3], const MFloat &pr, const MFloat &pg, const MFloat &pb)
{
    MFloat component0 = pr * 0.368233989135369f + pg * 1.23876274963149f + pb * 0.125054068802017f;
    MFloat component1 = pr * 0.5f - pg * 0.4541529f - pb * 0.04584709f;
    MFloat component2 = pr * -0.081014709086133f - pg * 0.272538676238785f + pb * 0.353553390593274f;

    yuv[0] = component0;
    yuv[1] = component1;
    yuv[2] = component2;
}
2353
2354
// Maps the three fake-BT.709 components in yuv[0..2] back to rgb[0..2].
//
// The inputs are copied into locals and all three outputs are computed before any store,
// so rgb and yuv may refer to the same storage.
void cvtt::Internal::ETCComputer::ConvertFromFakeBT709(MFloat rgb[3], const MFloat yuv[3])
{
    MFloat scaledLuma = yuv[0] * 0.57735026466774571071f;
    MFloat chromaU = yuv[1];
    MFloat chromaV = yuv[2];

    MFloat red = scaledLuma + chromaU * 1.5748000207960953486f;
    MFloat green = scaledLuma - chromaU * 0.46812425854364753669f - chromaV * 0.26491652528157560861f;
    MFloat blue = scaledLuma + chromaV * 2.6242146882856944069f;

    rgb[0] = red;
    rgb[1] = green;
    rgb[2] = blue;
}
2364
2365
2366
void cvtt::Internal::ETCComputer::QuantizeETC2Alpha(int tableIndex, const MUInt15& value, const MUInt15& baseValue, const MUInt15& multiplier, bool is11Bit, bool isSigned, MUInt15& outIndexes, MUInt15& outQuantizedValues)
2367
{
2368
MSInt16 offset = ParallelMath::LosslessCast<MSInt16>::Cast(value) - ParallelMath::LosslessCast<MSInt16>::Cast(baseValue);
2369
MSInt16 offsetTimes2 = offset + offset;
2370
2371
// ETC2's offset tables all have a reflect about 0.5*multiplier
2372
MSInt16 offsetAboutReflectorTimes2 = offsetTimes2 + ParallelMath::LosslessCast<MSInt16>::Cast(multiplier);
2373
2374
MUInt15 absOffsetAboutReflectorTimes2 = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Abs(offsetAboutReflectorTimes2));
2375
MUInt15 lookupIndex = ParallelMath::RightShift(absOffsetAboutReflectorTimes2, 1);
2376
2377
MUInt15 positiveIndex;
2378
MUInt15 positiveOffsetUnmultiplied;
2379
for (int block = 0; block < ParallelMath::ParallelSize; block++)
2380
{
2381
uint16_t blockLookupIndex = ParallelMath::Extract(lookupIndex, block) / ParallelMath::Extract(multiplier, block);
2382
if (blockLookupIndex >= Tables::ETC2::g_alphaRoundingTableWidth)
2383
blockLookupIndex = Tables::ETC2::g_alphaRoundingTableWidth - 1;
2384
uint16_t index = Tables::ETC2::g_alphaRoundingTables[tableIndex][blockLookupIndex];
2385
ParallelMath::PutUInt15(positiveIndex, block, index);
2386
ParallelMath::PutUInt15(positiveOffsetUnmultiplied, block, Tables::ETC2::g_alphaModifierTablePositive[tableIndex][index]);
2387
2388
// TODO: This is suboptimal when the offset is capped. We should detect 0 and 255 values and always map them to the maximum offsets.
2389
// Doing that will also affect refinement though.
2390
}
2391
2392
MSInt16 signBits = ParallelMath::RightShift(offsetAboutReflectorTimes2, 15);
2393
MSInt16 offsetUnmultiplied = ParallelMath::LosslessCast<MSInt16>::Cast(positiveOffsetUnmultiplied) ^ signBits;
2394
MSInt16 quantizedOffset = ParallelMath::CompactMultiply(offsetUnmultiplied, multiplier);
2395
2396
MSInt16 offsetValue = ParallelMath::LosslessCast<MSInt16>::Cast(baseValue) + quantizedOffset;
2397
2398
if (is11Bit)
2399
{
2400
if (isSigned)
2401
outQuantizedValues = ParallelMath::Min(ParallelMath::MakeUInt15(2047), ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(1), offsetValue)));
2402
else
2403
outQuantizedValues = ParallelMath::Min(ParallelMath::MakeUInt15(2047), ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), offsetValue)));
2404
}
2405
else
2406
outQuantizedValues = ParallelMath::Min(ParallelMath::MakeUInt15(255), ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), offsetValue)));
2407
2408
MUInt15 indexSub = ParallelMath::LosslessCast<MUInt15>::Cast(signBits) & ParallelMath::MakeUInt15(4);
2409
2410
outIndexes = positiveIndex + ParallelMath::MakeUInt15(4) - indexSub;
2411
}
2412
2413
2414
// Packs a T-mode block into 8 bytes at outputBuffer.
//
// outputBuffer    - receives the high 32-bit word followed by the low 32-bit word, each written
//                   most significant byte first.
// lineColor       - three channel values, stored at bits 44, 40 and 36.
// isolatedColor   - three channel values; channel 0 is split into two 2-bit halves (bits 59 and 56),
//                   channels 1 and 2 are stored at bits 52 and 48.
// packedSelectors - 2 bits per pixel, pixel i at bits 2*i and 2*i + 1.
// table           - 3-bit table index; bits 1..2 go to bits 34..35 and bit 0 goes to bit 32.
// opaque          - sets bit 33 when true.
void cvtt::Internal::ETCComputer::EmitTModeBlock(uint8_t *outputBuffer, const ParallelMath::ScalarUInt16 lineColor[3], const ParallelMath::ScalarUInt16 isolatedColor[3], int32_t packedSelectors, ParallelMath::ScalarUInt16 table, bool opaque)
{
    // Output bit position -> source pixel index for the selector planes.
    static const int pixelOrder[] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };

    const int redHigh = ((isolatedColor[0] >> 2) & 3);
    const int redLow = (isolatedColor[0] & 3);

    // Padding bits around the split red value: the "overflow low" marker (bit 58) when the two
    // halves sum to less than 4, otherwise the "overflow high" pattern (bits 61..63).
    uint32_t upperWord = (redHigh + redLow < 4) ? (1 << (58 - 32)) : (7 << (61 - 32));

    // Fields, listed from the least significant bit upward.
    upperWord |= (table & 1) << (32 - 32);
    if (opaque)
        upperWord |= 1 << (33 - 32);
    upperWord |= ((table >> 1) & 3) << (34 - 32);
    upperWord |= lineColor[2] << (36 - 32);
    upperWord |= lineColor[1] << (40 - 32);
    upperWord |= lineColor[0] << (44 - 32);
    upperWord |= isolatedColor[2] << (48 - 32);
    upperWord |= isolatedColor[1] << (52 - 32);
    upperWord |= redLow << (56 - 32);
    upperWord |= redHigh << (59 - 32);

    // Selector low bits fill bits 0..15 of the low word, selector high bits fill bits 16..31.
    uint32_t selectorLowPlane = 0;
    uint32_t selectorHighPlane = 0;
    for (int slot = 0; slot < 16; slot++)
    {
        const int sel = (packedSelectors >> (2 * pixelOrder[slot])) & 3;
        selectorLowPlane |= (sel & 0x1) << slot;
        selectorHighPlane |= ((sel >> 1) & 0x1) << slot;
    }
    const uint32_t lowerWord = selectorLowPlane | (selectorHighPlane << 16);

    for (int byteIndex = 0; byteIndex < 4; byteIndex++)
    {
        const int shift = 24 - byteIndex * 8;
        outputBuffer[byteIndex] = (upperWord >> shift) & 0xff;
        outputBuffer[byteIndex + 4] = (lowerWord >> shift) & 0xff;
    }
}
2461
2462
// Packs an H-mode block into 8 bytes at outputBuffer. When both base colors are identical the
// block is emitted through EmitTModeBlock instead.
//
// outputBuffer - receives the high 32-bit word followed by the low 32-bit word, each written
//                most significant byte first.
// blockColors  - two packed base colors: channel 0 at bit 10, channel 1 at bit 5, channel 2 at bit 0.
// sectorBits   - one bit per pixel choosing the base color; inverted here if the bases are reordered.
// signBits     - one bit per pixel, written to the low selector plane.
// table        - 3-bit table index; bit 2 and bit 1 are stored explicitly, bit 0 decides the
//                order in which the two bases are written.
// opaque       - sets bit 33 when true.
void cvtt::Internal::ETCComputer::EmitHModeBlock(uint8_t *outputBuffer, const ParallelMath::ScalarUInt16 blockColors[2], ParallelMath::ScalarUInt16 sectorBits, ParallelMath::ScalarUInt16 signBits, ParallelMath::ScalarUInt16 table, bool opaque)
{
    if (blockColors[0] == blockColors[1])
    {
        // Base colors are the same.
        // If the table low bit isn't 1, then we can't encode this, because swapping the block colors will have no effect
        // on their order.
        // Instead, we encode this as T mode where all of the indexes are on the line.

        // The same color serves as both the line color and the isolated color.
        ParallelMath::ScalarUInt16 sharedColor[3];
        sharedColor[0] = (blockColors[0] >> 10) & 0x1f;
        sharedColor[1] = (blockColors[0] >> 5) & 0x1f;
        sharedColor[2] = (blockColors[0] >> 0) & 0x1f;

        // Every selector starts as 01; the pixel's sign bit becomes the selector's high bit.
        int32_t tModeSelectors = 0x55555555;
        for (int px = 0; px < 16; px++)
            tModeSelectors |= ((signBits >> px) & 1) << ((px * 2) + 1);

        EmitTModeBlock(outputBuffer, sharedColor, sharedColor, tModeSelectors, table, opaque);
        return;
    }

    // Output bit position -> source pixel index for the selector planes.
    static const int pixelOrder[] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };

    // Only the low 4 bits of each 5-bit-strided channel are kept.
    int16_t bases[2][3];
    for (int sector = 0; sector < 2; sector++)
    {
        for (int ch = 0; ch < 3; ch++)
            bases[sector][ch] = (blockColors[sector] >> ((2 - ch) * 5)) & 15;
    }

    // The bases are written in reverse order whenever the table's low bit disagrees with
    // "blockColors[0] > blockColors[1]"; the per-pixel sector choice is inverted to match.
    const bool reverseBases = (((table & 1) == 1) != (blockColors[0] > blockColors[1]));
    const int firstBase = reverseBases ? 1 : 0;
    const int secondBase = 1 - firstBase;
    if (reverseBases)
        sectorBits ^= 0xffff;

    const int r1 = bases[firstBase][0];
    const int g1a = bases[firstBase][1] >> 1;
    const int g1b = (bases[firstBase][1] & 1);
    const int b1a = bases[firstBase][2] >> 3;
    const int b1b = bases[firstBase][2] & 7;
    const int r2 = bases[secondBase][0];
    const int g2 = bases[secondBase][1];
    const int b2 = bases[secondBase][2];

    uint32_t upperWord = 0;

    // Avoid overflowing R
    if ((g1a & 4) != 0 && r1 + g1a < 8)
        upperWord |= 1 << (63 - 32);

    // Padding bits around the G1b/B1a/B1b fields: "overflow low" (bit 50) or "overflow high" (bits 53..55).
    const int fakeDG = b1b >> 1;
    const int fakeG = b1a | (g1b << 1);
    if (fakeG + fakeDG < 4)
        upperWord |= 1 << (50 - 32);
    else
        upperWord |= 7 << (53 - 32);

    const int da = (table >> 2) & 1;
    const int db = (table >> 1) & 1;

    // Fields, listed from the least significant bit upward.
    upperWord |= db << (32 - 32);
    if (opaque)
        upperWord |= 1 << (33 - 32);
    upperWord |= da << (34 - 32);
    upperWord |= b2 << (35 - 32);
    upperWord |= g2 << (39 - 32);
    upperWord |= r2 << (43 - 32);
    upperWord |= b1b << (47 - 32);
    upperWord |= b1a << (51 - 32);
    upperWord |= g1b << (52 - 32);
    upperWord |= g1a << (56 - 32);
    upperWord |= r1 << (59 - 32);

    // Sign bits fill bits 0..15 of the low word, sector bits fill bits 16..31.
    uint32_t signPlane = 0;
    uint32_t sectorPlane = 0;
    for (int slot = 0; slot < 16; slot++)
    {
        signPlane |= ((signBits >> pixelOrder[slot]) & 1) << slot;
        sectorPlane |= ((sectorBits >> pixelOrder[slot]) & 1) << slot;
    }
    const uint32_t lowerWord = signPlane | (sectorPlane << 16);

    for (int byteIndex = 0; byteIndex < 4; byteIndex++)
    {
        const int shift = 24 - byteIndex * 8;
        outputBuffer[byteIndex] = (upperWord >> shift) & 0xff;
        outputBuffer[byteIndex + 4] = (lowerWord >> shift) & 0xff;
    }
}
2564
2565
// Packs an ETC1 block (individual or differential) into 8 bytes at outputBuffer.
//
// outputBuffer       - receives the high 32-bit word followed by the low 32-bit word, each written
//                      most significant byte first.
// blockBestFlip      - flip bit (bit 0 of the high word); also selects the pixel layout in g_flipTables.
// blockBestD         - 0: both colors are stored as 4-bit channels; otherwise the first color is
//                      stored as 5-bit channels and the second as a 3-bit per-channel difference.
// blockBestColors    - [sector][channel] color values.
// blockBestTables    - per-sector table index, stored at bits 5 and 2.
// blockBestSelectors - per-sector selectors, 2 bits per pixel for 8 pixels.
// transparent        - when true, bit 1 (blockBestD << 1) is left clear.
void cvtt::Internal::ETCComputer::EmitETC1Block(uint8_t *outputBuffer, int blockBestFlip, int blockBestD, const int blockBestColors[2][3], const int blockBestTables[2], const ParallelMath::ScalarUInt16 blockBestSelectors[2], bool transparent)
{
    uint32_t highBits = 0;
    uint32_t lowBits = 0;

    // Field values are widened to uint32_t before shifting because the topmost fields reach bit 31.
    if (blockBestD == 0)
    {
        highBits |= static_cast<uint32_t>(blockBestColors[0][0]) << 28;
        highBits |= static_cast<uint32_t>(blockBestColors[1][0]) << 24;
        highBits |= static_cast<uint32_t>(blockBestColors[0][1]) << 20;
        highBits |= static_cast<uint32_t>(blockBestColors[1][1]) << 16;
        highBits |= static_cast<uint32_t>(blockBestColors[0][2]) << 12;
        highBits |= static_cast<uint32_t>(blockBestColors[1][2]) << 8;
    }
    else
    {
        // The "& 7" keeps the low 3 bits of the (possibly negative) difference.
        highBits |= static_cast<uint32_t>(blockBestColors[0][0]) << 27;
        highBits |= static_cast<uint32_t>((blockBestColors[1][0] - blockBestColors[0][0]) & 7) << 24;
        highBits |= static_cast<uint32_t>(blockBestColors[0][1]) << 19;
        highBits |= static_cast<uint32_t>((blockBestColors[1][1] - blockBestColors[0][1]) & 7) << 16;
        highBits |= static_cast<uint32_t>(blockBestColors[0][2]) << 11;
        highBits |= static_cast<uint32_t>((blockBestColors[1][2] - blockBestColors[0][2]) & 7) << 8;
    }

    highBits |= (blockBestTables[0] << 5);
    highBits |= (blockBestTables[1] << 2);
    if (!transparent)
        highBits |= (blockBestD << 1);
    highBits |= blockBestFlip;

    // Internal selector value -> 2-bit code written to the block.
    const uint8_t modifierCodes[4] = { 3, 2, 0, 1 };

    // Scatter the per-sector selectors back to their pixel positions for this flip layout.
    uint8_t unpackedSelectorCodes[16];
    for (int sector = 0; sector < 2; sector++)
    {
        int blockSectorBestSelectors = blockBestSelectors[sector];

        for (int px = 0; px < 8; px++)
        {
            int selector = (blockSectorBestSelectors >> (2 * px)) & 3;
            unpackedSelectorCodes[g_flipTables[blockBestFlip][sector][px]] = modifierCodes[selector];
        }
    }

    // Output bit position -> source pixel index for the selector planes.
    const int pixelSelectorOrder[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };

    // Code bit 0 of every pixel fills bits 0..15, code bit 1 fills bits 16..31.
    for (int sb = 0; sb < 2; sb++)
        for (int px = 0; px < 16; px++)
            lowBits |= static_cast<uint32_t>((unpackedSelectorCodes[pixelSelectorOrder[px]] >> sb) & 1) << (px + sb * 16);

    for (int i = 0; i < 4; i++)
        outputBuffer[i] = (highBits >> (24 - i * 8)) & 0xff;
    for (int i = 0; i < 4; i++)
        outputBuffer[i + 4] = (lowBits >> (24 - i * 8)) & 0xff;
}
2623
2624
// Searches ETC1 encodings for every lane of the parallel group, trying both flip layouts and,
// per layout, the individual (d == 0) and differential (d == 1) color modes. Lanes where one of
// these encodings beats bestTotalError get an ETC1 block written to outputBuffer.
//
// bestTotalError    - in/out; lowest error per lane so far. Updated here for the individual mode
//                     and handed to FindBestDifferentialCombination for the differential mode.
// outputBuffer      - lane i's block is written at outputBuffer + i * 8, only if this mode won for it.
// pixels            - 16 pixels x 3 channels.
// preWeightedPixels - regrouped per sector and forwarded to TestHalfBlock together with pixels.
// drs               - storage for the differential candidates (errors, selectors, colors, tables).
// options           - flags choose fake-BT.709 rounding (ETC_UseFakeBT709) and its accurate
//                     variant (ETC_FakeBT709Accurate); also forwarded to TestHalfBlock.
// punchthrough      - when true the individual mode (d == 0) is skipped.
void cvtt::Internal::ETCComputer::CompressETC1BlockInternal(MFloat &bestTotalError, uint8_t *outputBuffer, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], DifferentialResolveStorage &drs, const Options &options, bool punchthrough)
{
    MUInt15 zeroU15 = ParallelMath::MakeUInt15(0);
    MUInt16 zeroU16 = ParallelMath::MakeUInt16(0);

    MUInt15 bestColors[2] = { zeroU15, zeroU15 };
    MUInt16 bestSelectors[2] = { zeroU16, zeroU16 };
    MUInt15 bestTables[2] = { zeroU15, zeroU15 };
    MUInt15 bestFlip = zeroU15;
    MUInt15 bestD = zeroU15;

    MUInt15 sectorPixels[2][2][8][3];
    MFloat sectorPreWeightedPixels[2][2][8][3];
    MUInt15 sectorCumulative[2][2][3];

    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);

    // Regroup the pixels into the two sectors of each flip layout and sum every sector per channel.
    for (int flip = 0; flip < 2; flip++)
    {
        for (int sector = 0; sector < 2; sector++)
        {
            for (int ch = 0; ch < 3; ch++)
                sectorCumulative[flip][sector][ch] = zeroU15;

            for (int px = 0; px < 8; px++)
            {
                for (int ch = 0; ch < 3; ch++)
                {
                    MUInt15 pixelChannelValue = pixels[g_flipTables[flip][sector][px]][ch];
                    sectorPixels[flip][sector][px][ch] = pixelChannelValue;
                    sectorPreWeightedPixels[flip][sector][px][ch] = preWeightedPixels[g_flipTables[flip][sector][px]][ch];
                    sectorCumulative[flip][sector][ch] = sectorCumulative[flip][sector][ch] + pixelChannelValue;
                }
            }
        }
    }

    static const MSInt16 modifierTables[8][4] =
    {
        { ParallelMath::MakeSInt16(-8), ParallelMath::MakeSInt16(-2), ParallelMath::MakeSInt16(2), ParallelMath::MakeSInt16(8) },
        { ParallelMath::MakeSInt16(-17), ParallelMath::MakeSInt16(-5), ParallelMath::MakeSInt16(5), ParallelMath::MakeSInt16(17) },
        { ParallelMath::MakeSInt16(-29), ParallelMath::MakeSInt16(-9), ParallelMath::MakeSInt16(9), ParallelMath::MakeSInt16(29) },
        { ParallelMath::MakeSInt16(-42), ParallelMath::MakeSInt16(-13), ParallelMath::MakeSInt16(13), ParallelMath::MakeSInt16(42) },
        { ParallelMath::MakeSInt16(-60), ParallelMath::MakeSInt16(-18), ParallelMath::MakeSInt16(18), ParallelMath::MakeSInt16(60) },
        { ParallelMath::MakeSInt16(-80), ParallelMath::MakeSInt16(-24), ParallelMath::MakeSInt16(24), ParallelMath::MakeSInt16(80) },
        { ParallelMath::MakeSInt16(-106), ParallelMath::MakeSInt16(-33), ParallelMath::MakeSInt16(33), ParallelMath::MakeSInt16(106) },
        { ParallelMath::MakeSInt16(-183), ParallelMath::MakeSInt16(-47), ParallelMath::MakeSInt16(47), ParallelMath::MakeSInt16(183) },
    };

    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);

    int minD = punchthrough ? 1 : 0;

    for (int flip = 0; flip < 2; flip++)
    {
        drs.diffNumAttempts[0] = drs.diffNumAttempts[1] = zeroU15;

        MFloat bestIndError[2] = { ParallelMath::MakeFloat(FLT_MAX), ParallelMath::MakeFloat(FLT_MAX) };
        MUInt16 bestIndSelectors[2] = { ParallelMath::MakeUInt16(0), ParallelMath::MakeUInt16(0) };
        MUInt15 bestIndColors[2] = { zeroU15, zeroU15 };
        MUInt15 bestIndTable[2] = { zeroU15, zeroU15 };

        for (int d = minD; d < 2; d++)
        {
            for (int sector = 0; sector < 2; sector++)
            {
                // The offset list is laid out as, per table: a count followed by that many offsets.
                const int16_t *potentialOffsets = cvtt::Tables::ETC1::g_potentialOffsets4;

                for (int table = 0; table < 8; table++)
                {
                    int16_t numOffsets = *potentialOffsets++;

                    MUInt15 possibleColors[cvtt::Tables::ETC1::g_maxPotentialOffsets];

                    MUInt15 quantized[3];
                    for (int oi = 0; oi < numOffsets; oi++)
                    {
                        // Sector sum shifted by the candidate offset, clamped to 0..2040.
                        MUInt15 offsetCumulative[3];
                        for (int ch = 0; ch < 3; ch++)
                        {
                            offsetCumulative[ch] = ParallelMath::Min(
                                ParallelMath::MakeUInt15(2040),
                                ParallelMath::ToUInt15(
                                    ParallelMath::Max(
                                        ParallelMath::MakeSInt16(0),
                                        ParallelMath::LosslessCast<MSInt16>::Cast(sectorCumulative[flip][sector][ch]) + ParallelMath::MakeSInt16(potentialOffsets[oi])
                                    )
                                )
                            );
                        }

                        if (!isFakeBT709)
                        {
                            for (int ch = 0; ch < 3; ch++)
                            {
                                // cu is in range 0..2040
                                MUInt15 cu15 = offsetCumulative[ch];

                                if (d == 1)
                                {
                                    //quantized[ch] = (cu * 31 + (cu >> 3) + 1024) >> 11;
                                    quantized[ch] = ParallelMath::ToUInt15(
                                        ParallelMath::RightShift(
                                            (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3)) + ParallelMath::MakeUInt16(1024)
                                            , 11)
                                        );
                                }
                                else
                                {
                                    //quantized[ch] = (cu * 30 + (cu >> 3) + 2048) >> 12;
                                    quantized[ch] = ParallelMath::ToUInt15(
                                        ParallelMath::RightShift(
                                            (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15 << 1) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3)) + ParallelMath::MakeUInt16(2048)
                                            , 12)
                                        );
                                }
                            }
                        }
                        else
                        {
                            if ((options.flags & cvtt::Flags::ETC_FakeBT709Accurate) != 0)
                                ResolveHalfBlockFakeBT709RoundingAccurate(quantized, offsetCumulative, d == 1);
                            else
                                ResolveHalfBlockFakeBT709RoundingFast(quantized, offsetCumulative, d == 1);
                        }

                        // Channels are packed with a 5-bit stride: channel 0 at bit 0, 1 at bit 5, 2 at bit 10.
                        possibleColors[oi] = quantized[0] | (quantized[1] << 5) | (quantized[2] << 10);
                    }

                    potentialOffsets += numOffsets;

                    // Per lane, collapse runs of identical consecutive candidates in place.
                    ParallelMath::UInt15 numUniqueColors;
                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
                    {
                        uint16_t blockNumUniqueColors = 1;
                        for (int i = 1; i < numOffsets; i++)
                        {
                            uint16_t color = ParallelMath::Extract(possibleColors[i], block);
                            if (color != ParallelMath::Extract(possibleColors[blockNumUniqueColors - 1], block))
                                ParallelMath::PutUInt15(possibleColors[blockNumUniqueColors++], block, color);
                        }

                        ParallelMath::PutUInt15(numUniqueColors, block, blockNumUniqueColors);
                    }

                    int maxUniqueColors = ParallelMath::Extract(numUniqueColors, 0);
                    for (int block = 1; block < ParallelMath::ParallelSize; block++)
                        maxUniqueColors = std::max<int>(maxUniqueColors, ParallelMath::Extract(numUniqueColors, block));

                    // Lanes with fewer unique candidates are padded with their first candidate so
                    // that all lanes can be evaluated for the same number of iterations.
                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
                    {
                        uint16_t fillColor = ParallelMath::Extract(possibleColors[0], block);
                        for (int i = ParallelMath::Extract(numUniqueColors, block); i < maxUniqueColors; i++)
                            ParallelMath::PutUInt15(possibleColors[i], block, fillColor);
                    }

                    for (int i = 0; i < maxUniqueColors; i++)
                    {
                        MFloat error = ParallelMath::MakeFloatZero();
                        MUInt16 selectors = ParallelMath::MakeUInt16(0);
                        MUInt15 candidateColor = possibleColors[i];
                        TestHalfBlock(error, selectors, candidateColor, sectorPixels[flip][sector], sectorPreWeightedPixels[flip][sector], modifierTables[table], d == 1, options);

                        if (d == 0)
                        {
                            // Individual mode: each sector keeps its own best candidate.
                            ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestIndError[sector]));
                            if (ParallelMath::AnySet(errorBetter))
                            {
                                bestIndError[sector] = ParallelMath::Min(error, bestIndError[sector]);
                                ParallelMath::ConditionalSet(bestIndSelectors[sector], errorBetter, selectors);
                                ParallelMath::ConditionalSet(bestIndColors[sector], errorBetter, candidateColor);
                                ParallelMath::ConditionalSet(bestIndTable[sector], errorBetter, ParallelMath::MakeUInt15(table));
                            }
                        }
                        else
                        {
                            // Differential mode: record every candidate for the combination search.
                            // The attempt counter only advances for lanes where this is a real
                            // (non-padding) candidate, so padding entries are overwritten by the
                            // next write to the same slot.
                            ParallelMath::Int16CompFlag isInBounds = ParallelMath::Less(ParallelMath::MakeUInt15(i), numUniqueColors);

                            MUInt15 storageIndexes = drs.diffNumAttempts[sector];
                            drs.diffNumAttempts[sector] = drs.diffNumAttempts[sector] + ParallelMath::SelectOrZero(isInBounds, ParallelMath::MakeUInt15(1));

                            for (int block = 0; block < ParallelMath::ParallelSize; block++)
                            {
                                int storageIndex = ParallelMath::Extract(storageIndexes, block);

                                ParallelMath::PutFloat(drs.diffErrors[sector][storageIndex], block, ParallelMath::Extract(error, block));
                                ParallelMath::PutUInt16(drs.diffSelectors[sector][storageIndex], block, ParallelMath::Extract(selectors, block));
                                ParallelMath::PutUInt15(drs.diffColors[sector][storageIndex], block, ParallelMath::Extract(candidateColor, block));
                                ParallelMath::PutUInt15(drs.diffTables[sector][storageIndex], block, table);
                            }
                        }
                    }
                }
            }

            if (d == 0)
            {
                // The two sectors are independent in individual mode, so their best errors simply add.
                MFloat bestIndErrorTotal = bestIndError[0] + bestIndError[1];
                ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(bestIndErrorTotal, bestTotalError));
                if (ParallelMath::AnySet(errorBetter))
                {
                    bestIsThisMode = bestIsThisMode | errorBetter;

                    bestTotalError = ParallelMath::Min(bestTotalError, bestIndErrorTotal);
                    ParallelMath::ConditionalSet(bestFlip, errorBetter, ParallelMath::MakeUInt15(flip));
                    ParallelMath::ConditionalSet(bestD, errorBetter, ParallelMath::MakeUInt15(d));
                    for (int sector = 0; sector < 2; sector++)
                    {
                        ParallelMath::ConditionalSet(bestColors[sector], errorBetter, bestIndColors[sector]);
                        ParallelMath::ConditionalSet(bestSelectors[sector], errorBetter, bestIndSelectors[sector]);
                        ParallelMath::ConditionalSet(bestTables[sector], errorBetter, bestIndTable[sector]);
                    }
                }
            }
            else
            {
                ParallelMath::Int16CompFlag canIgnoreSector[2] = { ParallelMath::MakeBoolInt16(false), ParallelMath::MakeBoolInt16(false) };
                FindBestDifferentialCombination(flip, d, canIgnoreSector, bestIsThisMode, bestTotalError, bestFlip, bestD, bestColors, bestSelectors, bestTables, drs);
            }
        }
    }

    // Emit a block for every lane where this mode produced the best result.
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
    {
        if (!ParallelMath::Extract(bestIsThisMode, block))
            continue;

        int blockBestFlip = ParallelMath::Extract(bestFlip, block);
        int blockBestD = ParallelMath::Extract(bestD, block);
        int blockBestTables[2] = { ParallelMath::Extract(bestTables[0], block), ParallelMath::Extract(bestTables[1], block) };
        ParallelMath::ScalarUInt16 blockBestSelectors[2] = { ParallelMath::Extract(bestSelectors[0], block), ParallelMath::Extract(bestSelectors[1], block) };

        // Undo the 5-bit-stride packing used for possibleColors.
        int colors[2][3];
        for (int sector = 0; sector < 2; sector++)
        {
            int sectorColor = ParallelMath::Extract(bestColors[sector], block);
            for (int ch = 0; ch < 3; ch++)
                colors[sector][ch] = (sectorColor >> (ch * 5)) & 31;
        }

        EmitETC1Block(outputBuffer + block * 8, blockBestFlip, blockBestD, colors, blockBestTables, blockBestSelectors, false);
    }
}
2883
2884
2885
void cvtt::Internal::ETCComputer::CompressETC1PunchthroughBlockInternal(MFloat &bestTotalError, uint8_t *outputBuffer, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const ParallelMath::Int16CompFlag isTransparent[16], DifferentialResolveStorage &drs, const Options &options)
2886
{
2887
int numTries = 0;
2888
2889
MUInt15 zeroU15 = ParallelMath::MakeUInt15(0);
2890
MUInt16 zeroU16 = ParallelMath::MakeUInt16(0);
2891
2892
MUInt15 bestColors[2] = { zeroU15, zeroU15 };
2893
MUInt16 bestSelectors[2] = { zeroU16, zeroU16 };
2894
MUInt15 bestTables[2] = { zeroU15, zeroU15 };
2895
MUInt15 bestFlip = zeroU15;
2896
2897
MUInt15 sectorPixels[2][2][8][3];
2898
ParallelMath::Int16CompFlag sectorTransparent[2][2][8];
2899
MFloat sectorPreWeightedPixels[2][2][8][3];
2900
MUInt15 sectorCumulative[2][2][3];
2901
2902
ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
2903
2904
for (int flip = 0; flip < 2; flip++)
2905
{
2906
for (int sector = 0; sector < 2; sector++)
2907
{
2908
for (int ch = 0; ch < 3; ch++)
2909
sectorCumulative[flip][sector][ch] = zeroU15;
2910
2911
for (int px = 0; px < 8; px++)
2912
{
2913
for (int ch = 0; ch < 3; ch++)
2914
{
2915
MUInt15 pixelChannelValue = pixels[g_flipTables[flip][sector][px]][ch];
2916
sectorPixels[flip][sector][px][ch] = pixelChannelValue;
2917
sectorPreWeightedPixels[flip][sector][px][ch] = preWeightedPixels[g_flipTables[flip][sector][px]][ch];
2918
sectorCumulative[flip][sector][ch] = sectorCumulative[flip][sector][ch] + pixelChannelValue;
2919
}
2920
2921
sectorTransparent[flip][sector][px] = isTransparent[g_flipTables[flip][sector][px]];
2922
}
2923
}
2924
}
2925
2926
static const MUInt15 modifiers[8] =
2927
{
2928
ParallelMath::MakeUInt15(8),
2929
ParallelMath::MakeUInt15(17),
2930
ParallelMath::MakeUInt15(29),
2931
ParallelMath::MakeUInt15(42),
2932
ParallelMath::MakeUInt15(60),
2933
ParallelMath::MakeUInt15(80),
2934
ParallelMath::MakeUInt15(106),
2935
ParallelMath::MakeUInt15(183),
2936
};
2937
2938
bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
2939
2940
const int maxSectorCumulativeOffsets = 17;
2941
2942
for (int flip = 0; flip < 2; flip++)
2943
{
2944
ParallelMath::Int16CompFlag canIgnoreSector[2] = { ParallelMath::MakeBoolInt16(true), ParallelMath::MakeBoolInt16(false) };
2945
2946
for (int sector = 0; sector < 2; sector++)
2947
for (int px = 0; px < 8; px++)
2948
canIgnoreSector[sector] = canIgnoreSector[sector] & sectorTransparent[flip][sector][px];
2949
2950
drs.diffNumAttempts[0] = drs.diffNumAttempts[1] = zeroU15;
2951
2952
for (int sector = 0; sector < 2; sector++)
2953
{
2954
MUInt15 sectorNumOpaque = ParallelMath::MakeUInt15(0);
2955
for (int px = 0; px < 8; px++)
2956
sectorNumOpaque = sectorNumOpaque + ParallelMath::SelectOrZero(sectorTransparent[flip][sector][px], ParallelMath::MakeUInt15(1));
2957
2958
int sectorMaxOpaque = 0;
2959
for (int block = 0; block < ParallelMath::ParallelSize; block++)
2960
sectorMaxOpaque = std::max<int>(sectorMaxOpaque, ParallelMath::Extract(sectorNumOpaque, block));
2961
2962
int sectorNumOpaqueMultipliers = sectorMaxOpaque * 2 + 1;
2963
2964
MUInt15 sectorNumOpaqueDenominator = ParallelMath::Max(ParallelMath::MakeUInt15(1), sectorNumOpaque) << 8;
2965
MUInt15 sectorNumOpaqueAddend = sectorNumOpaque << 7;
2966
2967
MSInt16 sectorNumOpaqueSigned = ParallelMath::LosslessCast<MSInt16>::Cast(sectorNumOpaque);
2968
MSInt16 negSectorNumOpaqueSigned = ParallelMath::MakeSInt16(0) - sectorNumOpaqueSigned;
2969
2970
MUInt15 sectorCumulativeMax = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(ParallelMath::MakeUInt15(255), sectorNumOpaque));
2971
2972
for (int table = 0; table < 8; table++)
2973
{
2974
MUInt15 possibleColors[maxSectorCumulativeOffsets];
2975
2976
MUInt15 quantized[3];
2977
for (int om = -sectorMaxOpaque; om <= sectorMaxOpaque; om++)
2978
{
2979
MSInt16 clampedOffsetMult = ParallelMath::Max(ParallelMath::Min(ParallelMath::MakeSInt16(om), sectorNumOpaqueSigned), negSectorNumOpaqueSigned);
2980
MSInt16 offset = ParallelMath::CompactMultiply(clampedOffsetMult, modifiers[table]);
2981
2982
for (int ch = 0; ch < 3; ch++)
2983
{
2984
// cu is in range 0..255*numOpaque (at most 0..2040)
2985
MUInt15 cu15 = ParallelMath::Min(
2986
sectorCumulativeMax,
2987
ParallelMath::ToUInt15(
2988
ParallelMath::Max(
2989
ParallelMath::MakeSInt16(0),
2990
ParallelMath::LosslessCast<MSInt16>::Cast(sectorCumulative[flip][sector][ch]) + offset
2991
)
2992
)
2993
);
2994
2995
//quantized[ch] = (cu * 31 + (cu >> 3) + (numOpaque * 128)) / (numOpaque * 256)
2996
MUInt16 cuTimes31 = (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15);
2997
MUInt15 cuDiv8 = ParallelMath::RightShift(cu15, 3);
2998
MUInt16 numerator = cuTimes31 + ParallelMath::LosslessCast<MUInt16>::Cast(cuDiv8 + sectorNumOpaqueAddend);
2999
for (int block = 0; block < ParallelMath::ParallelSize; block++)
3000
ParallelMath::PutUInt15(quantized[ch], block, ParallelMath::Extract(numerator, block) / ParallelMath::Extract(sectorNumOpaqueDenominator, block));
3001
}
3002
3003
possibleColors[om + sectorMaxOpaque] = quantized[0] | (quantized[1] << 5) | (quantized[2] << 10);
3004
}
3005
3006
ParallelMath::UInt15 numUniqueColors;
3007
for (int block = 0; block < ParallelMath::ParallelSize; block++)
3008
{
3009
uint16_t blockNumUniqueColors = 1;
3010
for (int i = 1; i < sectorNumOpaqueMultipliers; i++)
3011
{
3012
uint16_t color = ParallelMath::Extract(possibleColors[i], block);
3013
if (color != ParallelMath::Extract(possibleColors[blockNumUniqueColors - 1], block))
3014
ParallelMath::PutUInt15(possibleColors[blockNumUniqueColors++], block, color);
3015
}
3016
3017
ParallelMath::PutUInt15(numUniqueColors, block, blockNumUniqueColors);
3018
}
3019
3020
int maxUniqueColors = ParallelMath::Extract(numUniqueColors, 0);
3021
for (int block = 1; block < ParallelMath::ParallelSize; block++)
3022
maxUniqueColors = std::max<int>(maxUniqueColors, ParallelMath::Extract(numUniqueColors, block));
3023
3024
for (int block = 0; block < ParallelMath::ParallelSize; block++)
3025
{
3026
uint16_t fillColor = ParallelMath::Extract(possibleColors[0], block);
3027
for (int i = ParallelMath::Extract(numUniqueColors, block); i < maxUniqueColors; i++)
3028
ParallelMath::PutUInt15(possibleColors[i], block, fillColor);
3029
}
3030
3031
for (int i = 0; i < maxUniqueColors; i++)
3032
{
3033
MFloat error = ParallelMath::MakeFloatZero();
3034
MUInt16 selectors = ParallelMath::MakeUInt16(0);
3035
MUInt15 quantized = possibleColors[i];
3036
TestHalfBlockPunchthrough(error, selectors, quantized, sectorPixels[flip][sector], sectorPreWeightedPixels[flip][sector], sectorTransparent[flip][sector], modifiers[table], options);
3037
3038
ParallelMath::Int16CompFlag isInBounds = ParallelMath::Less(ParallelMath::MakeUInt15(i), numUniqueColors);
3039
3040
MUInt15 storageIndexes = drs.diffNumAttempts[sector];
3041
drs.diffNumAttempts[sector] = drs.diffNumAttempts[sector] + ParallelMath::SelectOrZero(isInBounds, ParallelMath::MakeUInt15(1));
3042
3043
for (int block = 0; block < ParallelMath::ParallelSize; block++)
3044
{
3045
int storageIndex = ParallelMath::Extract(storageIndexes, block);
3046
3047
ParallelMath::PutFloat(drs.diffErrors[sector][storageIndex], block, ParallelMath::Extract(error, block));
3048
ParallelMath::PutUInt16(drs.diffSelectors[sector][storageIndex], block, ParallelMath::Extract(selectors, block));
3049
ParallelMath::PutUInt15(drs.diffColors[sector][storageIndex], block, ParallelMath::Extract(quantized, block));
3050
ParallelMath::PutUInt15(drs.diffTables[sector][storageIndex], block, table);
3051
}
3052
}
3053
}
3054
}
3055
3056
MUInt15 bestDDummy = ParallelMath::MakeUInt15(0);
3057
FindBestDifferentialCombination(flip, 1, canIgnoreSector, bestIsThisMode, bestTotalError, bestFlip, bestDDummy, bestColors, bestSelectors, bestTables, drs);
3058
}
3059
3060
for (int block = 0; block < ParallelMath::ParallelSize; block++)
3061
{
3062
if (!ParallelMath::Extract(bestIsThisMode, block))
3063
continue;
3064
3065
int blockBestColors[2][3];
3066
int blockBestTables[2];
3067
ParallelMath::ScalarUInt16 blockBestSelectors[2];
3068
for (int sector = 0; sector < 2; sector++)
3069
{
3070
int sectorColor = ParallelMath::Extract(bestColors[sector], block);
3071
for (int ch = 0; ch < 3; ch++)
3072
blockBestColors[sector][ch] = (sectorColor >> (ch * 5)) & 31;
3073
3074
blockBestTables[sector] = ParallelMath::Extract(bestTables[sector], block);
3075
blockBestSelectors[sector] = ParallelMath::Extract(bestSelectors[sector], block);
3076
}
3077
3078
EmitETC1Block(outputBuffer + block * 8, ParallelMath::Extract(bestFlip, block), 1, blockBestColors, blockBestTables, blockBestSelectors, true);
3079
}
3080
}
3081
3082
3083
// Allocates storage through allocFunc and constructs the ETC1 compression
// data object in it.
//
// allocFunc is invoked as allocFunc(context, sizeInBytes).  The same context
// pointer is forwarded to the constructed object's constructor.
//
// Returns NULL when allocFunc yields a null pointer; otherwise returns the
// newly constructed object as its ETC1CompressionData base.
cvtt::ETC1CompressionData *cvtt::Internal::ETCComputer::AllocETC1Data(cvtt::Kernels::allocFunc_t allocFunc, void *context)
{
    void *buffer = allocFunc(context, sizeof(cvtt::Internal::ETCComputer::ETC1CompressionDataInternal));
    if (!buffer)
        return NULL;

    // Keep the pointer produced by placement new and let the compiler perform
    // the derived-to-base conversion.  Casting the raw void* straight to the
    // base type skips any pointer adjustment and only works when the base
    // subobject happens to live at offset zero.
    cvtt::Internal::ETCComputer::ETC1CompressionDataInternal *internalData =
        new (buffer) cvtt::Internal::ETCComputer::ETC1CompressionDataInternal(context);

    return internalData;
}
3091
3092
// Destroys an ETC1 compression data object and returns its storage through
// freeFunc.
//
// freeFunc is invoked as freeFunc(context, pointer, sizeInBytes), where
// context is the value stored in the object's m_context member.
//
// A null compressionData is ignored: AllocETC1Data returns NULL when the
// allocation fails, so callers may legitimately hold a null pointer here.
void cvtt::Internal::ETCComputer::ReleaseETC1Data(ETC1CompressionData *compressionData, cvtt::Kernels::freeFunc_t freeFunc)
{
    if (!compressionData)
        return;

    cvtt::Internal::ETCComputer::ETC1CompressionDataInternal *internalData =
        static_cast<cvtt::Internal::ETCComputer::ETC1CompressionDataInternal*>(compressionData);

    // m_context must be read before the destructor runs; the object is gone
    // afterwards.
    void *context = internalData->m_context;
    internalData->~ETC1CompressionDataInternal();

    // Hand back the address of the full object (the address placement new
    // constructed at), not the base-class view of it.
    freeFunc(context, internalData, sizeof(cvtt::Internal::ETCComputer::ETC1CompressionDataInternal));
}
3099
3100
// Allocates storage through allocFunc and constructs the ETC2 compression
// data object in it.
//
// allocFunc is invoked as allocFunc(context, sizeInBytes).  The context
// pointer and options are forwarded to the constructed object's constructor.
//
// Returns NULL when allocFunc yields a null pointer; otherwise returns the
// newly constructed object as its ETC2CompressionData base.
cvtt::ETC2CompressionData *cvtt::Internal::ETCComputer::AllocETC2Data(cvtt::Kernels::allocFunc_t allocFunc, void *context, const cvtt::Options &options)
{
    void *buffer = allocFunc(context, sizeof(cvtt::Internal::ETCComputer::ETC2CompressionDataInternal));
    if (!buffer)
        return NULL;

    // Keep the pointer produced by placement new and let the compiler perform
    // the derived-to-base conversion.  Casting the raw void* straight to the
    // base type skips any pointer adjustment and only works when the base
    // subobject happens to live at offset zero.
    cvtt::Internal::ETCComputer::ETC2CompressionDataInternal *internalData =
        new (buffer) cvtt::Internal::ETCComputer::ETC2CompressionDataInternal(context, options);

    return internalData;
}
3108
3109
// Destroys an ETC2 compression data object and returns its storage through
// freeFunc.
//
// freeFunc is invoked as freeFunc(context, pointer, sizeInBytes), where
// context is the value stored in the object's m_context member.
//
// A null compressionData is ignored: AllocETC2Data returns NULL when the
// allocation fails, so callers may legitimately hold a null pointer here.
void cvtt::Internal::ETCComputer::ReleaseETC2Data(ETC2CompressionData *compressionData, cvtt::Kernels::freeFunc_t freeFunc)
{
    if (!compressionData)
        return;

    cvtt::Internal::ETCComputer::ETC2CompressionDataInternal *internalData =
        static_cast<cvtt::Internal::ETCComputer::ETC2CompressionDataInternal*>(compressionData);

    // m_context must be read before the destructor runs; the object is gone
    // afterwards.
    void *context = internalData->m_context;
    internalData->~ETC2CompressionDataInternal();

    // Hand back the address of the full object (the address placement new
    // constructed at), not the base-class view of it.
    freeFunc(context, internalData, sizeof(cvtt::Internal::ETCComputer::ETC2CompressionDataInternal));
}
3116
3117
// Stores the allocator context and derives the two chroma side axes from the
// channel weights in options.
//
// m_chromaSideAxis0 is a seed vector with its component along the weight
// vector (redWeight, greenWeight, blueWeight) removed, which makes it
// perpendicular to the weight vector.  m_chromaSideAxis1 is the cross product
// of axis 0 with the weight vector, rescaled to the same length as axis 0.
//
// The preferred seed is the weight vector rotated by one channel (G, B, R).
// That seed is parallel to the weight vector whenever all three weights are
// equal; projecting it then leaves a zero axis 0 and a 0/0 length ratio, i.e.
// a NaN axis 1.  When the preferred seed is unusable, (R, -G, 0) is tried
// instead.  If neither seed works (for example, all weights are zero), both
// axes are left as zero vectors rather than NaN.
cvtt::Internal::ETCComputer::ETC2CompressionDataInternal::ETC2CompressionDataInternal(void *context, const cvtt::Options &options)
    : m_context(context)
{
    const float cd[3] = { options.redWeight, options.greenWeight, options.blueWeight };
    const float cdLengthSq = cd[0] * cd[0] + cd[1] * cd[1] + cd[2] * cd[2];

    const float seeds[2][3] =
    {
        { cd[1], cd[2], cd[0] },    // preferred: weights rotated by one channel
        { cd[0], -cd[1], 0.0f }     // fallback for equal weights
    };

    float chromaAxis0[3] = { 0.0f, 0.0f, 0.0f };
    float chromaAxis1[3] = { 0.0f, 0.0f, 0.0f };

    // Written as a positive comparison so a NaN length is rejected as well.
    // A zero-length weight vector has no perpendicular plane to span.
    if (cdLengthSq > 0.0f)
    {
        for (int attempt = 0; attempt < 2; attempt++)
        {
            const float *seed = seeds[attempt];

            // Scale factor that cancels the seed's component along cd.
            const float offs = -(seed[0] * cd[0] + seed[1] * cd[1] + seed[2] * cd[2]) / cdLengthSq;

            const float candidateAxis0[3] = { seed[0] + cd[0] * offs, seed[1] + cd[1] * offs, seed[2] + cd[2] * offs };

            // Cross product of axis 0 with the weight vector.
            const float candidateAxis1Unnormalized[3] =
            {
                candidateAxis0[1] * cd[2] - candidateAxis0[2] * cd[1],
                candidateAxis0[2] * cd[0] - candidateAxis0[0] * cd[2],
                candidateAxis0[0] * cd[1] - candidateAxis0[1] * cd[0]
            };

            const float ca0LengthSq = (candidateAxis0[0] * candidateAxis0[0] + candidateAxis0[1] * candidateAxis0[1] + candidateAxis0[2] * candidateAxis0[2]);
            const float ca1UNLengthSq = (candidateAxis1Unnormalized[0] * candidateAxis1Unnormalized[0] + candidateAxis1Unnormalized[1] * candidateAxis1Unnormalized[1] + candidateAxis1Unnormalized[2] * candidateAxis1Unnormalized[2]);

            // Seed was parallel to cd (or the lengths underflowed): try the next one.
            if (!(ca0LengthSq > 0.0f) || !(ca1UNLengthSq > 0.0f))
                continue;

            // Rescale axis 1 so both axes have the same length.
            const float lengthRatio = static_cast<float>(std::sqrt(ca0LengthSq / ca1UNLengthSq));

            for (int i = 0; i < 3; i++)
            {
                chromaAxis0[i] = candidateAxis0[i];
                chromaAxis1[i] = candidateAxis1Unnormalized[i] * lengthRatio;
            }
            break;
        }
    }

    for (int i = 0; i < 3; i++)
    {
        m_chromaSideAxis0[i] = chromaAxis0[i];
        m_chromaSideAxis1[i] = chromaAxis1[i];
    }
}
3146
3147
#endif
3148
3149