// Path: blob/master/thirdparty/cvtt/ConvectionKernels_ETC.cpp
// 9902 views
/*1Convection Texture Tools2Copyright (c) 2018-2019 Eric Lasota34Permission is hereby granted, free of charge, to any person obtaining5a copy of this software and associated documentation files (the6"Software"), to deal in the Software without restriction, including7without limitation the rights to use, copy, modify, merge, publish,8distribute, sublicense, and/or sell copies of the Software, and to9permit persons to whom the Software is furnished to do so, subject10to the following conditions:1112The above copyright notice and this permission notice shall be included13in all copies or substantial portions of the Software.1415THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS16OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF17MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.18IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY19CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,20TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE21SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.2223-------------------------------------------------------------------------------------2425Portions based on DirectX Texture Library (DirectXTex)2627Copyright (c) Microsoft Corporation. 
All rights reserved.28Licensed under the MIT License.2930http://go.microsoft.com/fwlink/?LinkId=24892631*/32#include "ConvectionKernels_Config.h"3334#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)3536#include "ConvectionKernels.h"37#include "ConvectionKernels_ETC.h"38#include "ConvectionKernels_ETC1.h"39#include "ConvectionKernels_ETC2.h"40#include "ConvectionKernels_ETC2_Rounding.h"41#include "ConvectionKernels_ParallelMath.h"42#include "ConvectionKernels_FakeBT709_Rounding.h"4344#include <cmath>4546const int cvtt::Internal::ETCComputer::g_flipTables[2][2][8] =47{48{49{ 0, 1, 4, 5, 8, 9, 12, 13 },50{ 2, 3, 6, 7, 10, 11, 14, 15 }51},52{53{ 0, 1, 2, 3, 4, 5, 6, 7 },54{ 8, 9, 10, 11, 12, 13, 14, 15 }55},56};5758cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorUniform(const MUInt15 pixelA[3], const MUInt15 pixelB[3])59{60MSInt16 d0 = ParallelMath::LosslessCast<MSInt16>::Cast(pixelA[0]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixelB[0]);61MFloat fd0 = ParallelMath::ToFloat(d0);62MFloat error = fd0 * fd0;63for (int ch = 1; ch < 3; ch++)64{65MSInt16 d = ParallelMath::LosslessCast<MSInt16>::Cast(pixelA[ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixelB[ch]);66MFloat fd = ParallelMath::ToFloat(d);67error = error + fd * fd;68}69return error;70}7172cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorWeighted(const MUInt15 reconstructed[3], const MFloat preWeightedPixel[3], const Options options)73{74MFloat dr = ParallelMath::ToFloat(reconstructed[0]) * options.redWeight - preWeightedPixel[0];75MFloat dg = ParallelMath::ToFloat(reconstructed[1]) * options.greenWeight - preWeightedPixel[1];76MFloat db = ParallelMath::ToFloat(reconstructed[2]) * options.blueWeight - preWeightedPixel[2];7778return dr * dr + dg * dg + db * db;79}8081cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorFakeBT709(const MUInt15 reconstructed[3], const MFloat preWeightedPixel[3])82{83MFloat yuv[3];84ConvertToFakeBT709(yuv, 
reconstructed);8586MFloat dy = yuv[0] - preWeightedPixel[0];87MFloat du = yuv[1] - preWeightedPixel[1];88MFloat dv = yuv[2] - preWeightedPixel[2];8990return dy * dy + du * du + dv * dv;91}9293void cvtt::Internal::ETCComputer::TestHalfBlock(MFloat &outError, MUInt16 &outSelectors, MUInt15 quantizedPackedColor, const MUInt15 pixels[8][3], const MFloat preWeightedPixels[8][3], const MSInt16 modifiers[4], bool isDifferential, const Options &options)94{95MUInt15 quantized[3];96MUInt15 unquantized[3];9798for (int ch = 0; ch < 3; ch++)99{100quantized[ch] = (ParallelMath::RightShift(quantizedPackedColor, (ch * 5)) & ParallelMath::MakeUInt15(31));101102if (isDifferential)103unquantized[ch] = (quantized[ch] << 3) | ParallelMath::RightShift(quantized[ch], 2);104else105unquantized[ch] = (quantized[ch] << 4) | quantized[ch];106}107108MUInt16 selectors = ParallelMath::MakeUInt16(0);109MFloat totalError = ParallelMath::MakeFloatZero();110111MUInt15 u15_255 = ParallelMath::MakeUInt15(255);112MSInt16 s16_zero = ParallelMath::MakeSInt16(0);113114MUInt15 unquantizedModified[4][3];115for (unsigned int s = 0; s < 4; s++)116for (int ch = 0; ch < 3; ch++)117unquantizedModified[s][ch] = ParallelMath::Min(ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::ToSInt16(unquantized[ch]) + modifiers[s], s16_zero)), u15_255);118119bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);120bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);121122for (int px = 0; px < 8; px++)123{124MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);125MUInt16 bestSelector = ParallelMath::MakeUInt16(0);126127for (unsigned int s = 0; s < 4; s++)128{129MFloat error;130if (isFakeBT709)131error = ComputeErrorFakeBT709(unquantizedModified[s], preWeightedPixels[px]);132else if (isUniform)133error = ComputeErrorUniform(pixels[px], unquantizedModified[s]);134else135error = ComputeErrorWeighted(unquantizedModified[s], preWeightedPixels[px], options);136137ParallelMath::FloatCompFlag 
errorBetter = ParallelMath::Less(error, bestError);138bestSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt16(s), bestSelector);139bestError = ParallelMath::Min(error, bestError);140}141142totalError = totalError + bestError;143selectors = selectors | (bestSelector << (px * 2));144}145146outError = totalError;147outSelectors = selectors;148}149150void cvtt::Internal::ETCComputer::TestHalfBlockPunchthrough(MFloat &outError, MUInt16 &outSelectors, MUInt15 quantizedPackedColor, const MUInt15 pixels[8][3], const MFloat preWeightedPixels[8][3], const ParallelMath::Int16CompFlag isTransparent[8], const MUInt15 modifier, const Options &options)151{152MUInt15 quantized[3];153MUInt15 unquantized[3];154155for (int ch = 0; ch < 3; ch++)156{157quantized[ch] = (ParallelMath::RightShift(quantizedPackedColor, (ch * 5)) & ParallelMath::MakeUInt15(31));158unquantized[ch] = (quantized[ch] << 3) | ParallelMath::RightShift(quantized[ch], 2);159}160161MUInt16 selectors = ParallelMath::MakeUInt16(0);162MFloat totalError = ParallelMath::MakeFloatZero();163164MUInt15 u15_255 = ParallelMath::MakeUInt15(255);165MSInt16 s16_zero = ParallelMath::MakeSInt16(0);166167MUInt15 unquantizedModified[3][3];168for (int ch = 0; ch < 3; ch++)169{170unquantizedModified[0][ch] = ParallelMath::Max(unquantized[ch], modifier) - modifier;171unquantizedModified[1][ch] = unquantized[ch];172unquantizedModified[2][ch] = ParallelMath::Min(unquantized[ch] + modifier, u15_255);173}174175bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);176bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);177178for (int px = 0; px < 8; px++)179{180ParallelMath::FloatCompFlag isTransparentFloat = ParallelMath::Int16FlagToFloat(isTransparent[px]);181182MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);183MUInt15 bestSelector = ParallelMath::MakeUInt15(0);184185for (unsigned int s = 0; s < 3; s++)186{187MFloat error;188if (isFakeBT709)189error = 
ComputeErrorFakeBT709(unquantizedModified[s], preWeightedPixels[px]);190else if (isUniform)191error = ComputeErrorUniform(pixels[px], unquantizedModified[s]);192else193error = ComputeErrorWeighted(unquantizedModified[s], preWeightedPixels[px], options);194195ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);196bestSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(s), bestSelector);197bestError = ParallelMath::Min(error, bestError);198}199200// Annoying quirk: The ETC encoding machinery assumes that selectors are in the table order in the spec, which isn't201// the same as their encoding bits, so the transparent index is actually 1 and the valid indexes are 0, 2, and 3.202203// Remap selector 1 to 2, and 2 to 3204bestSelector = ParallelMath::Min(ParallelMath::MakeUInt15(3), bestSelector << 1);205206// Mark zero transparent as207ParallelMath::ConditionalSet(bestError, isTransparentFloat, ParallelMath::MakeFloatZero());208ParallelMath::ConditionalSet(bestSelector, isTransparent[px], ParallelMath::MakeUInt15(1));209210totalError = totalError + bestError;211selectors = selectors | (ParallelMath::LosslessCast<MUInt16>::Cast(bestSelector) << (px * 2));212}213214outError = totalError;215outSelectors = selectors;216}217218void cvtt::Internal::ETCComputer::FindBestDifferentialCombination(int flip, int d, const ParallelMath::Int16CompFlag canIgnoreSector[2], ParallelMath::Int16CompFlag& bestIsThisMode, MFloat& bestTotalError, MUInt15& bestFlip, MUInt15& bestD, MUInt15 bestColors[2], MUInt16 bestSelectors[2], MUInt15 bestTables[2], DifferentialResolveStorage &drs)219{220// We do this part scalar because most of the cost benefit of parallelization is in error evaluation,221// and this code has a LOT of early-outs and disjointed index lookups that vary heavily between blocks222// and save a lot of time.223for (int block = 0; block < ParallelMath::ParallelSize; block++)224{225bool canIgnore[2] = { 
ParallelMath::Extract(canIgnoreSector[0], block), ParallelMath::Extract(canIgnoreSector[1], block) };226bool canIgnoreEither = canIgnore[0] || canIgnore[1];227float blockBestTotalError = ParallelMath::Extract(bestTotalError, block);228float bestDiffErrors[2] = { FLT_MAX, FLT_MAX };229uint16_t bestDiffSelectors[2] = { 0, 0 };230uint16_t bestDiffColors[2] = { 0, 0 };231uint16_t bestDiffTables[2] = { 0, 0 };232for (int sector = 0; sector < 2; sector++)233{234unsigned int sectorNumAttempts = ParallelMath::Extract(drs.diffNumAttempts[sector], block);235for (unsigned int i = 0; i < sectorNumAttempts; i++)236{237float error = ParallelMath::Extract(drs.diffErrors[sector][i], block);238if (error < bestDiffErrors[sector])239{240bestDiffErrors[sector] = error;241bestDiffSelectors[sector] = ParallelMath::Extract(drs.diffSelectors[sector][i], block);242bestDiffColors[sector] = ParallelMath::Extract(drs.diffColors[sector][i], block);243bestDiffTables[sector] = ParallelMath::Extract(drs.diffTables[sector][i], block);244}245}246}247248if (canIgnore[0])249bestDiffColors[0] = bestDiffColors[1];250else if (canIgnore[1])251bestDiffColors[1] = bestDiffColors[0];252253// The best differential possibilities must be better than the best total error254if (bestDiffErrors[0] + bestDiffErrors[1] < blockBestTotalError)255{256// Fast path if the best possible case is legal257if (canIgnoreEither || ETCDifferentialIsLegalScalar(bestDiffColors[0], bestDiffColors[1]))258{259ParallelMath::PutBoolInt16(bestIsThisMode, block, true);260ParallelMath::PutFloat(bestTotalError, block, bestDiffErrors[0] + bestDiffErrors[1]);261ParallelMath::PutUInt15(bestFlip, block, flip);262ParallelMath::PutUInt15(bestD, block, d);263for (int sector = 0; sector < 2; sector++)264{265ParallelMath::PutUInt15(bestColors[sector], block, bestDiffColors[sector]);266ParallelMath::PutUInt16(bestSelectors[sector], block, bestDiffSelectors[sector]);267ParallelMath::PutUInt15(bestTables[sector], block, 
bestDiffTables[sector]);268}269}270else271{272// Slow path: Sort the possible cases by quality, and search valid combinations273// TODO: Pre-flatten the error lists so this is nicer to cache274unsigned int numSortIndexes[2] = { 0, 0 };275for (int sector = 0; sector < 2; sector++)276{277unsigned int sectorNumAttempts = ParallelMath::Extract(drs.diffNumAttempts[sector], block);278279for (unsigned int i = 0; i < sectorNumAttempts; i++)280{281if (ParallelMath::Extract(drs.diffErrors[sector][i], block) < blockBestTotalError)282drs.attemptSortIndexes[sector][numSortIndexes[sector]++] = i;283}284285struct SortPredicate286{287const MFloat *diffErrors;288int block;289290bool operator()(uint16_t a, uint16_t b) const291{292float errorA = ParallelMath::Extract(diffErrors[a], block);293float errorB = ParallelMath::Extract(diffErrors[b], block);294295if (errorA < errorB)296return true;297if (errorA > errorB)298return false;299300return a < b;301}302};303304SortPredicate sp;305sp.diffErrors = drs.diffErrors[sector];306sp.block = block;307308std::sort<uint16_t*, const SortPredicate&>(drs.attemptSortIndexes[sector], drs.attemptSortIndexes[sector] + numSortIndexes[sector], sp);309}310311int scannedElements = 0;312for (unsigned int i = 0; i < numSortIndexes[0]; i++)313{314unsigned int attemptIndex0 = drs.attemptSortIndexes[0][i];315float error0 = ParallelMath::Extract(drs.diffErrors[0][attemptIndex0], block);316317scannedElements++;318319if (error0 >= blockBestTotalError)320break;321322float maxError1 = ParallelMath::Extract(bestTotalError, block) - error0;323uint16_t diffColor0 = ParallelMath::Extract(drs.diffColors[0][attemptIndex0], block);324325if (maxError1 < bestDiffErrors[1])326break;327328for (unsigned int j = 0; j < numSortIndexes[1]; j++)329{330unsigned int attemptIndex1 = drs.attemptSortIndexes[1][j];331float error1 = ParallelMath::Extract(drs.diffErrors[1][attemptIndex1], block);332333scannedElements++;334335if (error1 >= maxError1)336break;337338uint16_t diffColor1 = 
ParallelMath::Extract(drs.diffColors[1][attemptIndex1], block);339340if (ETCDifferentialIsLegalScalar(diffColor0, diffColor1))341{342blockBestTotalError = error0 + error1;343344ParallelMath::PutBoolInt16(bestIsThisMode, block, true);345ParallelMath::PutFloat(bestTotalError, block, blockBestTotalError);346ParallelMath::PutUInt15(bestFlip, block, flip);347ParallelMath::PutUInt15(bestD, block, d);348ParallelMath::PutUInt15(bestColors[0], block, diffColor0);349ParallelMath::PutUInt15(bestColors[1], block, diffColor1);350ParallelMath::PutUInt16(bestSelectors[0], block, ParallelMath::Extract(drs.diffSelectors[0][attemptIndex0], block));351ParallelMath::PutUInt16(bestSelectors[1], block, ParallelMath::Extract(drs.diffSelectors[1][attemptIndex1], block));352ParallelMath::PutUInt15(bestTables[0], block, ParallelMath::Extract(drs.diffTables[0][attemptIndex0], block));353ParallelMath::PutUInt15(bestTables[1], block, ParallelMath::Extract(drs.diffTables[1][attemptIndex1], block));354break;355}356}357}358}359}360}361}362363cvtt::ParallelMath::Int16CompFlag cvtt::Internal::ETCComputer::ETCDifferentialIsLegalForChannel(const MUInt15 &a, const MUInt15 &b)364{365MSInt16 diff = ParallelMath::LosslessCast<MSInt16>::Cast(b) - ParallelMath::LosslessCast<MSInt16>::Cast(a);366367return ParallelMath::Less(ParallelMath::MakeSInt16(-5), diff) & ParallelMath::Less(diff, ParallelMath::MakeSInt16(4));368}369370cvtt::ParallelMath::Int16CompFlag cvtt::Internal::ETCComputer::ETCDifferentialIsLegal(const MUInt15 &a, const MUInt15 &b)371{372MUInt15 mask = ParallelMath::MakeUInt15(31);373374return ETCDifferentialIsLegalForChannel(ParallelMath::RightShift(a, 10), ParallelMath::RightShift(b, 10))375& ETCDifferentialIsLegalForChannel(ParallelMath::RightShift(a, 5) & mask, ParallelMath::RightShift(b, 5) & mask)376& ETCDifferentialIsLegalForChannel(a & mask, b & mask);377}378379bool cvtt::Internal::ETCComputer::ETCDifferentialIsLegalForChannelScalar(const uint16_t &a, const uint16_t &b)380{381int16_t 
diff = static_cast<int16_t>(b) - static_cast<int16_t>(a);382383return (-4 <= diff) && (diff <= 3);384}385386bool cvtt::Internal::ETCComputer::ETCDifferentialIsLegalScalar(const uint16_t &a, const uint16_t &b)387{388MUInt15 mask = ParallelMath::MakeUInt15(31);389390return ETCDifferentialIsLegalForChannelScalar((a >> 10), (b >> 10))391& ETCDifferentialIsLegalForChannelScalar((a >> 5) & 31, (b >> 5) & 31)392& ETCDifferentialIsLegalForChannelScalar(a & 31, b & 31);393}394395void cvtt::Internal::ETCComputer::EncodeTMode(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag isIsolated[16], const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const Options &options)396{397bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);398bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);399400ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);401402MUInt15 isolatedTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };403MUInt15 lineTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };404405MUInt15 numPixelsIsolated = ParallelMath::MakeUInt15(0);406407// To speed this up, we compute line total as the sum, then subtract out isolated408for (unsigned int px = 0; px < 16; px++)409{410for (int ch = 0; ch < 3; ch++)411{412isolatedTotal[ch] = isolatedTotal[ch] + ParallelMath::SelectOrZero(isIsolated[px], pixels[px][ch]);413lineTotal[ch] = lineTotal[ch] + pixels[px][ch];414}415numPixelsIsolated = numPixelsIsolated + ParallelMath::SelectOrZero(isIsolated[px], ParallelMath::MakeUInt15(1));416}417418for (int ch = 0; ch < 3; ch++)419lineTotal[ch] = lineTotal[ch] - isolatedTotal[ch];420421MUInt15 numPixelsLine = ParallelMath::MakeUInt15(16) - numPixelsIsolated;422423MUInt15 isolatedAverageQuantized[3];424MUInt15 isolatedAverageTargets[3];425{426int divisors[ParallelMath::ParallelSize];427for (int block = 
0; block < ParallelMath::ParallelSize; block++)428divisors[block] = ParallelMath::Extract(numPixelsIsolated, block) * 34;429430MUInt15 addend = (numPixelsIsolated << 4) | numPixelsIsolated;431for (int ch = 0; ch < 3; ch++)432{433// isolatedAverageQuantized[ch] = (isolatedTotal[ch] * 2 + numPixelsIsolated * 17) / (numPixelsIsolated * 34);434435MUInt15 numerator = isolatedTotal[ch] + isolatedTotal[ch];436if (!isFakeBT709)437numerator = numerator + addend;438439for (int block = 0; block < ParallelMath::ParallelSize; block++)440{441int divisor = divisors[block];442if (divisor == 0)443ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, 0);444else445ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, ParallelMath::Extract(numerator, block) / divisor);446}447448isolatedAverageTargets[ch] = numerator;449}450}451452if (isFakeBT709)453ResolveTHFakeBT709Rounding(isolatedAverageQuantized, isolatedAverageTargets, numPixelsIsolated);454455MUInt15 isolatedColor[3];456for (int ch = 0; ch < 3; ch++)457isolatedColor[ch] = (isolatedAverageQuantized[ch]) | (isolatedAverageQuantized[ch] << 4);458459MFloat isolatedError[16];460for (int px = 0; px < 16; px++)461{462if (isFakeBT709)463isolatedError[px] = ComputeErrorFakeBT709(isolatedColor, preWeightedPixels[px]);464else if (isUniform)465isolatedError[px] = ComputeErrorUniform(pixels[px], isolatedColor);466else467isolatedError[px] = ComputeErrorWeighted(isolatedColor, preWeightedPixels[px], options);468}469470MSInt32 bestSelectors = ParallelMath::MakeSInt32(0);471MUInt15 bestTable = ParallelMath::MakeUInt15(0);472MUInt15 bestLineColor = ParallelMath::MakeUInt15(0);473474MSInt16 maxLine = ParallelMath::LosslessCast<MSInt16>::Cast(numPixelsLine);475MSInt16 minLine = ParallelMath::MakeSInt16(0) - maxLine;476477int16_t clusterMaxLine = 0;478for (int block = 0; block < ParallelMath::ParallelSize; block++)479{480int16_t blockMaxLine = ParallelMath::Extract(maxLine, block);481if (blockMaxLine > clusterMaxLine)482clusterMaxLine 
= blockMaxLine;483}484485int16_t clusterMinLine = -clusterMaxLine;486487int lineDivisors[ParallelMath::ParallelSize];488for (int block = 0; block < ParallelMath::ParallelSize; block++)489lineDivisors[block] = ParallelMath::Extract(numPixelsLine, block) * 34;490491MUInt15 lineAddend = (numPixelsLine << 4) | numPixelsLine;492493for (int table = 0; table < 8; table++)494{495int numUniqueColors[ParallelMath::ParallelSize];496MUInt15 uniqueQuantizedColors[31];497498for (int block = 0; block < ParallelMath::ParallelSize; block++)499numUniqueColors[block] = 0;500501MUInt15 modifier = ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]);502MUInt15 modifierOffset = (modifier + modifier);503504for (int16_t offsetPremultiplier = clusterMinLine; offsetPremultiplier <= clusterMaxLine; offsetPremultiplier++)505{506MSInt16 clampedOffsetPremultiplier = ParallelMath::Max(minLine, ParallelMath::Min(maxLine, ParallelMath::MakeSInt16(offsetPremultiplier)));507MSInt16 modifierAddend = ParallelMath::CompactMultiply(clampedOffsetPremultiplier, modifierOffset);508509MUInt15 quantized[3];510if (isFakeBT709)511{512MUInt15 targets[3];513for (int ch = 0; ch < 3; ch++)514{515//quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));516MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch]) + modifierAddend));517MUInt15 divided = ParallelMath::MakeUInt15(0);518for (int block = 0; block < ParallelMath::ParallelSize; block++)519{520int divisor = lineDivisors[block];521if (divisor == 0)522ParallelMath::PutUInt15(divided, block, 0);523else524ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);525}526quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);527targets[ch] = numerator;528}529530ResolveTHFakeBT709Rounding(quantized, 
targets, numPixelsLine);531}532else533{534for (int ch = 0; ch < 3; ch++)535{536//quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + numDAIILine * 17 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));537MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch] + lineAddend) + modifierAddend));538MUInt15 divided = ParallelMath::MakeUInt15(0);539for (int block = 0; block < ParallelMath::ParallelSize; block++)540{541int divisor = lineDivisors[block];542if (divisor == 0)543ParallelMath::PutUInt15(divided, block, 0);544else545ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);546}547quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);548}549}550551MUInt15 packedColor = quantized[0] | (quantized[1] << 5) | (quantized[2] << 10);552553for (int block = 0; block < ParallelMath::ParallelSize; block++)554{555uint16_t blockPackedColor = ParallelMath::Extract(packedColor, block);556if (numUniqueColors[block] == 0 || blockPackedColor != ParallelMath::Extract(uniqueQuantizedColors[numUniqueColors[block] - 1], block))557ParallelMath::PutUInt15(uniqueQuantizedColors[numUniqueColors[block]++], block, blockPackedColor);558}559}560561// Stripe unfilled unique colors562int maxUniqueColors = 0;563for (int block = 0; block < ParallelMath::ParallelSize; block++)564{565if (numUniqueColors[block] > maxUniqueColors)566maxUniqueColors = numUniqueColors[block];567}568569for (int block = 0; block < ParallelMath::ParallelSize; block++)570{571uint16_t fillColor = ParallelMath::Extract(uniqueQuantizedColors[0], block);572573int numUnique = numUniqueColors[block];574for (int fill = numUnique + 1; fill < maxUniqueColors; fill++)575ParallelMath::PutUInt15(uniqueQuantizedColors[fill], block, fillColor);576}577578for (int ci = 0; ci < maxUniqueColors; ci++)579{580MUInt15 
lineColors[3][3];581for (int ch = 0; ch < 3; ch++)582{583MUInt15 quantizedChannel = (ParallelMath::RightShift(uniqueQuantizedColors[ci], (ch * 5)) & ParallelMath::MakeUInt15(15));584585MUInt15 unquantizedColor = (quantizedChannel << 4) | quantizedChannel;586lineColors[0][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantizedColor + modifier);587lineColors[1][ch] = unquantizedColor;588lineColors[2][ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));589}590591MSInt32 selectors = ParallelMath::MakeSInt32(0);592MFloat error = ParallelMath::MakeFloatZero();593for (int px = 0; px < 16; px++)594{595MFloat pixelError = isolatedError[px];596597MUInt15 pixelBestSelector = ParallelMath::MakeUInt15(0);598for (int i = 0; i < 3; i++)599{600MFloat error = isUniform ? ComputeErrorUniform(lineColors[i], pixels[px]) : ComputeErrorWeighted(lineColors[i], preWeightedPixels[px], options);601ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, pixelError);602pixelError = ParallelMath::Min(error, pixelError);603pixelBestSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(i + 1), pixelBestSelector);604}605606error = error + pixelError;607selectors = selectors | (ParallelMath::ToInt32(pixelBestSelector) << (px * 2));608}609610ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));611bestError = ParallelMath::Min(error, bestError);612613if (ParallelMath::AnySet(errorBetter))614{615ParallelMath::ConditionalSet(bestLineColor, errorBetter, uniqueQuantizedColors[ci]);616ParallelMath::ConditionalSet(bestSelectors, errorBetter, selectors);617ParallelMath::ConditionalSet(bestTable, errorBetter, ParallelMath::MakeUInt15(table));618bestIsThisMode = bestIsThisMode | errorBetter;619}620}621}622623for (int block = 0; block < 
ParallelMath::ParallelSize; block++)624{625if (ParallelMath::Extract(bestIsThisMode, block))626{627uint32_t lowBits = 0;628uint32_t highBits = 0;629630uint16_t blockBestLineColor = ParallelMath::Extract(bestLineColor, block);631ParallelMath::ScalarUInt16 blockIsolatedAverageQuantized[3];632633for (int ch = 0; ch < 3; ch++)634blockIsolatedAverageQuantized[ch] = ParallelMath::Extract(isolatedAverageQuantized[ch], block);635636uint16_t blockBestTable = ParallelMath::Extract(bestTable, block);637int32_t blockBestSelectors = ParallelMath::Extract(bestSelectors, block);638639ParallelMath::ScalarUInt16 lineColor[3];640for (int ch = 0; ch < 3; ch++)641lineColor[ch] = (blockBestLineColor >> (ch * 5)) & 15;642643EmitTModeBlock(outputBuffer + block * 8, lineColor, blockIsolatedAverageQuantized, blockBestSelectors, blockBestTable, true);644}645}646}647648void cvtt::Internal::ETCComputer::EncodeHMode(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag groupings[16], const MUInt15 pixels[16][3], HModeEval &he, const MFloat preWeightedPixels[16][3], const Options &options)649{650bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);651bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);652653MUInt15 zero15 = ParallelMath::MakeUInt15(0);654655MUInt15 counts[2] = { zero15, zero15 };656657ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);658659MUInt15 totals[2][3] =660{661{ zero15, zero15, zero15 },662{ zero15, zero15, zero15 }663};664665for (unsigned int px = 0; px < 16; px++)666{667for (int ch = 0; ch < 3; ch++)668{669totals[0][ch] = totals[0][ch] + pixels[px][ch];670totals[1][ch] = totals[1][ch] + ParallelMath::SelectOrZero(groupings[px], pixels[px][ch]);671}672counts[1] = counts[1] + ParallelMath::SelectOrZero(groupings[px], ParallelMath::MakeUInt15(1));673}674675for (int ch = 0; ch < 3; ch++)676totals[0][ch] = totals[0][ch] - totals[1][ch];677counts[0] = ParallelMath::MakeUInt15(16) - 
counts[1];678679MUInt16 bestSectorBits = ParallelMath::MakeUInt16(0);680MUInt16 bestSignBits = ParallelMath::MakeUInt16(0);681MUInt15 bestColors[2] = { zero15, zero15 };682MUInt15 bestTable = ParallelMath::MakeUInt15(0);683684for (int table = 0; table < 8; table++)685{686MUInt15 numUniqueColors = zero15;687688int modifier = cvtt::Tables::ETC1::g_thModifierTable[table];689690for (int sector = 0; sector < 2; sector++)691{692for (int block = 0; block < ParallelMath::ParallelSize; block++)693{694int blockNumUniqueColors = 0;695uint16_t blockUniqueQuantizedColors[31];696697int maxOffsetMultiplier = ParallelMath::Extract(counts[sector], block);698int minOffsetMultiplier = -maxOffsetMultiplier;699700int modifierOffset = modifier * 2;701702int blockSectorCounts = ParallelMath::Extract(counts[sector], block);703int blockSectorTotals[3];704for (int ch = 0; ch < 3; ch++)705blockSectorTotals[ch] = ParallelMath::Extract(totals[sector][ch], block);706707for (int offsetPremultiplier = minOffsetMultiplier; offsetPremultiplier <= maxOffsetMultiplier; offsetPremultiplier++)708{709// TODO: This isn't ideal for FakeBT709710int16_t quantized[3];711for (int ch = 0; ch < 3; ch++)712{713if (blockSectorCounts == 0)714quantized[ch] = 0;715else716quantized[ch] = std::min<int16_t>(15, std::max<int16_t>(0, (blockSectorTotals[ch] * 2 + blockSectorCounts * 17 + modifierOffset * offsetPremultiplier)) / (blockSectorCounts * 34));717}718719uint16_t packedColor = (quantized[0] << 10) | (quantized[1] << 5) | quantized[2];720if (blockNumUniqueColors == 0 || packedColor != blockUniqueQuantizedColors[blockNumUniqueColors - 1])721{722assert(blockNumUniqueColors < 32);723blockUniqueQuantizedColors[blockNumUniqueColors++] = packedColor;724}725}726727ParallelMath::PutUInt15(he.numUniqueColors[sector], block, blockNumUniqueColors);728729int baseIndex = 0;730if (sector == 1)731baseIndex = ParallelMath::Extract(he.numUniqueColors[0], block);732733for (int i = 0; i < blockNumUniqueColors; 
i++)734ParallelMath::PutUInt15(he.uniqueQuantizedColors[baseIndex + i], block, blockUniqueQuantizedColors[i]);735}736}737738MUInt15 totalColors = he.numUniqueColors[0] + he.numUniqueColors[1];739int maxErrorColors = 0;740for (int block = 0; block < ParallelMath::ParallelSize; block++)741maxErrorColors = std::max<int>(maxErrorColors, ParallelMath::Extract(totalColors, block));742743for (int block = 0; block < ParallelMath::ParallelSize; block++)744{745int lastColor = ParallelMath::Extract(totalColors, block);746uint16_t stripeColor = ParallelMath::Extract(he.uniqueQuantizedColors[0], block);747for (int i = lastColor; i < maxErrorColors; i++)748ParallelMath::PutUInt15(he.uniqueQuantizedColors[i], block, stripeColor);749}750751for (int ci = 0; ci < maxErrorColors; ci++)752{753MUInt15 fifteen = ParallelMath::MakeUInt15(15);754MUInt15 twoFiftyFive = ParallelMath::MakeUInt15(255);755MSInt16 zeroS16 = ParallelMath::MakeSInt16(0);756757MUInt15 colors[2][3];758for (int ch = 0; ch < 3; ch++)759{760MUInt15 quantizedChannel = ParallelMath::RightShift(he.uniqueQuantizedColors[ci], ((2 - ch) * 5)) & fifteen;761762MUInt15 unquantizedColor = (quantizedChannel << 4) | quantizedChannel;763colors[0][ch] = ParallelMath::Min(twoFiftyFive, unquantizedColor + modifier);764colors[1][ch] = ParallelMath::ToUInt15(ParallelMath::Max(zeroS16, ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::MakeSInt16(modifier)));765}766767MUInt16 signBits = ParallelMath::MakeUInt16(0);768for (int px = 0; px < 16; px++)769{770MFloat errors[2];771for (int i = 0; i < 2; i++)772{773if (isFakeBT709)774errors[i] = ComputeErrorFakeBT709(colors[i], preWeightedPixels[px]);775else if (isUniform)776errors[i] = ComputeErrorUniform(colors[i], pixels[px]);777else778errors[i] = ComputeErrorWeighted(colors[i], preWeightedPixels[px], options);779}780781ParallelMath::Int16CompFlag errorOneLess = ParallelMath::FloatFlagToInt16(ParallelMath::Less(errors[1], errors[0]));782he.errors[ci][px] = 
ParallelMath::Min(errors[0], errors[1]);783signBits = signBits | ParallelMath::SelectOrZero(errorOneLess, ParallelMath::MakeUInt16(1 << px));784}785he.signBits[ci] = signBits;786}787788int maxUniqueColorCombos = 0;789for (int block = 0; block < ParallelMath::ParallelSize; block++)790{791int numUniqueColorCombos = ParallelMath::Extract(he.numUniqueColors[0], block) * ParallelMath::Extract(he.numUniqueColors[1], block);792if (numUniqueColorCombos > maxUniqueColorCombos)793maxUniqueColorCombos = numUniqueColorCombos;794}795796MUInt15 indexes[2] = { zero15, zero15 };797MUInt15 maxIndex[2] = { he.numUniqueColors[0] - ParallelMath::MakeUInt15(1), he.numUniqueColors[1] - ParallelMath::MakeUInt15(1) };798799int block1Starts[ParallelMath::ParallelSize];800for (int block = 0; block < ParallelMath::ParallelSize; block++)801block1Starts[block] = ParallelMath::Extract(he.numUniqueColors[0], block);802803for (int combo = 0; combo < maxUniqueColorCombos; combo++)804{805MUInt15 index0 = indexes[0] + ParallelMath::MakeUInt15(1);806ParallelMath::Int16CompFlag index0Overflow = ParallelMath::Less(maxIndex[0], index0);807ParallelMath::ConditionalSet(index0, index0Overflow, ParallelMath::MakeUInt15(0));808809MUInt15 index1 = ParallelMath::Min(maxIndex[1], indexes[1] + ParallelMath::SelectOrZero(index0Overflow, ParallelMath::MakeUInt15(1)));810indexes[0] = index0;811indexes[1] = index1;812813int ci0[ParallelMath::ParallelSize];814int ci1[ParallelMath::ParallelSize];815MUInt15 color0;816MUInt15 color1;817818for (int block = 0; block < ParallelMath::ParallelSize; block++)819{820ci0[block] = ParallelMath::Extract(index0, block);821ci1[block] = ParallelMath::Extract(index1, block) + block1Starts[block];822ParallelMath::PutUInt15(color0, block, ParallelMath::Extract(he.uniqueQuantizedColors[ci0[block]], block));823ParallelMath::PutUInt15(color1, block, ParallelMath::Extract(he.uniqueQuantizedColors[ci1[block]], block));824}825826MFloat totalError = ParallelMath::MakeFloatZero();827MUInt16 
sectorBits = ParallelMath::MakeUInt16(0);828MUInt16 signBits = ParallelMath::MakeUInt16(0);829for (int px = 0; px < 16; px++)830{831MFloat errorCI0;832MFloat errorCI1;833MUInt16 signBits0;834MUInt16 signBits1;835836for (int block = 0; block < ParallelMath::ParallelSize; block++)837{838ParallelMath::PutFloat(errorCI0, block, ParallelMath::Extract(he.errors[ci0[block]][px], block));839ParallelMath::PutFloat(errorCI1, block, ParallelMath::Extract(he.errors[ci1[block]][px], block));840ParallelMath::PutUInt16(signBits0, block, ParallelMath::Extract(he.signBits[ci0[block]], block));841ParallelMath::PutUInt16(signBits1, block, ParallelMath::Extract(he.signBits[ci1[block]], block));842}843844totalError = totalError + ParallelMath::Min(errorCI0, errorCI1);845846MUInt16 bitPosition = ParallelMath::MakeUInt16(1 << px);847848ParallelMath::Int16CompFlag error1Better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(errorCI1, errorCI0));849850sectorBits = sectorBits | ParallelMath::SelectOrZero(error1Better, bitPosition);851signBits = signBits | (bitPosition & ParallelMath::Select(error1Better, signBits1, signBits0));852}853854ParallelMath::FloatCompFlag totalErrorBetter = ParallelMath::Less(totalError, bestError);855ParallelMath::Int16CompFlag totalErrorBetter16 = ParallelMath::FloatFlagToInt16(totalErrorBetter);856if (ParallelMath::AnySet(totalErrorBetter16))857{858bestIsThisMode = bestIsThisMode | totalErrorBetter16;859ParallelMath::ConditionalSet(bestTable, totalErrorBetter16, ParallelMath::MakeUInt15(table));860ParallelMath::ConditionalSet(bestColors[0], totalErrorBetter16, color0);861ParallelMath::ConditionalSet(bestColors[1], totalErrorBetter16, color1);862ParallelMath::ConditionalSet(bestSectorBits, totalErrorBetter16, sectorBits);863ParallelMath::ConditionalSet(bestSignBits, totalErrorBetter16, signBits);864bestError = ParallelMath::Min(totalError, bestError);865}866}867}868869if (ParallelMath::AnySet(bestIsThisMode))870{871for (int block = 0; block < 
ParallelMath::ParallelSize; block++)872{873if (!ParallelMath::Extract(bestIsThisMode, block))874continue;875876ParallelMath::ScalarUInt16 blockBestColors[2] = { ParallelMath::Extract(bestColors[0], block), ParallelMath::Extract(bestColors[1], block) };877ParallelMath::ScalarUInt16 blockBestSectorBits = ParallelMath::Extract(bestSectorBits, block);878ParallelMath::ScalarUInt16 blockBestSignBits = ParallelMath::Extract(bestSignBits, block);879ParallelMath::ScalarUInt16 blockBestTable = ParallelMath::Extract(bestTable, block);880881EmitHModeBlock(outputBuffer + block * 8, blockBestColors, blockBestSectorBits, blockBestSignBits, blockBestTable, true);882}883}884}885886void cvtt::Internal::ETCComputer::EncodeVirtualTModePunchthrough(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag isIsolatedBase[16], const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const ParallelMath::Int16CompFlag isTransparent[16], const ParallelMath::Int16CompFlag& anyTransparent, const ParallelMath::Int16CompFlag& allTransparent, const Options &options)887{888// We treat T and H mode as the same mode ("Virtual T mode") with punchthrough, because of how the colors work:889//890// T mode: C1, C2+M, Transparent, C2-M891// H mode: C1+M, C1-M, Transparent, C2-M892//893// So in either case, we have 2 colors +/- a modifier, and a third unique color, which is basically T mode except without the middle color.894// The only thing that matters is whether it's better to store the isolated color as T mode color 1, or store it offset in H mode color 2.895//896// Sometimes it won't even be possible to store it in H mode color 2 because the table low bit derives from a numeric comparison of the colors,897// but unlike opaque blocks, we can't flip them.898bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);899bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);900901ParallelMath::FloatCompFlag isTransparentF[16];902for (int px = 0; px < 
16; px++)903isTransparentF[px] = ParallelMath::Int16FlagToFloat(isTransparent[px]);904905ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);906ParallelMath::Int16CompFlag bestIsHMode = ParallelMath::MakeBoolInt16(false);907908MUInt15 isolatedTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };909MUInt15 lineTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };910911MUInt15 numPixelsIsolated = ParallelMath::MakeUInt15(0);912MUInt15 numPixelsLine = ParallelMath::MakeUInt15(0);913914ParallelMath::Int16CompFlag isIsolated[16];915ParallelMath::Int16CompFlag isLine[16];916917for (unsigned int px = 0; px < 16; px++)918{919ParallelMath::Int16CompFlag isOpaque = ParallelMath::Not(isTransparent[px]);920isIsolated[px] = isIsolatedBase[px] & isOpaque;921isLine[px] = ParallelMath::Not(isIsolatedBase[px]) & isOpaque;922}923924for (unsigned int px = 0; px < 16; px++)925{926for (int ch = 0; ch < 3; ch++)927{928isolatedTotal[ch] = isolatedTotal[ch] + ParallelMath::SelectOrZero(isIsolated[px], pixels[px][ch]);929lineTotal[ch] = lineTotal[ch] + ParallelMath::SelectOrZero(isLine[px], pixels[px][ch]);930}931numPixelsIsolated = numPixelsIsolated + ParallelMath::SelectOrZero(isIsolated[px], ParallelMath::MakeUInt15(1));932numPixelsLine = numPixelsLine + ParallelMath::SelectOrZero(isLine[px], ParallelMath::MakeUInt15(1));933}934935MUInt15 isolatedAverageQuantized[3];936MUInt15 hModeIsolatedQuantized[8][3];937MUInt15 isolatedAverageTargets[3];938{939int divisors[ParallelMath::ParallelSize];940for (int block = 0; block < ParallelMath::ParallelSize; block++)941divisors[block] = ParallelMath::Extract(numPixelsIsolated, block) * 34;942943MUInt15 addend = (numPixelsIsolated << 4) | numPixelsIsolated;944for (int ch = 0; ch < 3; ch++)945{946// isolatedAverageQuantized[ch] = (isolatedTotal[ch] * 2 + numPixelsIsolated * 17) / (numPixelsIsolated * 34);947948MUInt15 numerator 
= isolatedTotal[ch] + isolatedTotal[ch];949if (!isFakeBT709)950numerator = numerator + addend;951952MUInt15 hModeIsolatedNumerators[8];953for (int table = 0; table < 8; table++)954{955// FIXME: Handle fake BT.709 correctly956MUInt15 offsetTotal = isolatedTotal[ch] + ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]), numPixelsIsolated));957958hModeIsolatedNumerators[table] = (offsetTotal + offsetTotal) + addend;959}960961for (int block = 0; block < ParallelMath::ParallelSize; block++)962{963int divisor = divisors[block];964if (divisor == 0)965{966ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, 0);967for (int table = 0; table < 8; table++)968ParallelMath::PutUInt15(hModeIsolatedQuantized[table][ch], block, 0);969}970else971{972ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, ParallelMath::Extract(numerator, block) / divisor);973for (int table = 0; table < 8; table++)974ParallelMath::PutUInt15(hModeIsolatedQuantized[table][ch], block, ParallelMath::Extract(hModeIsolatedNumerators[table], block) / divisor);975}976}977978isolatedAverageTargets[ch] = numerator;979}980}981982if (isFakeBT709)983ResolveTHFakeBT709Rounding(isolatedAverageQuantized, isolatedAverageTargets, numPixelsIsolated);984985for (int table = 0; table < 8; table++)986for (int ch = 0; ch < 3; ch++)987hModeIsolatedQuantized[table][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), hModeIsolatedQuantized[table][ch]);988989MUInt15 isolatedColor[3];990for (int ch = 0; ch < 3; ch++)991isolatedColor[ch] = (isolatedAverageQuantized[ch]) | (isolatedAverageQuantized[ch] << 4);992993MFloat isolatedError[16];994for (int px = 0; px < 16; px++)995{996if (isFakeBT709)997isolatedError[px] = ComputeErrorFakeBT709(isolatedColor, preWeightedPixels[px]);998else if (isUniform)999isolatedError[px] = ComputeErrorUniform(pixels[px], isolatedColor);1000else1001isolatedError[px] = 
ComputeErrorWeighted(isolatedColor, preWeightedPixels[px], options);10021003ParallelMath::ConditionalSet(isolatedError[px], isTransparentF[px], ParallelMath::MakeFloatZero());1004}10051006MSInt32 bestSelectors = ParallelMath::MakeSInt32(0);1007MUInt15 bestTable = ParallelMath::MakeUInt15(0);1008MUInt15 bestLineColor = ParallelMath::MakeUInt15(0);1009MUInt15 bestIsolatedColor = ParallelMath::MakeUInt15(0);1010MUInt15 bestHModeColor2 = ParallelMath::MakeUInt15(0);1011ParallelMath::Int16CompFlag bestUseHMode = ParallelMath::MakeBoolInt16(false);10121013MSInt16 maxLine = ParallelMath::LosslessCast<MSInt16>::Cast(numPixelsLine);1014MSInt16 minLine = ParallelMath::MakeSInt16(0) - maxLine;10151016int16_t clusterMaxLine = 0;1017for (int block = 0; block < ParallelMath::ParallelSize; block++)1018{1019int16_t blockMaxLine = ParallelMath::Extract(maxLine, block);1020if (blockMaxLine > clusterMaxLine)1021clusterMaxLine = blockMaxLine;1022}10231024int16_t clusterMinLine = -clusterMaxLine;10251026int lineDivisors[ParallelMath::ParallelSize];1027for (int block = 0; block < ParallelMath::ParallelSize; block++)1028lineDivisors[block] = ParallelMath::Extract(numPixelsLine, block) * 34;10291030MUInt15 lineAddend = (numPixelsLine << 4) | numPixelsLine;10311032for (int table = 0; table < 8; table++)1033{1034int numUniqueColors[ParallelMath::ParallelSize];1035MUInt15 uniqueQuantizedColors[31];10361037for (int block = 0; block < ParallelMath::ParallelSize; block++)1038numUniqueColors[block] = 0;10391040MUInt15 modifier = ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]);1041MUInt15 modifierOffset = (modifier + modifier);10421043for (int16_t offsetPremultiplier = clusterMinLine; offsetPremultiplier <= clusterMaxLine; offsetPremultiplier += 2)1044{1045MSInt16 clampedOffsetPremultiplier = ParallelMath::Max(minLine, ParallelMath::Min(maxLine, ParallelMath::MakeSInt16(offsetPremultiplier)));1046MSInt16 modifierAddend = 
ParallelMath::CompactMultiply(clampedOffsetPremultiplier, modifierOffset);10471048MUInt15 quantized[3];1049if (isFakeBT709)1050{1051MUInt15 targets[3];1052for (int ch = 0; ch < 3; ch++)1053{1054//quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));1055MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch]) + modifierAddend));1056MUInt15 divided = ParallelMath::MakeUInt15(0);1057for (int block = 0; block < ParallelMath::ParallelSize; block++)1058{1059int divisor = lineDivisors[block];1060if (divisor == 0)1061ParallelMath::PutUInt15(divided, block, 0);1062else1063ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);1064}1065quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);1066targets[ch] = numerator;1067}10681069ResolveTHFakeBT709Rounding(quantized, targets, numPixelsLine);1070}1071else1072{1073for (int ch = 0; ch < 3; ch++)1074{1075//quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + numDAIILine * 17 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));1076MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch] + lineAddend) + modifierAddend));1077MUInt15 divided = ParallelMath::MakeUInt15(0);1078for (int block = 0; block < ParallelMath::ParallelSize; block++)1079{1080int divisor = lineDivisors[block];1081if (divisor == 0)1082ParallelMath::PutUInt15(divided, block, 0);1083else1084ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);1085}1086quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);1087}1088}10891090MUInt15 packedColor = (quantized[0] << 10) | (quantized[1] << 5) | 
quantized[2];10911092for (int block = 0; block < ParallelMath::ParallelSize; block++)1093{1094uint16_t blockPackedColor = ParallelMath::Extract(packedColor, block);1095if (numUniqueColors[block] == 0 || blockPackedColor != ParallelMath::Extract(uniqueQuantizedColors[numUniqueColors[block] - 1], block))1096ParallelMath::PutUInt15(uniqueQuantizedColors[numUniqueColors[block]++], block, blockPackedColor);1097}1098}10991100// Stripe unfilled unique colors1101int maxUniqueColors = 0;1102for (int block = 0; block < ParallelMath::ParallelSize; block++)1103{1104if (numUniqueColors[block] > maxUniqueColors)1105maxUniqueColors = numUniqueColors[block];1106}11071108for (int block = 0; block < ParallelMath::ParallelSize; block++)1109{1110uint16_t fillColor = ParallelMath::Extract(uniqueQuantizedColors[0], block);11111112int numUnique = numUniqueColors[block];1113for (int fill = numUnique + 1; fill < maxUniqueColors; fill++)1114ParallelMath::PutUInt15(uniqueQuantizedColors[fill], block, fillColor);1115}11161117MFloat hModeErrors[16];1118MUInt15 hModeUnquantizedColor[3];1119for (int ch = 0; ch < 3; ch++)1120{1121MUInt15 quantizedChannel = hModeIsolatedQuantized[table][ch];11221123MUInt15 unquantizedCh = (quantizedChannel << 4) | quantizedChannel;1124hModeUnquantizedColor[ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedCh) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));1125}11261127for (int px = 0; px < 16; px++)1128{1129hModeErrors[px] = isUniform ? 
ComputeErrorUniform(hModeUnquantizedColor, pixels[px]) : ComputeErrorWeighted(hModeUnquantizedColor, preWeightedPixels[px], options);1130ParallelMath::ConditionalSet(hModeErrors[px], isTransparentF[px], ParallelMath::MakeFloatZero());1131}11321133MUInt15 packedHModeColor2 = (hModeIsolatedQuantized[table][0] << 10) | (hModeIsolatedQuantized[table][1] << 5) | hModeIsolatedQuantized[table][2];1134ParallelMath::Int16CompFlag tableLowBitIsZero = ((table & 1) == 0) ? ParallelMath::MakeBoolInt16(true) : ParallelMath::MakeBoolInt16(false);11351136for (int ci = 0; ci < maxUniqueColors; ci++)1137{1138MUInt15 lineColors[2][3];1139for (int ch = 0; ch < 3; ch++)1140{1141MUInt15 quantizedChannel = (ParallelMath::RightShift(uniqueQuantizedColors[ci], 10 - (ch * 5)) & ParallelMath::MakeUInt15(15));11421143MUInt15 unquantizedColor = (quantizedChannel << 4) | quantizedChannel;1144lineColors[0][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantizedColor + modifier);1145lineColors[1][ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));1146}11471148MUInt15 bestLineSelector[16];1149MFloat bestLineError[16];1150for (int px = 0; px < 16; px++)1151{1152MFloat lineErrors[2];1153for (int i = 0; i < 2; i++)1154lineErrors[i] = isUniform ? 
ComputeErrorUniform(lineColors[i], pixels[px]) : ComputeErrorWeighted(lineColors[i], preWeightedPixels[px], options);11551156ParallelMath::Int16CompFlag firstIsBetter = ParallelMath::FloatFlagToInt16(ParallelMath::LessOrEqual(lineErrors[0], lineErrors[1]));1157bestLineSelector[px] = ParallelMath::Select(firstIsBetter, ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(3));1158bestLineError[px] = ParallelMath::Min(lineErrors[0], lineErrors[1]);11591160ParallelMath::ConditionalSet(bestLineError[px], isTransparentF[px], ParallelMath::MakeFloatZero());1161}11621163// One case considered here was if it was possible to force H mode to be valid when the line color is unused.1164// That case isn't actually useful because it's equivalent to the isolated color being unused at maximum offset,1165// which is always checked after a swap.1166MFloat tModeError = ParallelMath::MakeFloatZero();1167MFloat hModeError = ParallelMath::MakeFloatZero();1168for (int px = 0; px < 16; px++)1169{1170tModeError = tModeError + ParallelMath::Min(bestLineError[px], isolatedError[px]);1171hModeError = hModeError + ParallelMath::Min(bestLineError[px], hModeErrors[px]);1172}11731174ParallelMath::FloatCompFlag hLessError = ParallelMath::Less(hModeError, tModeError);11751176MUInt15 packedHModeColor1 = uniqueQuantizedColors[ci];11771178ParallelMath::Int16CompFlag hModeTableLowBitMustBeZero = ParallelMath::Less(packedHModeColor1, packedHModeColor2);11791180ParallelMath::Int16CompFlag hModeIsLegal = ParallelMath::Equal(hModeTableLowBitMustBeZero, tableLowBitIsZero);1181ParallelMath::Int16CompFlag useHMode = ParallelMath::FloatFlagToInt16(hLessError) & hModeIsLegal;11821183MFloat roundBestError = tModeError;1184ParallelMath::ConditionalSet(roundBestError, ParallelMath::Int16FlagToFloat(useHMode), hModeError);11851186ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(roundBestError, bestError));1187ParallelMath::FloatCompFlag useHModeF = 
ParallelMath::Int16FlagToFloat(useHMode);11881189if (ParallelMath::AnySet(errorBetter))1190{1191MSInt32 selectors = ParallelMath::MakeSInt32(0);1192for (int px = 0; px < 16; px++)1193{1194MUInt15 selector = bestLineSelector[px];11951196MFloat isolatedPixelError = ParallelMath::Select(useHModeF, hModeErrors[px], isolatedError[px]);1197ParallelMath::Int16CompFlag isolatedBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(isolatedPixelError, bestLineError[px]));11981199ParallelMath::ConditionalSet(selector, isolatedBetter, ParallelMath::MakeUInt15(0));1200ParallelMath::ConditionalSet(selector, isTransparent[px], ParallelMath::MakeUInt15(2));1201selectors = selectors | (ParallelMath::ToInt32(selector) << (px * 2));1202}12031204bestError = ParallelMath::Min(bestError, roundBestError);1205ParallelMath::ConditionalSet(bestLineColor, errorBetter, uniqueQuantizedColors[ci]);1206ParallelMath::ConditionalSet(bestSelectors, errorBetter, selectors);1207ParallelMath::ConditionalSet(bestTable, errorBetter, ParallelMath::MakeUInt15(table));1208ParallelMath::ConditionalSet(bestIsHMode, errorBetter, useHMode);1209ParallelMath::ConditionalSet(bestHModeColor2, errorBetter, packedHModeColor2);12101211bestIsThisMode = bestIsThisMode | errorBetter;1212}1213}1214}12151216for (int block = 0; block < ParallelMath::ParallelSize; block++)1217{1218if (ParallelMath::Extract(bestIsThisMode, block))1219{1220uint32_t lowBits = 0;1221uint32_t highBits = 0;12221223uint16_t blockBestLineColor = ParallelMath::Extract(bestLineColor, block);1224ParallelMath::ScalarUInt16 blockIsolatedAverageQuantized[3];12251226for (int ch = 0; ch < 3; ch++)1227blockIsolatedAverageQuantized[ch] = ParallelMath::Extract(isolatedAverageQuantized[ch], block);12281229uint16_t blockBestTable = ParallelMath::Extract(bestTable, block);1230int32_t blockBestSelectors = ParallelMath::Extract(bestSelectors, block);12311232ParallelMath::ScalarUInt16 lineColor[3];1233for (int ch = 0; ch < 3; ch++)1234lineColor[ch] = 
(blockBestLineColor >> (10 - (ch * 5))) & 15;12351236if (ParallelMath::Extract(bestIsHMode, block))1237{1238// T mode: C1, C2+M, Transparent, C2-M1239// H mode: C1+M, C1-M, Transparent, C2-M1240static const ParallelMath::ScalarUInt16 selectorRemapSector[4] = { 1, 0, 1, 0 };1241static const ParallelMath::ScalarUInt16 selectorRemapSign[4] = { 1, 0, 0, 1 };12421243// Remap selectors1244ParallelMath::ScalarUInt16 signBits = 0;1245ParallelMath::ScalarUInt16 sectorBits = 0;1246int32_t blockBestSelectors = ParallelMath::Extract(bestSelectors, block);1247for (int px = 0; px < 16; px++)1248{1249int32_t selector = (blockBestSelectors >> (px * 2)) & 3;1250sectorBits |= (selectorRemapSector[selector] << px);1251signBits |= (selectorRemapSign[selector] << px);1252}12531254ParallelMath::ScalarUInt16 blockColors[2] = { blockBestLineColor, ParallelMath::Extract(bestHModeColor2, block) };12551256EmitHModeBlock(outputBuffer + block * 8, blockColors, sectorBits, signBits, blockBestTable, false);1257}1258else1259EmitTModeBlock(outputBuffer + block * 8, lineColor, blockIsolatedAverageQuantized, blockBestSelectors, blockBestTable, false);1260}1261}1262}126312641265cvtt::ParallelMath::UInt15 cvtt::Internal::ETCComputer::DecodePlanarCoeff(const MUInt15 &coeff, int ch)1266{1267if (ch == 1)1268return (coeff << 1) | (ParallelMath::RightShift(coeff, 6));1269else1270return (coeff << 2) | (ParallelMath::RightShift(coeff, 4));1271}12721273void cvtt::Internal::ETCComputer::EncodePlanar(uint8_t *outputBuffer, MFloat &bestError, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const Options &options)1274{1275// NOTE: If it's desired to do this in another color space, the best way to do it would probably be1276// to do everything in that color space and then transform it back to RGB.12771278// We compute H = (H-O)/4 and V= (V-O)/4 to simplify the math12791280// error = (x*H + y*V + O - C)^21281MFloat h[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), 
ParallelMath::MakeFloatZero() };1282MFloat v[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };1283MFloat o[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };12841285bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);1286bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);12871288MFloat totalError = ParallelMath::MakeFloatZero();1289MUInt15 bestCoeffs[3][3]; // [Channel][Coeff]1290for (int ch = 0; ch < 3; ch++)1291{1292float fhh = 0.f;1293float fho = 0.f;1294float fhv = 0.f;1295float foo = 0.f;1296float fov = 0.f;1297float fvv = 0.f;1298MFloat fc = ParallelMath::MakeFloatZero();1299MFloat fh = ParallelMath::MakeFloatZero();1300MFloat fv = ParallelMath::MakeFloatZero();1301MFloat fo = ParallelMath::MakeFloatZero();13021303float &foh = fho;1304float &fvh = fhv;1305float &fvo = fov;13061307for (int px = 0; px < 16; px++)1308{1309float x = static_cast<float>(px % 4);1310float y = static_cast<float>(px / 4);1311MFloat c = isFakeBT709 ? 
preWeightedPixels[px][ch] : ParallelMath::ToFloat(pixels[px][ch]);13121313// (x*H + y*V + O - C)^21314fhh += x * x;1315fhv += x * y;1316fho += x;1317fh = fh - c * x;13181319fvh += y * x;1320fvv += y * y;1321fvo += y;1322fv = fv - c * y;13231324foh += x;1325fov += y;1326foo += 1;1327fo = fo - c;13281329fh = fh - c * x;1330fv = fv - c * y;1331fo = fo - c;1332fc = fc + c * c;1333}13341335//float totalError = fhh * h * h + fho * h*o + fhv * h*v + foo * o * o + fov * o*v + fvv * v * v + fh * h + fv * v + fo * o + fc;13361337// error = fhh*h^2 + fho*h*o + fhv*h*v + foo*o^2 + fov*o*v + fvv*v^2 + fh*h + fv*v + fo*o + fc1338// derror/dh = 2*fhh*h + fho*o + fhv*v + fh1339// derror/dv = fhv*h + fov*o + 2*fvv*v + fv1340// derror/do = fho*h + 2*foo*o + fov*v + fo13411342// Solve system of equations1343// h o v 1 = 01344// -------1345// d e f g R01346// i j k l R11347// m n p q R213481349float d = 2.0f * fhh;1350float e = fho;1351float f = fhv;1352MFloat gD = fh;13531354float i = fhv;1355float j = fov;1356float k = 2.0f * fvv;1357MFloat lD = fv;13581359float m = fho;1360float n = 2.0f * foo;1361float p = fov;1362MFloat qD = fo;13631364{1365// Factor out first column from R1 and R21366float r0to1 = -i / d;1367float r0to2 = -m / d;13681369// 0 j1 k1 l1D1370float j1 = j + r0to1 * e;1371float k1 = k + r0to1 * f;1372MFloat l1D = lD + gD * r0to1;13731374// 0 n1 p1 q1D1375float n1 = n + r0to2 * e;1376float p1 = p + r0to2 * f;1377MFloat q1D = qD + gD * r0to2;13781379// Factor out third column from R21380float r1to2 = -p1 / k1;13811382// 0 n2 0 q2D1383float n2 = n1 + r1to2 * j1;1384MFloat q2D = q1D + l1D * r1to2;13851386o[ch] = -q2D / n2;13871388// Factor out second column from R11389// 0 n2 0 q2D13901391float r2to1 = -j1 / n2;13921393// 0 0 k1 l2D1394// 0 n2 0 q2D1395MFloat l2D = l1D + q2D * r2to1;13961397float elim2 = -f / k1;1398float elim1 = -e / n2;13991400// d 0 0 g2D1401MFloat g2D = gD + l2D * elim2 + q2D * elim1;14021403// n2*o + q2 = 01404// o = -q2 / n21405h[ch] = -g2D / 
d;1406v[ch] = -l2D / k1;1407}14081409// Undo the local transformation1410h[ch] = h[ch] * 4.0f + o[ch];1411v[ch] = v[ch] * 4.0f + o[ch];1412}14131414if (isFakeBT709)1415{1416MFloat oRGB[3];1417MFloat hRGB[3];1418MFloat vRGB[3];14191420ConvertFromFakeBT709(oRGB, o);1421ConvertFromFakeBT709(hRGB, h);1422ConvertFromFakeBT709(vRGB, v);14231424// Twiddling in fake BT.607 is a mess, just round off for now (the precision is pretty good anyway)1425{1426ParallelMath::RoundTowardNearestForScope rtn;14271428for (int ch = 0; ch < 3; ch++)1429{1430MFloat fcoeffs[3] = { oRGB[ch], hRGB[ch], vRGB[ch] };14311432for (int c = 0; c < 3; c++)1433{1434MFloat coeff = ParallelMath::Max(ParallelMath::MakeFloatZero(), fcoeffs[c]);1435if (ch == 1)1436coeff = ParallelMath::Min(ParallelMath::MakeFloat(127.0f), coeff * (127.0f / 255.0f));1437else1438coeff = ParallelMath::Min(ParallelMath::MakeFloat(63.0f), coeff * (63.0f / 255.0f));1439fcoeffs[c] = coeff;1440}14411442for (int c = 0; c < 3; c++)1443bestCoeffs[ch][c] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &rtn);1444}1445}14461447MUInt15 reconstructed[16][3];1448for (int ch = 0; ch < 3; ch++)1449{1450MUInt15 dO = DecodePlanarCoeff(bestCoeffs[ch][0], ch);1451MUInt15 dH = DecodePlanarCoeff(bestCoeffs[ch][1], ch);1452MUInt15 dV = DecodePlanarCoeff(bestCoeffs[ch][2], ch);14531454MSInt16 hMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dH) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);1455MSInt16 vMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dV) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);14561457MFloat error = ParallelMath::MakeFloatZero();14581459MSInt16 addend = ParallelMath::LosslessCast<MSInt16>::Cast(dO << 2) + 2;14601461for (int px = 0; px < 16; px++)1462{1463MUInt15 pxv = ParallelMath::MakeUInt15(px);1464MSInt16 x = ParallelMath::LosslessCast<MSInt16>::Cast(pxv & ParallelMath::MakeUInt15(3));1465MSInt16 y = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RightShift(pxv, 2));14661467MSInt16 interpolated = 
ParallelMath::RightShift(ParallelMath::CompactMultiply(x, hMinusO) + ParallelMath::CompactMultiply(y, vMinusO) + addend, 2);1468MUInt15 clampedLow = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), interpolated));1469reconstructed[px][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(255), clampedLow);1470}1471}14721473totalError = ParallelMath::MakeFloatZero();1474for (int px = 0; px < 16; px++)1475totalError = totalError + ComputeErrorFakeBT709(reconstructed[px], preWeightedPixels[px]);1476}1477else1478{1479for (int ch = 0; ch < 3; ch++)1480{1481MFloat fcoeffs[3] = { o[ch], h[ch], v[ch] };1482MUInt15 coeffRanges[3][2];14831484for (int c = 0; c < 3; c++)1485{1486MFloat coeff = ParallelMath::Max(ParallelMath::MakeFloatZero(), fcoeffs[c]);1487if (ch == 1)1488coeff = ParallelMath::Min(ParallelMath::MakeFloat(127.0f), coeff * (127.0f / 255.0f));1489else1490coeff = ParallelMath::Min(ParallelMath::MakeFloat(63.0f), coeff * (63.0f / 255.0f));1491fcoeffs[c] = coeff;1492}14931494{1495ParallelMath::RoundDownForScope rd;1496for (int c = 0; c < 3; c++)1497coeffRanges[c][0] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &rd);1498}14991500{1501ParallelMath::RoundUpForScope ru;1502for (int c = 0; c < 3; c++)1503coeffRanges[c][1] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &ru);1504}15051506MFloat bestChannelError = ParallelMath::MakeFloat(FLT_MAX);1507for (int io = 0; io < 2; io++)1508{1509MUInt15 dO = DecodePlanarCoeff(coeffRanges[0][io], ch);15101511for (int ih = 0; ih < 2; ih++)1512{1513MUInt15 dH = DecodePlanarCoeff(coeffRanges[1][ih], ch);1514MSInt16 hMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dH) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);15151516for (int iv = 0; iv < 2; iv++)1517{1518MUInt15 dV = DecodePlanarCoeff(coeffRanges[2][iv], ch);1519MSInt16 vMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dV) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);15201521MFloat error = ParallelMath::MakeFloatZero();15221523MSInt16 addend = 
ParallelMath::LosslessCast<MSInt16>::Cast(dO << 2) + 2;15241525for (int px = 0; px < 16; px++)1526{1527MUInt15 pxv = ParallelMath::MakeUInt15(px);1528MSInt16 x = ParallelMath::LosslessCast<MSInt16>::Cast(pxv & ParallelMath::MakeUInt15(3));1529MSInt16 y = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RightShift(pxv, 2));15301531MSInt16 interpolated = ParallelMath::RightShift(ParallelMath::CompactMultiply(x, hMinusO) + ParallelMath::CompactMultiply(y, vMinusO) + addend, 2);1532MUInt15 clampedLow = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), interpolated));1533MUInt15 dec = ParallelMath::Min(ParallelMath::MakeUInt15(255), clampedLow);15341535MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(dec);15361537MFloat deltaF = ParallelMath::ToFloat(delta);1538error = error + deltaF * deltaF;1539}15401541ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestChannelError));1542if (ParallelMath::AnySet(errorBetter))1543{1544bestChannelError = ParallelMath::Min(error, bestChannelError);1545ParallelMath::ConditionalSet(bestCoeffs[ch][0], errorBetter, coeffRanges[0][io]);1546ParallelMath::ConditionalSet(bestCoeffs[ch][1], errorBetter, coeffRanges[1][ih]);1547ParallelMath::ConditionalSet(bestCoeffs[ch][2], errorBetter, coeffRanges[2][iv]);1548}1549}1550}1551}15521553if (!isUniform)1554{1555switch (ch)1556{1557case 0:1558bestChannelError = bestChannelError * (options.redWeight * options.redWeight);1559break;1560case 1:1561bestChannelError = bestChannelError * (options.greenWeight * options.greenWeight);1562break;1563case 2:1564bestChannelError = bestChannelError * (options.blueWeight * options.blueWeight);1565break;1566default:1567break;1568}1569}15701571totalError = totalError + bestChannelError;1572}1573}15741575ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(totalError, 
bestError));1576if (ParallelMath::AnySet(errorBetter))1577{1578bestError = ParallelMath::Min(bestError, totalError);15791580for (int block = 0; block < ParallelMath::ParallelSize; block++)1581{1582if (!ParallelMath::Extract(errorBetter, block))1583continue;15841585int ro = ParallelMath::Extract(bestCoeffs[0][0], block);1586int rh = ParallelMath::Extract(bestCoeffs[0][1], block);1587int rv = ParallelMath::Extract(bestCoeffs[0][2], block);15881589int go = ParallelMath::Extract(bestCoeffs[1][0], block);1590int gh = ParallelMath::Extract(bestCoeffs[1][1], block);1591int gv = ParallelMath::Extract(bestCoeffs[1][2], block);15921593int bo = ParallelMath::Extract(bestCoeffs[2][0], block);1594int bh = ParallelMath::Extract(bestCoeffs[2][1], block);1595int bv = ParallelMath::Extract(bestCoeffs[2][2], block);15961597int go1 = go >> 6;1598int go2 = go & 63;15991600int bo1 = bo >> 5;1601int bo2 = (bo >> 3) & 3;1602int bo3 = bo & 7;16031604int rh1 = (rh >> 1);1605int rh2 = rh & 1;16061607int fakeR = ro >> 2;1608int fakeDR = go1 | ((ro & 3) << 1);16091610int fakeG = (go2 >> 2);1611int fakeDG = ((go2 & 3) << 1) | bo1;16121613int fakeB = bo2;1614int fakeDB = bo3 >> 1;16151616uint32_t highBits = 0;1617uint32_t lowBits = 0;16181619// Avoid overflowing R1620if ((fakeDR & 4) != 0 && fakeR + fakeDR < 8)1621highBits |= 1 << (63 - 32);16221623// Avoid overflowing G1624if ((fakeDG & 4) != 0 && fakeG + fakeDG < 8)1625highBits |= 1 << (55 - 32);16261627// Overflow B1628if (fakeB + fakeDB < 4)1629{1630// Overflow low1631highBits |= 1 << (42 - 32);1632}1633else1634{1635// Overflow high1636highBits |= 7 << (45 - 32);1637}16381639highBits |= ro << (57 - 32);1640highBits |= go1 << (56 - 32);1641highBits |= go2 << (49 - 32);1642highBits |= bo1 << (48 - 32);1643highBits |= bo2 << (43 - 32);1644highBits |= bo3 << (39 - 32);1645highBits |= rh1 << (34 - 32);1646highBits |= 1 << (33 - 32);1647highBits |= rh2 << (32 - 32);16481649lowBits |= gh << 25;1650lowBits |= bh << 19;1651lowBits |= rv << 
13;1652lowBits |= gv << 6;1653lowBits |= bv << 0;16541655for (int i = 0; i < 4; i++)1656outputBuffer[block * 8 + i] = (highBits >> (24 - i * 8)) & 0xff;1657for (int i = 0; i < 4; i++)1658outputBuffer[block * 8 + i + 4] = (lowBits >> (24 - i * 8)) & 0xff;1659}1660}1661}16621663void cvtt::Internal::ETCComputer::CompressETC2Block(uint8_t *outputBuffer, const PixelBlockU8 *pixelBlocks, ETC2CompressionData *compressionData, const Options &options, bool punchthroughAlpha)1664{1665ParallelMath::Int16CompFlag pixelIsTransparent[16];1666ParallelMath::Int16CompFlag anyTransparent = ParallelMath::MakeBoolInt16(false);1667ParallelMath::Int16CompFlag allTransparent = ParallelMath::MakeBoolInt16(true);16681669if (punchthroughAlpha)1670{1671const float fThreshold = std::max<float>(std::min<float>(1.0f, options.threshold), 0.0f) * 255.0f;16721673// +1.0f is intentional, we want to take the next valid integer (even if it's 256) since everything else lower is transparent1674MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(std::floor(fThreshold + 1.0f)));16751676for (int px = 0; px < 16; px++)1677{1678MUInt15 alpha;1679for (int block = 0; block < ParallelMath::ParallelSize; block++)1680ParallelMath::PutUInt15(alpha, block, pixelBlocks[block].m_pixels[px][3]);16811682ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(alpha, threshold);1683anyTransparent = (anyTransparent | isTransparent);1684allTransparent = (allTransparent & isTransparent);1685pixelIsTransparent[px] = isTransparent;1686}1687}1688else1689{1690for (int px = 0; px < 16; px++)1691pixelIsTransparent[px] = ParallelMath::MakeBoolInt16(false);16921693allTransparent = anyTransparent = ParallelMath::MakeBoolInt16(false);1694}16951696MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);16971698ETC2CompressionDataInternal* internalData = static_cast<ETC2CompressionDataInternal*>(compressionData);16991700MUInt15 pixels[16][3];1701MFloat preWeightedPixels[16][3];1702ExtractBlocks(pixels, 
preWeightedPixels, pixelBlocks, options);17031704if (ParallelMath::AnySet(anyTransparent))1705{1706for (int px = 0; px < 16; px++)1707{1708ParallelMath::Int16CompFlag flag = pixelIsTransparent[px];1709ParallelMath::FloatCompFlag fflag = ParallelMath::Int16FlagToFloat(flag);17101711for (int ch = 0; ch < 3; ch++)1712{1713ParallelMath::ConditionalSet(pixels[px][ch], flag, ParallelMath::MakeUInt15(0));1714ParallelMath::ConditionalSet(preWeightedPixels[px][ch], fflag, ParallelMath::MakeFloat(0.0f));1715}1716}1717}17181719if (!ParallelMath::AllSet(allTransparent))1720EncodePlanar(outputBuffer, bestError, pixels, preWeightedPixels, options);17211722MFloat chromaDelta[16][2];17231724MUInt15 numOpaque = ParallelMath::MakeUInt15(16);1725for (int px = 0; px < 16; px++)1726numOpaque = numOpaque - ParallelMath::SelectOrZero(pixelIsTransparent[px], ParallelMath::MakeUInt15(1));17271728if (options.flags & cvtt::Flags::Uniform)1729{1730MSInt16 chromaCoordinates3[16][2];1731for (int px = 0; px < 16; px++)1732{1733chromaCoordinates3[px][0] = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][0]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][2]);1734chromaCoordinates3[px][1] = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][0]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][1] << 1) + ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][2]);1735}17361737MSInt16 chromaCoordinateCentroid[2] = { ParallelMath::MakeSInt16(0), ParallelMath::MakeSInt16(0) };1738for (int px = 0; px < 16; px++)1739{1740for (int ch = 0; ch < 2; ch++)1741chromaCoordinateCentroid[ch] = chromaCoordinateCentroid[ch] + chromaCoordinates3[px][ch];1742}17431744if (punchthroughAlpha)1745{1746for (int px = 0; px < 16; px++)1747{1748for (int ch = 0; ch < 2; ch++)1749{1750MUInt15 chromaCoordinateMultiplied = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(chromaCoordinates3[px][ch], numOpaque));1751MSInt16 delta = 
ParallelMath::LosslessCast<MSInt16>::Cast(chromaCoordinateMultiplied) - chromaCoordinateCentroid[ch];1752chromaDelta[px][ch] = ParallelMath::ToFloat(delta);1753}1754}1755}1756else1757{1758for (int px = 0; px < 16; px++)1759{1760for (int ch = 0; ch < 2; ch++)1761chromaDelta[px][ch] = ParallelMath::ToFloat((chromaCoordinates3[px][ch] << 4) - chromaCoordinateCentroid[ch]);1762}1763}17641765const MFloat rcpSqrt3 = ParallelMath::MakeFloat(0.57735026918962576450914878050196f);17661767for (int px = 0; px < 16; px++)1768chromaDelta[px][1] = chromaDelta[px][1] * rcpSqrt3;1769}1770else1771{1772const float chromaAxis0[3] = { internalData->m_chromaSideAxis0[0], internalData->m_chromaSideAxis0[1], internalData->m_chromaSideAxis0[2] };1773const float chromaAxis1[3] = { internalData->m_chromaSideAxis1[0], internalData->m_chromaSideAxis1[1], internalData->m_chromaSideAxis1[2] };17741775MFloat chromaCoordinates3[16][2];1776for (int px = 0; px < 16; px++)1777{1778const MFloat &px0 = preWeightedPixels[px][0];1779const MFloat &px1 = preWeightedPixels[px][1];1780const MFloat &px2 = preWeightedPixels[px][2];17811782chromaCoordinates3[px][0] = px0 * chromaAxis0[0] + px1 * chromaAxis0[1] + px2 * chromaAxis0[2];1783chromaCoordinates3[px][1] = px0 * chromaAxis1[0] + px1 * chromaAxis1[1] + px2 * chromaAxis1[2];1784}17851786MFloat chromaCoordinateCentroid[2] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };1787for (int px = 0; px < 16; px++)1788{1789for (int ch = 0; ch < 2; ch++)1790chromaCoordinateCentroid[ch] = chromaCoordinateCentroid[ch] + chromaCoordinates3[px][ch];1791}17921793if (punchthroughAlpha)1794{1795const MFloat numOpaqueF = ParallelMath::ToFloat(numOpaque);1796for (int px = 0; px < 16; px++)1797{1798for (int ch = 0; ch < 2; ch++)1799{1800MFloat chromaCoordinateMultiplied = chromaCoordinates3[px][ch] * numOpaqueF;1801MFloat delta = chromaCoordinateMultiplied - chromaCoordinateCentroid[ch];1802chromaDelta[px][ch] = delta;1803}1804}1805}1806else1807{1808for (int 
px = 0; px < 16; px++)1809{1810for (int ch = 0; ch < 2; ch++)1811chromaDelta[px][ch] = chromaCoordinates3[px][ch] * 16.0f - chromaCoordinateCentroid[ch];1812}1813}1814}181518161817MFloat covXX = ParallelMath::MakeFloatZero();1818MFloat covYY = ParallelMath::MakeFloatZero();1819MFloat covXY = ParallelMath::MakeFloatZero();18201821for (int px = 0; px < 16; px++)1822{1823MFloat nx = chromaDelta[px][0];1824MFloat ny = chromaDelta[px][1];18251826covXX = covXX + nx * nx;1827covYY = covYY + ny * ny;1828covXY = covXY + nx * ny;1829}18301831MFloat halfTrace = (covXX + covYY) * 0.5f;1832MFloat det = covXX * covYY - covXY * covXY;18331834MFloat mm = ParallelMath::Sqrt(ParallelMath::Max(ParallelMath::MakeFloatZero(), halfTrace * halfTrace - det));18351836MFloat ev = halfTrace + mm;18371838MFloat dx = (covYY - ev + covXY);1839MFloat dy = -(covXX - ev + covXY);18401841// If evenly distributed, pick an arbitrary plane1842ParallelMath::FloatCompFlag allZero = ParallelMath::Equal(dx, ParallelMath::MakeFloatZero()) & ParallelMath::Equal(dy, ParallelMath::MakeFloatZero());1843ParallelMath::ConditionalSet(dx, allZero, ParallelMath::MakeFloat(1.f));18441845ParallelMath::Int16CompFlag sectorAssignments[16];1846for (int px = 0; px < 16; px++)1847sectorAssignments[px] = ParallelMath::FloatFlagToInt16(ParallelMath::Less(chromaDelta[px][0] * dx + chromaDelta[px][1] * dy, ParallelMath::MakeFloatZero()));18481849if (!ParallelMath::AllSet(allTransparent))1850{1851EncodeTMode(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, options);18521853// Flip sector assignments1854for (int px = 0; px < 16; px++)1855sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);18561857EncodeTMode(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, options);18581859EncodeHMode(outputBuffer, bestError, sectorAssignments, pixels, internalData->m_h, preWeightedPixels, options);18601861CompressETC1BlockInternal(bestError, outputBuffer, pixels, preWeightedPixels, 
internalData->m_drs, options, true);1862}18631864if (ParallelMath::AnySet(anyTransparent))1865{1866if (!ParallelMath::AllSet(allTransparent))1867{1868// Flip sector assignments1869for (int px = 0; px < 16; px++)1870sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);1871}18721873// Reset the error of any transparent blocks to max and retry with punchthrough modes1874ParallelMath::ConditionalSet(bestError, ParallelMath::Int16FlagToFloat(anyTransparent), ParallelMath::MakeFloat(FLT_MAX));18751876EncodeVirtualTModePunchthrough(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, pixelIsTransparent, anyTransparent, allTransparent, options);18771878// Flip sector assignments1879for (int px = 0; px < 16; px++)1880sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);18811882EncodeVirtualTModePunchthrough(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, pixelIsTransparent, anyTransparent, allTransparent, options);18831884CompressETC1PunchthroughBlockInternal(bestError, outputBuffer, pixels, preWeightedPixels, pixelIsTransparent, static_cast<ETC2CompressionDataInternal*>(compressionData)->m_drs, options);1885}1886}18871888void cvtt::Internal::ETCComputer::CompressETC2AlphaBlock(uint8_t *outputBuffer, const PixelBlockU8 *pixelBlocks, const Options &options)1889{1890MUInt15 pixels[16];18911892for (int px = 0; px < 16; px++)1893{1894for (int block = 0; block < ParallelMath::ParallelSize; block++)1895ParallelMath::PutUInt15(pixels[px], block, pixelBlocks[block].m_pixels[px][3]);1896}18971898CompressETC2AlphaBlockInternal(outputBuffer, pixels, false, false, options);1899}19001901void cvtt::Internal::ETCComputer::CompressETC2AlphaBlockInternal(uint8_t *outputBuffer, const MUInt15 pixels[16], bool is11Bit, bool isSigned, const Options &options)1902{1903MUInt15 minAlpha = ParallelMath::MakeUInt15(is11Bit ? 
2047 : 255);1904MUInt15 maxAlpha = ParallelMath::MakeUInt15(0);19051906for (int px = 0; px < 16; px++)1907{1908minAlpha = ParallelMath::Min(minAlpha, pixels[px]);1909maxAlpha = ParallelMath::Max(maxAlpha, pixels[px]);1910}19111912MUInt15 alphaSpan = maxAlpha - minAlpha;1913MUInt15 alphaSpanMidpointTimes2 = maxAlpha + minAlpha;19141915MUInt31 bestTotalError = ParallelMath::MakeUInt31(0x7fffffff);1916MUInt15 bestTableIndex = ParallelMath::MakeUInt15(0);1917MUInt15 bestBaseCodeword = ParallelMath::MakeUInt15(0);1918MUInt15 bestMultiplier = ParallelMath::MakeUInt15(0);1919MUInt15 bestIndexes[16];19201921for (int px = 0; px < 16; px++)1922bestIndexes[px] = ParallelMath::MakeUInt15(0);19231924const int numAlphaRanges = 10;1925for (uint16_t tableIndex = 0; tableIndex < 16; tableIndex++)1926{1927for (int r = 0; r < numAlphaRanges; r++)1928{1929int subrange = r % 3;1930int mainRange = r / 3;19311932int16_t maxOffset = Tables::ETC2::g_alphaModifierTablePositive[tableIndex][3 - mainRange - (subrange & 1)];1933int16_t minOffset = -Tables::ETC2::g_alphaModifierTablePositive[tableIndex][3 - mainRange - ((subrange >> 1) & 1)] - 1;1934uint16_t offsetSpan = static_cast<uint16_t>(maxOffset - minOffset);19351936MSInt16 vminOffset = ParallelMath::MakeSInt16(minOffset);1937MUInt15 vmaxOffset = ParallelMath::MakeUInt15(maxOffset);1938MUInt15 voffsetSpan = ParallelMath::MakeUInt15(offsetSpan);19391940MUInt15 minMultiplier = ParallelMath::MakeUInt15(0);1941for (int block = 0; block < ParallelMath::ParallelSize; block++)1942{1943uint16_t singleAlphaSpan = ParallelMath::Extract(alphaSpan, block);19441945uint16_t lowMultiplier = singleAlphaSpan / offsetSpan;1946ParallelMath::PutUInt15(minMultiplier, block, lowMultiplier);1947}19481949if (is11Bit)1950{1951// Clamps this to valid multipliers under 15 and rounds down to nearest multiple of 81952minMultiplier = ParallelMath::Min(minMultiplier, ParallelMath::MakeUInt15(112)) & ParallelMath::MakeUInt15(120);1953}1954else1955{1956// We cap at 1 and 
14 so both multipliers are valid and dividable1957// Cases where offset span is 0 should be caught by multiplier 1 of table 131958minMultiplier = ParallelMath::Max(ParallelMath::Min(minMultiplier, ParallelMath::MakeUInt15(14)), ParallelMath::MakeUInt15(1));1959}19601961for (uint16_t multiplierOffset = 0; multiplierOffset < 2; multiplierOffset++)1962{1963MUInt15 multiplier = minMultiplier;19641965if (is11Bit)1966{1967if (multiplierOffset == 1)1968multiplier = multiplier + ParallelMath::MakeUInt15(8);1969else1970multiplier = ParallelMath::Max(multiplier, ParallelMath::MakeUInt15(1));1971}1972else1973{1974if (multiplierOffset == 1)1975multiplier = multiplier + ParallelMath::MakeUInt15(1);1976}19771978MSInt16 multipliedMinOffset = ParallelMath::CompactMultiply(ParallelMath::LosslessCast<MSInt16>::Cast(multiplier), vminOffset);1979MUInt15 multipliedMaxOffset = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(multiplier, vmaxOffset));19801981// codeword = (maxOffset + minOffset + minAlpha + maxAlpha) / 21982MSInt16 unclampedBaseAlphaTimes2 = ParallelMath::LosslessCast<MSInt16>::Cast(alphaSpanMidpointTimes2) - ParallelMath::LosslessCast<MSInt16>::Cast(multipliedMaxOffset) - multipliedMinOffset;19831984MUInt15 baseAlpha;1985if (is11Bit)1986{1987// In unsigned, 4 is added to the unquantized alpha, so compensating for that cancels the 4 we have to add to do rounding.1988if (isSigned)1989unclampedBaseAlphaTimes2 = unclampedBaseAlphaTimes2 + ParallelMath::MakeSInt16(8);19901991// -128 is illegal for some reason1992MSInt16 minBaseAlphaTimes2 = isSigned ? 
ParallelMath::MakeSInt16(16) : ParallelMath::MakeSInt16(0);19931994MUInt15 clampedBaseAlphaTimes2 = ParallelMath::Min(ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(unclampedBaseAlphaTimes2, minBaseAlphaTimes2)), ParallelMath::MakeUInt15(4095));1995baseAlpha = ParallelMath::RightShift(clampedBaseAlphaTimes2, 1) & ParallelMath::MakeUInt15(2040);19961997if (!isSigned)1998baseAlpha = baseAlpha + ParallelMath::MakeUInt15(4);1999}2000else2001{2002MUInt15 clampedBaseAlphaTimes2 = ParallelMath::Min(ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(unclampedBaseAlphaTimes2, ParallelMath::MakeSInt16(0))), ParallelMath::MakeUInt15(510));2003baseAlpha = ParallelMath::RightShift(clampedBaseAlphaTimes2 + ParallelMath::MakeUInt15(1), 1);2004}20052006MUInt15 indexes[16];2007MUInt31 totalError = ParallelMath::MakeUInt31(0);2008for (int px = 0; px < 16; px++)2009{2010MUInt15 quantizedValues;2011QuantizeETC2Alpha(tableIndex, pixels[px], baseAlpha, multiplier, is11Bit, isSigned, indexes[px], quantizedValues);20122013if (is11Bit)2014{2015MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(quantizedValues) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px]);2016MSInt32 deltaSq = ParallelMath::XMultiply(delta, delta);2017totalError = totalError + ParallelMath::LosslessCast<MUInt31>::Cast(deltaSq);2018}2019else2020totalError = totalError + ParallelMath::ToUInt31(ParallelMath::SqDiffUInt8(quantizedValues, pixels[px]));2021}20222023ParallelMath::Int16CompFlag isBetter = ParallelMath::Int32FlagToInt16(ParallelMath::Less(totalError, bestTotalError));2024if (ParallelMath::AnySet(isBetter))2025{2026ParallelMath::ConditionalSet(bestTotalError, isBetter, totalError);2027ParallelMath::ConditionalSet(bestTableIndex, isBetter, ParallelMath::MakeUInt15(tableIndex));2028ParallelMath::ConditionalSet(bestBaseCodeword, isBetter, baseAlpha);2029ParallelMath::ConditionalSet(bestMultiplier, isBetter, multiplier);20302031for (int px = 0; px < 16; 
px++)2032ParallelMath::ConditionalSet(bestIndexes[px], isBetter, indexes[px]);2033}20342035// TODO: Do one refine pass2036}2037}2038}20392040if (is11Bit)2041{2042bestMultiplier = ParallelMath::RightShift(bestMultiplier, 3);20432044if (isSigned)2045bestBaseCodeword = bestBaseCodeword ^ ParallelMath::MakeUInt15(0x80);2046}20472048for (int block = 0; block < ParallelMath::ParallelSize; block++)2049{2050uint8_t *output = outputBuffer + block * 8;20512052output[0] = static_cast<uint8_t>(ParallelMath::Extract(bestBaseCodeword, block));20532054ParallelMath::ScalarUInt16 multiplier = ParallelMath::Extract(bestMultiplier, block);2055ParallelMath::ScalarUInt16 tableIndex = ParallelMath::Extract(bestTableIndex, block);20562057output[1] = static_cast<uint8_t>((multiplier << 4) | tableIndex);20582059static const int pixelSelectorOrder[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };20602061ParallelMath::ScalarUInt16 indexes[16];2062for (int px = 0; px < 16; px++)2063indexes[pixelSelectorOrder[px]] = ParallelMath::Extract(bestIndexes[px], block);20642065int outputOffset = 2;2066int outputBits = 0;2067int numOutputBits = 0;2068for (int s = 0; s < 16; s++)2069{2070outputBits = (outputBits << 3) | indexes[s];2071numOutputBits += 3;20722073if (numOutputBits >= 8)2074{2075output[outputOffset++] = static_cast<uint8_t>(outputBits >> (numOutputBits - 8));2076numOutputBits -= 8;20772078outputBits &= ((1 << numOutputBits) - 1);2079}2080}20812082assert(outputOffset == 8 && numOutputBits == 0);2083}2084}20852086void cvtt::Internal::ETCComputer::CompressEACBlock(uint8_t *outputBuffer, const PixelBlockScalarS16 *inputBlocks, bool isSigned, const Options &options)2087{2088MUInt15 pixels[16];2089for (int px = 0; px < 16; px++)2090{2091MSInt16 adjustedPixel;2092for (int block = 0; block < ParallelMath::ParallelSize; block++)2093ParallelMath::PutSInt16(adjustedPixel, block, inputBlocks[block].m_pixels[px]);20942095// We use a slightly shifted range here so we can keep the 
unquantized base color in a UInt152096// That is, signed range is 1..2047, and unsigned range is 0..20472097if (isSigned)2098{2099adjustedPixel = ParallelMath::Min(adjustedPixel, ParallelMath::MakeSInt16(1023)) + ParallelMath::MakeSInt16(1024);2100adjustedPixel = ParallelMath::Max(ParallelMath::MakeSInt16(1), adjustedPixel);2101}2102else2103{2104adjustedPixel = ParallelMath::Min(adjustedPixel, ParallelMath::MakeSInt16(2047));2105adjustedPixel = ParallelMath::Max(ParallelMath::MakeSInt16(0), adjustedPixel);2106}210721082109pixels[px] = ParallelMath::LosslessCast<MUInt15>::Cast(adjustedPixel);2110}21112112CompressETC2AlphaBlockInternal(outputBuffer, pixels, true, isSigned, options);2113}21142115void cvtt::Internal::ETCComputer::CompressETC1Block(uint8_t *outputBuffer, const PixelBlockU8 *inputBlocks, ETC1CompressionData *compressionData, const Options &options)2116{2117DifferentialResolveStorage &drs = static_cast<ETC1CompressionDataInternal*>(compressionData)->m_drs;2118MFloat bestTotalError = ParallelMath::MakeFloat(FLT_MAX);21192120MUInt15 pixels[16][3];2121MFloat preWeightedPixels[16][3];2122ExtractBlocks(pixels, preWeightedPixels, inputBlocks, options);21232124CompressETC1BlockInternal(bestTotalError, outputBuffer, pixels, preWeightedPixels, drs, options, false);2125}21262127void cvtt::Internal::ETCComputer::ExtractBlocks(MUInt15 pixels[16][3], MFloat preWeightedPixels[16][3], const PixelBlockU8 *inputBlocks, const Options &options)2128{2129bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);2130bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);21312132for (int px = 0; px < 16; px++)2133{2134for (int ch = 0; ch < 3; ch++)2135{2136for (int block = 0; block < ParallelMath::ParallelSize; block++)2137ParallelMath::PutUInt15(pixels[px][ch], block, inputBlocks[block].m_pixels[px][ch]);2138}21392140if (isFakeBT709)2141ConvertToFakeBT709(preWeightedPixels[px], pixels[px]);2142else if (isUniform)2143{2144for (int ch = 0; ch < 3; 
ch++)2145preWeightedPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);2146}2147else2148{2149preWeightedPixels[px][0] = ParallelMath::ToFloat(pixels[px][0]) * options.redWeight;2150preWeightedPixels[px][1] = ParallelMath::ToFloat(pixels[px][1]) * options.greenWeight;2151preWeightedPixels[px][2] = ParallelMath::ToFloat(pixels[px][2]) * options.blueWeight;2152}2153}2154}21552156void cvtt::Internal::ETCComputer::ResolveHalfBlockFakeBT709RoundingAccurate(MUInt15 quantized[3], const MUInt15 sectorCumulative[3], bool isDifferential)2157{2158for (int ch = 0; ch < 3; ch++)2159{2160const MUInt15& cu15 = sectorCumulative[ch];21612162if (isDifferential)2163{2164//quantized[ch] = (cu * 31 + (cu >> 3)) >> 11;2165quantized[ch] = ParallelMath::ToUInt15(2166ParallelMath::RightShift(2167(ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3))2168, 11)2169);2170}2171else2172{2173//quantized[ch] = (cu * 30 + (cu >> 3)) >> 12;2174quantized[ch] = ParallelMath::ToUInt15(2175ParallelMath::RightShift(2176(ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15 << 1) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3))2177, 12)2178);2179}2180}21812182MFloat lowOctantRGBFloat[3];2183MFloat highOctantRGBFloat[3];21842185for (int ch = 0; ch < 3; ch++)2186{2187MUInt15 unquantized;2188MUInt15 unquantizedNext;2189if (isDifferential)2190{2191unquantized = (quantized[ch] << 3) | ParallelMath::RightShift(quantized[ch], 2);2192MUInt15 quantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(31), quantized[ch] + ParallelMath::MakeUInt15(1));2193unquantizedNext = (quantizedNext << 3) | ParallelMath::RightShift(quantizedNext, 2);2194}2195else2196{2197unquantized = (quantized[ch] << 4) | quantized[ch];2198unquantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantized + 
ParallelMath::MakeUInt15(17));2199}2200lowOctantRGBFloat[ch] = ParallelMath::ToFloat(unquantized << 3);2201highOctantRGBFloat[ch] = ParallelMath::ToFloat(unquantizedNext << 3);2202}22032204MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);2205MUInt15 bestOctant = ParallelMath::MakeUInt15(0);22062207MFloat cumulativeYUV[3];2208ConvertToFakeBT709(cumulativeYUV, sectorCumulative);22092210for (uint16_t octant = 0; octant < 8; octant++)2211{2212const MFloat &r = (octant & 1) ? highOctantRGBFloat[0] : lowOctantRGBFloat[0];2213const MFloat &g = (octant & 2) ? highOctantRGBFloat[1] : lowOctantRGBFloat[1];2214const MFloat &b = (octant & 4) ? highOctantRGBFloat[2] : lowOctantRGBFloat[2];22152216MFloat octantYUV[3];2217ConvertToFakeBT709(octantYUV, r, g, b);22182219MFloat delta[3];2220for (int ch = 0; ch < 3; ch++)2221delta[ch] = octantYUV[ch] - cumulativeYUV[ch];22222223MFloat error = delta[0] * delta[0] + delta[1] + delta[1] + delta[2] * delta[2];2224ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));2225ParallelMath::ConditionalSet(bestOctant, errorBetter, ParallelMath::MakeUInt15(octant));2226bestError = ParallelMath::Min(error, bestError);2227}22282229for (int ch = 0; ch < 3; ch++)2230quantized[ch] = quantized[ch] + (ParallelMath::RightShift(bestOctant, ch) & ParallelMath::MakeUInt15(1));2231}22322233void cvtt::Internal::ETCComputer::ResolveHalfBlockFakeBT709RoundingFast(MUInt15 quantized[3], const MUInt15 sectorCumulative[3], bool isDifferential)2234{2235// sectorCumulative range is 0..2040 (11 bits)2236MUInt15 roundingOffset = ParallelMath::MakeUInt15(0);22372238MUInt15 rOffset;2239MUInt15 gOffset;2240MUInt15 bOffset;2241MUInt15 quantizedBase[3];2242MUInt15 upperBound;22432244MUInt15 sectorCumulativeFillIn[3];2245for (int ch = 0; ch < 3; ch++)2246sectorCumulativeFillIn[ch] = sectorCumulative[ch] + ParallelMath::RightShift(sectorCumulative[ch], 8);22472248if (isDifferential)2249{2250rOffset = 
(sectorCumulativeFillIn[0] << 6) & ParallelMath::MakeUInt15(0xf00);2251gOffset = (sectorCumulativeFillIn[1] << 4) & ParallelMath::MakeUInt15(0x0f0);2252bOffset = ParallelMath::RightShift(sectorCumulativeFillIn[2], 2) & ParallelMath::MakeUInt15(0x00f);22532254for (int ch = 0; ch < 3; ch++)2255quantizedBase[ch] = ParallelMath::RightShift(sectorCumulativeFillIn[ch], 6);22562257upperBound = ParallelMath::MakeUInt15(31);2258}2259else2260{2261rOffset = (sectorCumulativeFillIn[0] << 5) & ParallelMath::MakeUInt15(0xf00);2262gOffset = (sectorCumulativeFillIn[1] << 1) & ParallelMath::MakeUInt15(0x0f0);2263bOffset = ParallelMath::RightShift(sectorCumulativeFillIn[2], 3) & ParallelMath::MakeUInt15(0x00f);22642265for (int ch = 0; ch < 3; ch++)2266quantizedBase[ch] = ParallelMath::RightShift(sectorCumulativeFillIn[ch], 7);22672268upperBound = ParallelMath::MakeUInt15(15);2269}22702271MUInt15 lookupIndex = (rOffset | gOffset | bOffset);22722273MUInt15 octant;2274for (int block = 0; block < ParallelMath::ParallelSize; block++)2275ParallelMath::PutUInt15(octant, block, Tables::FakeBT709::g_rounding16[ParallelMath::Extract(lookupIndex, block)]);22762277quantizedBase[0] = quantizedBase[0] + (octant & ParallelMath::MakeUInt15(1));2278quantizedBase[1] = quantizedBase[1] + (ParallelMath::RightShift(octant, 1) & ParallelMath::MakeUInt15(1));2279quantizedBase[2] = quantizedBase[2] + (ParallelMath::RightShift(octant, 2) & ParallelMath::MakeUInt15(1));22802281for (int ch = 0; ch < 3; ch++)2282quantized[ch] = ParallelMath::Min(quantizedBase[ch], upperBound);2283}22842285void cvtt::Internal::ETCComputer::ResolveTHFakeBT709Rounding(MUInt15 quantized[3], const MUInt15 targets[3], const MUInt15 &granularity)2286{2287MFloat lowOctantRGBFloat[3];2288MFloat highOctantRGBFloat[3];22892290for (int ch = 0; ch < 3; ch++)2291{2292MUInt15 unquantized = (quantized[ch] << 4) | quantized[ch];2293MUInt15 unquantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantized + 
ParallelMath::MakeUInt15(17));22942295lowOctantRGBFloat[ch] = ParallelMath::ToFloat(ParallelMath::CompactMultiply(unquantized, granularity) << 1);2296highOctantRGBFloat[ch] = ParallelMath::ToFloat(ParallelMath::CompactMultiply(unquantizedNext, granularity) << 1);2297}22982299MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);2300MUInt15 bestOctant = ParallelMath::MakeUInt15(0);23012302MFloat cumulativeYUV[3];2303ConvertToFakeBT709(cumulativeYUV, ParallelMath::ToFloat(targets[0]), ParallelMath::ToFloat(targets[1]), ParallelMath::ToFloat(targets[2]));23042305for (uint16_t octant = 0; octant < 8; octant++)2306{2307const MFloat &r = (octant & 1) ? highOctantRGBFloat[0] : lowOctantRGBFloat[0];2308const MFloat &g = (octant & 2) ? highOctantRGBFloat[1] : lowOctantRGBFloat[1];2309const MFloat &b = (octant & 4) ? highOctantRGBFloat[2] : lowOctantRGBFloat[2];23102311MFloat octantYUV[3];2312ConvertToFakeBT709(octantYUV, r, g, b);23132314MFloat delta[3];2315for (int ch = 0; ch < 3; ch++)2316delta[ch] = octantYUV[ch] - cumulativeYUV[ch];23172318MFloat error = delta[0] * delta[0] + delta[1] + delta[1] + delta[2] * delta[2];2319ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));2320ParallelMath::ConditionalSet(bestOctant, errorBetter, ParallelMath::MakeUInt15(octant));2321bestError = ParallelMath::Min(error, bestError);2322}23232324for (int ch = 0; ch < 3; ch++)2325quantized[ch] = quantized[ch] + (ParallelMath::RightShift(bestOctant, ch) & ParallelMath::MakeUInt15(1));2326}23272328void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[3], const MUInt15 color[3])2329{2330MFloat floatRGB[3];2331for (int ch = 0; ch < 3; ch++)2332floatRGB[ch] = ParallelMath::ToFloat(color[ch]);23332334ConvertToFakeBT709(yuv, floatRGB);2335}23362337void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[3], const MFloat color[3])2338{2339ConvertToFakeBT709(yuv, color[0], color[1], color[2]);2340}23412342void 
cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[3], const MFloat &pr, const MFloat &pg, const MFloat &pb)2343{2344MFloat r = pr;2345MFloat g = pg;2346MFloat b = pb;23472348yuv[0] = r * 0.368233989135369f + g * 1.23876274963149f + b * 0.125054068802017f;2349yuv[1] = r * 0.5f - g * 0.4541529f - b * 0.04584709f;2350yuv[2] = r * -0.081014709086133f - g * 0.272538676238785f + b * 0.353553390593274f;2351}23522353void cvtt::Internal::ETCComputer::ConvertFromFakeBT709(MFloat rgb[3], const MFloat yuv[3])2354{2355MFloat yy = yuv[0] * 0.57735026466774571071f;2356MFloat u = yuv[1];2357MFloat v = yuv[2];23582359rgb[0] = yy + u * 1.5748000207960953486f;2360rgb[1] = yy - u * 0.46812425854364753669f - v * 0.26491652528157560861f;2361rgb[2] = yy + v * 2.6242146882856944069f;2362}236323642365void cvtt::Internal::ETCComputer::QuantizeETC2Alpha(int tableIndex, const MUInt15& value, const MUInt15& baseValue, const MUInt15& multiplier, bool is11Bit, bool isSigned, MUInt15& outIndexes, MUInt15& outQuantizedValues)2366{2367MSInt16 offset = ParallelMath::LosslessCast<MSInt16>::Cast(value) - ParallelMath::LosslessCast<MSInt16>::Cast(baseValue);2368MSInt16 offsetTimes2 = offset + offset;23692370// ETC2's offset tables all have a reflect about 0.5*multiplier2371MSInt16 offsetAboutReflectorTimes2 = offsetTimes2 + ParallelMath::LosslessCast<MSInt16>::Cast(multiplier);23722373MUInt15 absOffsetAboutReflectorTimes2 = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Abs(offsetAboutReflectorTimes2));2374MUInt15 lookupIndex = ParallelMath::RightShift(absOffsetAboutReflectorTimes2, 1);23752376MUInt15 positiveIndex;2377MUInt15 positiveOffsetUnmultiplied;2378for (int block = 0; block < ParallelMath::ParallelSize; block++)2379{2380uint16_t blockLookupIndex = ParallelMath::Extract(lookupIndex, block) / ParallelMath::Extract(multiplier, block);2381if (blockLookupIndex >= Tables::ETC2::g_alphaRoundingTableWidth)2382blockLookupIndex = Tables::ETC2::g_alphaRoundingTableWidth - 1;2383uint16_t 
index = Tables::ETC2::g_alphaRoundingTables[tableIndex][blockLookupIndex];2384ParallelMath::PutUInt15(positiveIndex, block, index);2385ParallelMath::PutUInt15(positiveOffsetUnmultiplied, block, Tables::ETC2::g_alphaModifierTablePositive[tableIndex][index]);23862387// TODO: This is suboptimal when the offset is capped. We should detect 0 and 255 values and always map them to the maximum offsets.2388// Doing that will also affect refinement though.2389}23902391MSInt16 signBits = ParallelMath::RightShift(offsetAboutReflectorTimes2, 15);2392MSInt16 offsetUnmultiplied = ParallelMath::LosslessCast<MSInt16>::Cast(positiveOffsetUnmultiplied) ^ signBits;2393MSInt16 quantizedOffset = ParallelMath::CompactMultiply(offsetUnmultiplied, multiplier);23942395MSInt16 offsetValue = ParallelMath::LosslessCast<MSInt16>::Cast(baseValue) + quantizedOffset;23962397if (is11Bit)2398{2399if (isSigned)2400outQuantizedValues = ParallelMath::Min(ParallelMath::MakeUInt15(2047), ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(1), offsetValue)));2401else2402outQuantizedValues = ParallelMath::Min(ParallelMath::MakeUInt15(2047), ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), offsetValue)));2403}2404else2405outQuantizedValues = ParallelMath::Min(ParallelMath::MakeUInt15(255), ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), offsetValue)));24062407MUInt15 indexSub = ParallelMath::LosslessCast<MUInt15>::Cast(signBits) & ParallelMath::MakeUInt15(4);24082409outIndexes = positiveIndex + ParallelMath::MakeUInt15(4) - indexSub;2410}241124122413void cvtt::Internal::ETCComputer::EmitTModeBlock(uint8_t *outputBuffer, const ParallelMath::ScalarUInt16 lineColor[3], const ParallelMath::ScalarUInt16 isolatedColor[3], int32_t packedSelectors, ParallelMath::ScalarUInt16 table, bool opaque)2414{2415static const int selectorOrder[] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 
};24162417uint32_t lowBits = 0;2418uint32_t highBits = 0;24192420int rh = ((isolatedColor[0] >> 2) & 3);2421int rl = (isolatedColor[0] & 3);24222423if (rh + rl < 4)2424{2425// Overflow low2426highBits |= 1 << (58 - 32);2427}2428else2429{2430// Overflow high2431highBits |= 7 << (61 - 32);2432}24332434highBits |= rh << (59 - 32);2435highBits |= rl << (56 - 32);2436highBits |= isolatedColor[1] << (52 - 32);2437highBits |= isolatedColor[2] << (48 - 32);2438highBits |= lineColor[0] << (44 - 32);2439highBits |= lineColor[1] << (40 - 32);2440highBits |= lineColor[2] << (36 - 32);2441highBits |= ((table >> 1) & 3) << (34 - 32);2442if (opaque)2443highBits |= 1 << (33 - 32);2444highBits |= (table & 1) << (32 - 32);24452446for (int px = 0; px < 16; px++)2447{2448int sel = (packedSelectors >> (2 * selectorOrder[px])) & 3;2449if ((sel & 0x1) != 0)2450lowBits |= (1 << px);2451if ((sel & 0x2) != 0)2452lowBits |= (1 << (16 + px));2453}24542455for (int i = 0; i < 4; i++)2456outputBuffer[i] = (highBits >> (24 - i * 8)) & 0xff;2457for (int i = 0; i < 4; i++)2458outputBuffer[i + 4] = (lowBits >> (24 - i * 8)) & 0xff;2459}24602461void cvtt::Internal::ETCComputer::EmitHModeBlock(uint8_t *outputBuffer, const ParallelMath::ScalarUInt16 blockColors[2], ParallelMath::ScalarUInt16 sectorBits, ParallelMath::ScalarUInt16 signBits, ParallelMath::ScalarUInt16 table, bool opaque)2462{2463if (blockColors[0] == blockColors[1])2464{2465// Base colors are the same.2466// If the table low bit isn't 1, then we can't encode this, because swapping the block colors will have no effect2467// on their order.2468// Instead, we encode this as T mode where all of the indexes are on the line.24692470ParallelMath::ScalarUInt16 lineColor[3];2471ParallelMath::ScalarUInt16 isolatedColor[3];24722473lineColor[0] = isolatedColor[0] = (blockColors[0] >> 10) & 0x1f;2474lineColor[1] = isolatedColor[1] = (blockColors[0] >> 5) & 0x1f;2475lineColor[2] = isolatedColor[2] = (blockColors[0] >> 0) & 0x1f;24762477int32_t 
packedSelectors = 0x55555555;2478for (int px = 0; px < 16; px++)2479packedSelectors |= ((signBits >> px) & 1) << ((px * 2) + 1);24802481EmitTModeBlock(outputBuffer, lineColor, isolatedColor, packedSelectors, table, opaque);2482return;2483}24842485static const int selectorOrder[] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };24862487int16_t colors[2][3];2488for (int sector = 0; sector < 2; sector++)2489{2490for (int ch = 0; ch < 3; ch++)2491colors[sector][ch] = (blockColors[sector] >> ((2 - ch) * 5)) & 15;2492}24932494uint32_t lowBits = 0;2495uint32_t highBits = 0;24962497if (((table & 1) == 1) != (blockColors[0] > blockColors[1]))2498{2499for (int ch = 0; ch < 3; ch++)2500std::swap(colors[0][ch], colors[1][ch]);2501sectorBits ^= 0xffff;2502}25032504int r1 = colors[0][0];2505int g1a = colors[0][1] >> 1;2506int g1b = (colors[0][1] & 1);2507int b1a = colors[0][2] >> 3;2508int b1b = colors[0][2] & 7;2509int r2 = colors[1][0];2510int g2 = colors[1][1];2511int b2 = colors[1][2];25122513// Avoid overflowing R2514if ((g1a & 4) != 0 && r1 + g1a < 8)2515highBits |= 1 << (63 - 32);25162517int fakeDG = b1b >> 1;2518int fakeG = b1a | (g1b << 1);25192520if (fakeG + fakeDG < 4)2521{2522// Overflow low2523highBits |= 1 << (50 - 32);2524}2525else2526{2527// Overflow high2528highBits |= 7 << (53 - 32);2529}25302531int da = (table >> 2) & 1;2532int db = (table >> 1) & 1;25332534highBits |= r1 << (59 - 32);2535highBits |= g1a << (56 - 32);2536highBits |= g1b << (52 - 32);2537highBits |= b1a << (51 - 32);2538highBits |= b1b << (47 - 32);2539highBits |= r2 << (43 - 32);2540highBits |= g2 << (39 - 32);2541highBits |= b2 << (35 - 32);2542highBits |= da << (34 - 32);2543if (opaque)2544highBits |= 1 << (33 - 32);2545highBits |= db << (32 - 32);25462547for (int px = 0; px < 16; px++)2548{2549int sectorBit = (sectorBits >> selectorOrder[px]) & 1;2550int signBit = (signBits >> selectorOrder[px]) & 1;25512552lowBits |= (signBit << px);2553lowBits |= (sectorBit << (16 + 
px));2554}25552556uint8_t *output = outputBuffer;25572558for (int i = 0; i < 4; i++)2559output[i] = (highBits >> (24 - i * 8)) & 0xff;2560for (int i = 0; i < 4; i++)2561output[i + 4] = (lowBits >> (24 - i * 8)) & 0xff;2562}25632564void cvtt::Internal::ETCComputer::EmitETC1Block(uint8_t *outputBuffer, int blockBestFlip, int blockBestD, const int blockBestColors[2][3], const int blockBestTables[2], const ParallelMath::ScalarUInt16 blockBestSelectors[2], bool transparent)2565{2566uint32_t highBits = 0;2567uint32_t lowBits = 0;25682569if (blockBestD == 0)2570{2571highBits |= blockBestColors[0][0] << 28;2572highBits |= blockBestColors[1][0] << 24;2573highBits |= blockBestColors[0][1] << 20;2574highBits |= blockBestColors[1][1] << 16;2575highBits |= blockBestColors[0][2] << 12;2576highBits |= blockBestColors[1][2] << 8;2577}2578else2579{2580highBits |= blockBestColors[0][0] << 27;2581highBits |= ((blockBestColors[1][0] - blockBestColors[0][0]) & 7) << 24;2582highBits |= blockBestColors[0][1] << 19;2583highBits |= ((blockBestColors[1][1] - blockBestColors[0][1]) & 7) << 16;2584highBits |= blockBestColors[0][2] << 11;2585highBits |= ((blockBestColors[1][2] - blockBestColors[0][2]) & 7) << 8;2586}25872588highBits |= (blockBestTables[0] << 5);2589highBits |= (blockBestTables[1] << 2);2590if (!transparent)2591highBits |= (blockBestD << 1);2592highBits |= blockBestFlip;25932594const uint8_t modifierCodes[4] = { 3, 2, 0, 1 };25952596uint8_t unpackedSelectors[16];2597uint8_t unpackedSelectorCodes[16];2598for (int sector = 0; sector < 2; sector++)2599{2600int blockSectorBestSelectors = blockBestSelectors[sector];26012602for (int px = 0; px < 8; px++)2603{2604int selector = (blockSectorBestSelectors >> (2 * px)) & 3;2605unpackedSelectorCodes[g_flipTables[blockBestFlip][sector][px]] = modifierCodes[selector];2606unpackedSelectors[g_flipTables[blockBestFlip][sector][px]] = selector;2607}2608}26092610const int pixelSelectorOrder[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 
11, 15 };26112612int lowBitOffset = 0;2613for (int sb = 0; sb < 2; sb++)2614for (int px = 0; px < 16; px++)2615lowBits |= ((unpackedSelectorCodes[pixelSelectorOrder[px]] >> sb) & 1) << (px + sb * 16);26162617for (int i = 0; i < 4; i++)2618outputBuffer[i] = (highBits >> (24 - i * 8)) & 0xff;2619for (int i = 0; i < 4; i++)2620outputBuffer[i + 4] = (lowBits >> (24 - i * 8)) & 0xff;2621}26222623void cvtt::Internal::ETCComputer::CompressETC1BlockInternal(MFloat &bestTotalError, uint8_t *outputBuffer, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], DifferentialResolveStorage &drs, const Options &options, bool punchthrough)2624{2625int numTries = 0;26262627MUInt15 zeroU15 = ParallelMath::MakeUInt15(0);2628MUInt16 zeroU16 = ParallelMath::MakeUInt16(0);26292630MUInt15 bestColors[2] = { zeroU15, zeroU15 };2631MUInt16 bestSelectors[2] = { zeroU16, zeroU16 };2632MUInt15 bestTables[2] = { zeroU15, zeroU15 };2633MUInt15 bestFlip = zeroU15;2634MUInt15 bestD = zeroU15;26352636MUInt15 sectorPixels[2][2][8][3];2637MFloat sectorPreWeightedPixels[2][2][8][3];2638MUInt15 sectorCumulative[2][2][3];26392640ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);26412642for (int flip = 0; flip < 2; flip++)2643{2644for (int sector = 0; sector < 2; sector++)2645{2646for (int ch = 0; ch < 3; ch++)2647sectorCumulative[flip][sector][ch] = zeroU15;26482649for (int px = 0; px < 8; px++)2650{2651for (int ch = 0; ch < 3; ch++)2652{2653MUInt15 pixelChannelValue = pixels[g_flipTables[flip][sector][px]][ch];2654sectorPixels[flip][sector][px][ch] = pixelChannelValue;2655sectorPreWeightedPixels[flip][sector][px][ch] = preWeightedPixels[g_flipTables[flip][sector][px]][ch];2656sectorCumulative[flip][sector][ch] = sectorCumulative[flip][sector][ch] + pixelChannelValue;2657}2658}2659}2660}26612662static const MSInt16 modifierTables[8][4] =2663{2664{ ParallelMath::MakeSInt16(-8), ParallelMath::MakeSInt16(-2), ParallelMath::MakeSInt16(2), 
ParallelMath::MakeSInt16(8) },2665{ ParallelMath::MakeSInt16(-17), ParallelMath::MakeSInt16(-5), ParallelMath::MakeSInt16(5), ParallelMath::MakeSInt16(17) },2666{ ParallelMath::MakeSInt16(-29), ParallelMath::MakeSInt16(-9), ParallelMath::MakeSInt16(9), ParallelMath::MakeSInt16(29) },2667{ ParallelMath::MakeSInt16(-42), ParallelMath::MakeSInt16(-13), ParallelMath::MakeSInt16(13), ParallelMath::MakeSInt16(42) },2668{ ParallelMath::MakeSInt16(-60), ParallelMath::MakeSInt16(-18), ParallelMath::MakeSInt16(18), ParallelMath::MakeSInt16(60) },2669{ ParallelMath::MakeSInt16(-80), ParallelMath::MakeSInt16(-24), ParallelMath::MakeSInt16(24), ParallelMath::MakeSInt16(80) },2670{ ParallelMath::MakeSInt16(-106), ParallelMath::MakeSInt16(-33), ParallelMath::MakeSInt16(33), ParallelMath::MakeSInt16(106) },2671{ ParallelMath::MakeSInt16(-183), ParallelMath::MakeSInt16(-47), ParallelMath::MakeSInt16(47), ParallelMath::MakeSInt16(183) },2672};26732674bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);26752676int minD = punchthrough ? 
1 : 0;26772678for (int flip = 0; flip < 2; flip++)2679{2680drs.diffNumAttempts[0] = drs.diffNumAttempts[1] = zeroU15;26812682MFloat bestIndError[2] = { ParallelMath::MakeFloat(FLT_MAX), ParallelMath::MakeFloat(FLT_MAX) };2683MUInt16 bestIndSelectors[2] = { ParallelMath::MakeUInt16(0), ParallelMath::MakeUInt16(0) };2684MUInt15 bestIndColors[2] = { zeroU15, zeroU15 };2685MUInt15 bestIndTable[2] = { zeroU15, zeroU15 };26862687for (int d = minD; d < 2; d++)2688{2689for (int sector = 0; sector < 2; sector++)2690{2691const int16_t *potentialOffsets = cvtt::Tables::ETC1::g_potentialOffsets4;26922693for (int table = 0; table < 8; table++)2694{2695int16_t numOffsets = *potentialOffsets++;26962697MUInt15 possibleColors[cvtt::Tables::ETC1::g_maxPotentialOffsets];26982699MUInt15 quantized[3];2700for (int oi = 0; oi < numOffsets; oi++)2701{2702if (!isFakeBT709)2703{2704for (int ch = 0; ch < 3; ch++)2705{2706// cu is in range 0..20402707MUInt15 cu15 = ParallelMath::Min(2708ParallelMath::MakeUInt15(2040),2709ParallelMath::ToUInt15(2710ParallelMath::Max(2711ParallelMath::MakeSInt16(0),2712ParallelMath::LosslessCast<MSInt16>::Cast(sectorCumulative[flip][sector][ch]) + ParallelMath::MakeSInt16(potentialOffsets[oi])2713)2714)2715);27162717if (d == 1)2718{2719//quantized[ch] = (cu * 31 + (cu >> 3) + 1024) >> 11;2720quantized[ch] = ParallelMath::ToUInt15(2721ParallelMath::RightShift(2722(ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3)) + ParallelMath::MakeUInt16(1024)2723, 11)2724);2725}2726else2727{2728//quantized[ch] = (cu * 30 + (cu >> 3) + 2048) >> 12;2729quantized[ch] = ParallelMath::ToUInt15(2730ParallelMath::RightShift(2731(ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15 << 1) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3)) + ParallelMath::MakeUInt16(2048)2732, 
12)2733);2734}2735}2736}2737else2738{2739MUInt15 offsetCumulative[3];2740for (int ch = 0; ch < 3; ch++)2741{2742// cu is in range 0..20402743MUInt15 cu15 = ParallelMath::Min(2744ParallelMath::MakeUInt15(2040),2745ParallelMath::ToUInt15(2746ParallelMath::Max(2747ParallelMath::MakeSInt16(0),2748ParallelMath::LosslessCast<MSInt16>::Cast(sectorCumulative[flip][sector][ch]) + ParallelMath::MakeSInt16(potentialOffsets[oi])2749)2750)2751);27522753offsetCumulative[ch] = cu15;2754}27552756if ((options.flags & cvtt::Flags::ETC_FakeBT709Accurate) != 0)2757ResolveHalfBlockFakeBT709RoundingAccurate(quantized, offsetCumulative, d == 1);2758else2759ResolveHalfBlockFakeBT709RoundingFast(quantized, offsetCumulative, d == 1);2760}27612762possibleColors[oi] = quantized[0] | (quantized[1] << 5) | (quantized[2] << 10);2763}27642765potentialOffsets += numOffsets;27662767ParallelMath::UInt15 numUniqueColors;2768for (int block = 0; block < ParallelMath::ParallelSize; block++)2769{2770uint16_t blockNumUniqueColors = 1;2771for (int i = 1; i < numOffsets; i++)2772{2773uint16_t color = ParallelMath::Extract(possibleColors[i], block);2774if (color != ParallelMath::Extract(possibleColors[blockNumUniqueColors - 1], block))2775ParallelMath::PutUInt15(possibleColors[blockNumUniqueColors++], block, color);2776}27772778ParallelMath::PutUInt15(numUniqueColors, block, blockNumUniqueColors);2779}27802781int maxUniqueColors = ParallelMath::Extract(numUniqueColors, 0);2782for (int block = 1; block < ParallelMath::ParallelSize; block++)2783maxUniqueColors = std::max<int>(maxUniqueColors, ParallelMath::Extract(numUniqueColors, block));27842785for (int block = 0; block < ParallelMath::ParallelSize; block++)2786{2787uint16_t fillColor = ParallelMath::Extract(possibleColors[0], block);2788for (int i = ParallelMath::Extract(numUniqueColors, block); i < maxUniqueColors; i++)2789ParallelMath::PutUInt15(possibleColors[i], block, fillColor);2790}27912792for (int i = 0; i < maxUniqueColors; i++)2793{2794MFloat 
error = ParallelMath::MakeFloatZero();2795MUInt16 selectors = ParallelMath::MakeUInt16(0);2796MUInt15 quantized = possibleColors[i];2797TestHalfBlock(error, selectors, quantized, sectorPixels[flip][sector], sectorPreWeightedPixels[flip][sector], modifierTables[table], d == 1, options);27982799if (d == 0)2800{2801ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestIndError[sector]));2802if (ParallelMath::AnySet(errorBetter))2803{2804bestIndError[sector] = ParallelMath::Min(error, bestIndError[sector]);2805ParallelMath::ConditionalSet(bestIndSelectors[sector], errorBetter, selectors);2806ParallelMath::ConditionalSet(bestIndColors[sector], errorBetter, quantized);2807ParallelMath::ConditionalSet(bestIndTable[sector], errorBetter, ParallelMath::MakeUInt15(table));2808}2809}2810else2811{2812ParallelMath::Int16CompFlag isInBounds = ParallelMath::Less(ParallelMath::MakeUInt15(i), numUniqueColors);28132814MUInt15 storageIndexes = drs.diffNumAttempts[sector];2815drs.diffNumAttempts[sector] = drs.diffNumAttempts[sector] + ParallelMath::SelectOrZero(isInBounds, ParallelMath::MakeUInt15(1));28162817for (int block = 0; block < ParallelMath::ParallelSize; block++)2818{2819int storageIndex = ParallelMath::Extract(storageIndexes, block);28202821ParallelMath::PutFloat(drs.diffErrors[sector][storageIndex], block, ParallelMath::Extract(error, block));2822ParallelMath::PutUInt16(drs.diffSelectors[sector][storageIndex], block, ParallelMath::Extract(selectors, block));2823ParallelMath::PutUInt15(drs.diffColors[sector][storageIndex], block, ParallelMath::Extract(quantized, block));2824ParallelMath::PutUInt15(drs.diffTables[sector][storageIndex], block, table);2825}2826}2827}2828}2829}28302831if (d == 0)2832{2833MFloat bestIndErrorTotal = bestIndError[0] + bestIndError[1];2834ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(bestIndErrorTotal, bestTotalError));2835if 
(ParallelMath::AnySet(errorBetter))2836{2837bestIsThisMode = bestIsThisMode | errorBetter;28382839bestTotalError = ParallelMath::Min(bestTotalError, bestIndErrorTotal);2840ParallelMath::ConditionalSet(bestFlip, errorBetter, ParallelMath::MakeUInt15(flip));2841ParallelMath::ConditionalSet(bestD, errorBetter, ParallelMath::MakeUInt15(d));2842for (int sector = 0; sector < 2; sector++)2843{2844ParallelMath::ConditionalSet(bestColors[sector], errorBetter, bestIndColors[sector]);2845ParallelMath::ConditionalSet(bestSelectors[sector], errorBetter, bestIndSelectors[sector]);2846ParallelMath::ConditionalSet(bestTables[sector], errorBetter, bestIndTable[sector]);2847}2848}2849}2850else2851{2852ParallelMath::Int16CompFlag canIgnoreSector[2] = { ParallelMath::MakeBoolInt16(false), ParallelMath::MakeBoolInt16(false) };2853FindBestDifferentialCombination(flip, d, canIgnoreSector, bestIsThisMode, bestTotalError, bestFlip, bestD, bestColors, bestSelectors, bestTables, drs);2854}2855}2856}28572858for (int block = 0; block < ParallelMath::ParallelSize; block++)2859{2860if (!ParallelMath::Extract(bestIsThisMode, block))2861continue;28622863uint32_t highBits = 0;2864uint32_t lowBits = 0;28652866int blockBestFlip = ParallelMath::Extract(bestFlip, block);2867int blockBestD = ParallelMath::Extract(bestD, block);2868int blockBestTables[2] = { ParallelMath::Extract(bestTables[0], block), ParallelMath::Extract(bestTables[1], block) };2869ParallelMath::ScalarUInt16 blockBestSelectors[2] = { ParallelMath::Extract(bestSelectors[0], block), ParallelMath::Extract(bestSelectors[1], block) };28702871int colors[2][3];2872for (int sector = 0; sector < 2; sector++)2873{2874int sectorColor = ParallelMath::Extract(bestColors[sector], block);2875for (int ch = 0; ch < 3; ch++)2876colors[sector][ch] = (sectorColor >> (ch * 5)) & 31;2877}28782879EmitETC1Block(outputBuffer + block * 8, blockBestFlip, blockBestD, colors, blockBestTables, blockBestSelectors, false);2880}2881}288228832884void 
cvtt::Internal::ETCComputer::CompressETC1PunchthroughBlockInternal(MFloat &bestTotalError, uint8_t *outputBuffer, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const ParallelMath::Int16CompFlag isTransparent[16], DifferentialResolveStorage &drs, const Options &options)2885{2886int numTries = 0;28872888MUInt15 zeroU15 = ParallelMath::MakeUInt15(0);2889MUInt16 zeroU16 = ParallelMath::MakeUInt16(0);28902891MUInt15 bestColors[2] = { zeroU15, zeroU15 };2892MUInt16 bestSelectors[2] = { zeroU16, zeroU16 };2893MUInt15 bestTables[2] = { zeroU15, zeroU15 };2894MUInt15 bestFlip = zeroU15;28952896MUInt15 sectorPixels[2][2][8][3];2897ParallelMath::Int16CompFlag sectorTransparent[2][2][8];2898MFloat sectorPreWeightedPixels[2][2][8][3];2899MUInt15 sectorCumulative[2][2][3];29002901ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);29022903for (int flip = 0; flip < 2; flip++)2904{2905for (int sector = 0; sector < 2; sector++)2906{2907for (int ch = 0; ch < 3; ch++)2908sectorCumulative[flip][sector][ch] = zeroU15;29092910for (int px = 0; px < 8; px++)2911{2912for (int ch = 0; ch < 3; ch++)2913{2914MUInt15 pixelChannelValue = pixels[g_flipTables[flip][sector][px]][ch];2915sectorPixels[flip][sector][px][ch] = pixelChannelValue;2916sectorPreWeightedPixels[flip][sector][px][ch] = preWeightedPixels[g_flipTables[flip][sector][px]][ch];2917sectorCumulative[flip][sector][ch] = sectorCumulative[flip][sector][ch] + pixelChannelValue;2918}29192920sectorTransparent[flip][sector][px] = isTransparent[g_flipTables[flip][sector][px]];2921}2922}2923}29242925static const MUInt15 modifiers[8] =2926{2927ParallelMath::MakeUInt15(8),2928ParallelMath::MakeUInt15(17),2929ParallelMath::MakeUInt15(29),2930ParallelMath::MakeUInt15(42),2931ParallelMath::MakeUInt15(60),2932ParallelMath::MakeUInt15(80),2933ParallelMath::MakeUInt15(106),2934ParallelMath::MakeUInt15(183),2935};29362937bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 
0);29382939const int maxSectorCumulativeOffsets = 17;29402941for (int flip = 0; flip < 2; flip++)2942{2943ParallelMath::Int16CompFlag canIgnoreSector[2] = { ParallelMath::MakeBoolInt16(true), ParallelMath::MakeBoolInt16(false) };29442945for (int sector = 0; sector < 2; sector++)2946for (int px = 0; px < 8; px++)2947canIgnoreSector[sector] = canIgnoreSector[sector] & sectorTransparent[flip][sector][px];29482949drs.diffNumAttempts[0] = drs.diffNumAttempts[1] = zeroU15;29502951for (int sector = 0; sector < 2; sector++)2952{2953MUInt15 sectorNumOpaque = ParallelMath::MakeUInt15(0);2954for (int px = 0; px < 8; px++)2955sectorNumOpaque = sectorNumOpaque + ParallelMath::SelectOrZero(sectorTransparent[flip][sector][px], ParallelMath::MakeUInt15(1));29562957int sectorMaxOpaque = 0;2958for (int block = 0; block < ParallelMath::ParallelSize; block++)2959sectorMaxOpaque = std::max<int>(sectorMaxOpaque, ParallelMath::Extract(sectorNumOpaque, block));29602961int sectorNumOpaqueMultipliers = sectorMaxOpaque * 2 + 1;29622963MUInt15 sectorNumOpaqueDenominator = ParallelMath::Max(ParallelMath::MakeUInt15(1), sectorNumOpaque) << 8;2964MUInt15 sectorNumOpaqueAddend = sectorNumOpaque << 7;29652966MSInt16 sectorNumOpaqueSigned = ParallelMath::LosslessCast<MSInt16>::Cast(sectorNumOpaque);2967MSInt16 negSectorNumOpaqueSigned = ParallelMath::MakeSInt16(0) - sectorNumOpaqueSigned;29682969MUInt15 sectorCumulativeMax = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(ParallelMath::MakeUInt15(255), sectorNumOpaque));29702971for (int table = 0; table < 8; table++)2972{2973MUInt15 possibleColors[maxSectorCumulativeOffsets];29742975MUInt15 quantized[3];2976for (int om = -sectorMaxOpaque; om <= sectorMaxOpaque; om++)2977{2978MSInt16 clampedOffsetMult = ParallelMath::Max(ParallelMath::Min(ParallelMath::MakeSInt16(om), sectorNumOpaqueSigned), negSectorNumOpaqueSigned);2979MSInt16 offset = ParallelMath::CompactMultiply(clampedOffsetMult, modifiers[table]);29802981for (int ch = 
0; ch < 3; ch++)2982{2983// cu is in range 0..255*numOpaque (at most 0..2040)2984MUInt15 cu15 = ParallelMath::Min(2985sectorCumulativeMax,2986ParallelMath::ToUInt15(2987ParallelMath::Max(2988ParallelMath::MakeSInt16(0),2989ParallelMath::LosslessCast<MSInt16>::Cast(sectorCumulative[flip][sector][ch]) + offset2990)2991)2992);29932994//quantized[ch] = (cu * 31 + (cu >> 3) + (numOpaque * 128)) / (numOpaque * 256)2995MUInt16 cuTimes31 = (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15);2996MUInt15 cuDiv8 = ParallelMath::RightShift(cu15, 3);2997MUInt16 numerator = cuTimes31 + ParallelMath::LosslessCast<MUInt16>::Cast(cuDiv8 + sectorNumOpaqueAddend);2998for (int block = 0; block < ParallelMath::ParallelSize; block++)2999ParallelMath::PutUInt15(quantized[ch], block, ParallelMath::Extract(numerator, block) / ParallelMath::Extract(sectorNumOpaqueDenominator, block));3000}30013002possibleColors[om + sectorMaxOpaque] = quantized[0] | (quantized[1] << 5) | (quantized[2] << 10);3003}30043005ParallelMath::UInt15 numUniqueColors;3006for (int block = 0; block < ParallelMath::ParallelSize; block++)3007{3008uint16_t blockNumUniqueColors = 1;3009for (int i = 1; i < sectorNumOpaqueMultipliers; i++)3010{3011uint16_t color = ParallelMath::Extract(possibleColors[i], block);3012if (color != ParallelMath::Extract(possibleColors[blockNumUniqueColors - 1], block))3013ParallelMath::PutUInt15(possibleColors[blockNumUniqueColors++], block, color);3014}30153016ParallelMath::PutUInt15(numUniqueColors, block, blockNumUniqueColors);3017}30183019int maxUniqueColors = ParallelMath::Extract(numUniqueColors, 0);3020for (int block = 1; block < ParallelMath::ParallelSize; block++)3021maxUniqueColors = std::max<int>(maxUniqueColors, ParallelMath::Extract(numUniqueColors, block));30223023for (int block = 0; block < ParallelMath::ParallelSize; block++)3024{3025uint16_t fillColor = ParallelMath::Extract(possibleColors[0], block);3026for (int i = 
ParallelMath::Extract(numUniqueColors, block); i < maxUniqueColors; i++)3027ParallelMath::PutUInt15(possibleColors[i], block, fillColor);3028}30293030for (int i = 0; i < maxUniqueColors; i++)3031{3032MFloat error = ParallelMath::MakeFloatZero();3033MUInt16 selectors = ParallelMath::MakeUInt16(0);3034MUInt15 quantized = possibleColors[i];3035TestHalfBlockPunchthrough(error, selectors, quantized, sectorPixels[flip][sector], sectorPreWeightedPixels[flip][sector], sectorTransparent[flip][sector], modifiers[table], options);30363037ParallelMath::Int16CompFlag isInBounds = ParallelMath::Less(ParallelMath::MakeUInt15(i), numUniqueColors);30383039MUInt15 storageIndexes = drs.diffNumAttempts[sector];3040drs.diffNumAttempts[sector] = drs.diffNumAttempts[sector] + ParallelMath::SelectOrZero(isInBounds, ParallelMath::MakeUInt15(1));30413042for (int block = 0; block < ParallelMath::ParallelSize; block++)3043{3044int storageIndex = ParallelMath::Extract(storageIndexes, block);30453046ParallelMath::PutFloat(drs.diffErrors[sector][storageIndex], block, ParallelMath::Extract(error, block));3047ParallelMath::PutUInt16(drs.diffSelectors[sector][storageIndex], block, ParallelMath::Extract(selectors, block));3048ParallelMath::PutUInt15(drs.diffColors[sector][storageIndex], block, ParallelMath::Extract(quantized, block));3049ParallelMath::PutUInt15(drs.diffTables[sector][storageIndex], block, table);3050}3051}3052}3053}30543055MUInt15 bestDDummy = ParallelMath::MakeUInt15(0);3056FindBestDifferentialCombination(flip, 1, canIgnoreSector, bestIsThisMode, bestTotalError, bestFlip, bestDDummy, bestColors, bestSelectors, bestTables, drs);3057}30583059for (int block = 0; block < ParallelMath::ParallelSize; block++)3060{3061if (!ParallelMath::Extract(bestIsThisMode, block))3062continue;30633064int blockBestColors[2][3];3065int blockBestTables[2];3066ParallelMath::ScalarUInt16 blockBestSelectors[2];3067for (int sector = 0; sector < 2; sector++)3068{3069int sectorColor = 
ParallelMath::Extract(bestColors[sector], block);3070for (int ch = 0; ch < 3; ch++)3071blockBestColors[sector][ch] = (sectorColor >> (ch * 5)) & 31;30723073blockBestTables[sector] = ParallelMath::Extract(bestTables[sector], block);3074blockBestSelectors[sector] = ParallelMath::Extract(bestSelectors[sector], block);3075}30763077EmitETC1Block(outputBuffer + block * 8, ParallelMath::Extract(bestFlip, block), 1, blockBestColors, blockBestTables, blockBestSelectors, true);3078}3079}308030813082cvtt::ETC1CompressionData *cvtt::Internal::ETCComputer::AllocETC1Data(cvtt::Kernels::allocFunc_t allocFunc, void *context)3083{3084void *buffer = allocFunc(context, sizeof(cvtt::Internal::ETCComputer::ETC1CompressionDataInternal));3085if (!buffer)3086return NULL;3087new (buffer) cvtt::Internal::ETCComputer::ETC1CompressionDataInternal(context);3088return static_cast<ETC1CompressionData*>(buffer);3089}30903091void cvtt::Internal::ETCComputer::ReleaseETC1Data(ETC1CompressionData *compressionData, cvtt::Kernels::freeFunc_t freeFunc)3092{3093cvtt::Internal::ETCComputer::ETC1CompressionDataInternal* internalData = static_cast<cvtt::Internal::ETCComputer::ETC1CompressionDataInternal*>(compressionData);3094void *context = internalData->m_context;3095internalData->~ETC1CompressionDataInternal();3096freeFunc(context, compressionData, sizeof(cvtt::Internal::ETCComputer::ETC1CompressionDataInternal));3097}30983099cvtt::ETC2CompressionData *cvtt::Internal::ETCComputer::AllocETC2Data(cvtt::Kernels::allocFunc_t allocFunc, void *context, const cvtt::Options &options)3100{3101void *buffer = allocFunc(context, sizeof(cvtt::Internal::ETCComputer::ETC2CompressionDataInternal));3102if (!buffer)3103return NULL;3104new (buffer) cvtt::Internal::ETCComputer::ETC2CompressionDataInternal(context, options);3105return static_cast<ETC2CompressionData*>(buffer);3106}31073108void cvtt::Internal::ETCComputer::ReleaseETC2Data(ETC2CompressionData *compressionData, cvtt::Kernels::freeFunc_t 
freeFunc)3109{3110cvtt::Internal::ETCComputer::ETC2CompressionDataInternal* internalData = static_cast<cvtt::Internal::ETCComputer::ETC2CompressionDataInternal*>(compressionData);3111void *context = internalData->m_context;3112internalData->~ETC2CompressionDataInternal();3113freeFunc(context, compressionData, sizeof(cvtt::Internal::ETCComputer::ETC2CompressionDataInternal));3114}31153116cvtt::Internal::ETCComputer::ETC2CompressionDataInternal::ETC2CompressionDataInternal(void *context, const cvtt::Options &options)3117: m_context(context)3118{3119const float cd[3] = { options.redWeight, options.greenWeight, options.blueWeight };3120const float rotCD[3] = { cd[1], cd[2], cd[0] };31213122const float offs = -(rotCD[0] * cd[0] + rotCD[1] * cd[1] + rotCD[2] * cd[2]) / (cd[0] * cd[0] + cd[1] * cd[1] + cd[2] * cd[2]);31233124const float chromaAxis0[3] = { rotCD[0] + cd[0] * offs, rotCD[1] + cd[1] * offs, rotCD[2] + cd[2] * offs };31253126const float chromaAxis1Unnormalized[3] =3127{3128chromaAxis0[1] * cd[2] - chromaAxis0[2] * cd[1],3129chromaAxis0[2] * cd[0] - chromaAxis0[0] * cd[2],3130chromaAxis0[0] * cd[1] - chromaAxis0[1] * cd[0]3131};31323133const float ca0LengthSq = (chromaAxis0[0] * chromaAxis0[0] + chromaAxis0[1] * chromaAxis0[1] + chromaAxis0[2] * chromaAxis0[2]);3134const float ca1UNLengthSq = (chromaAxis1Unnormalized[0] * chromaAxis1Unnormalized[0] + chromaAxis1Unnormalized[1] * chromaAxis1Unnormalized[1] + chromaAxis1Unnormalized[2] * chromaAxis1Unnormalized[2]);3135const float lengthRatio = static_cast<float>(std::sqrt(ca0LengthSq / ca1UNLengthSq));31363137const float chromaAxis1[3] = { chromaAxis1Unnormalized[0] * lengthRatio, chromaAxis1Unnormalized[1] * lengthRatio, chromaAxis1Unnormalized[2] * lengthRatio };31383139for (int i = 0; i < 3; i++)3140{3141m_chromaSideAxis0[i] = chromaAxis0[i];3142m_chromaSideAxis1[i] = chromaAxis1[i];3143}3144}31453146#endif314731483149