// Path: blob/master/thirdparty/cvtt/ConvectionKernels_S3TC.cpp
// (web viewer residue: 9903 views)
/*1Convection Texture Tools2Copyright (c) 2018-2019 Eric Lasota34Permission is hereby granted, free of charge, to any person obtaining5a copy of this software and associated documentation files (the6"Software"), to deal in the Software without restriction, including7without limitation the rights to use, copy, modify, merge, publish,8distribute, sublicense, and/or sell copies of the Software, and to9permit persons to whom the Software is furnished to do so, subject10to the following conditions:1112The above copyright notice and this permission notice shall be included13in all copies or substantial portions of the Software.1415THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS16OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF17MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.18IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY19CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,20TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE21SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.2223-------------------------------------------------------------------------------------2425Portions based on DirectX Texture Library (DirectXTex)2627Copyright (c) Microsoft Corporation. 
All rights reserved.28Licensed under the MIT License.2930http://go.microsoft.com/fwlink/?LinkId=24892631*/32#include "ConvectionKernels_Config.h"3334#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)3536#include "ConvectionKernels_S3TC.h"3738#include "ConvectionKernels_AggregatedError.h"39#include "ConvectionKernels_BCCommon.h"40#include "ConvectionKernels_EndpointRefiner.h"41#include "ConvectionKernels_EndpointSelector.h"42#include "ConvectionKernels_IndexSelector.h"43#include "ConvectionKernels_UnfinishedEndpoints.h"44#include "ConvectionKernels_S3TC_SingleColor.h"4546void cvtt::Internal::S3TCComputer::Init(MFloat& error)47{48error = ParallelMath::MakeFloat(FLT_MAX);49}5051void cvtt::Internal::S3TCComputer::QuantizeTo6Bits(MUInt15& v)52{53MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(253)) + ParallelMath::MakeUInt16(512), 10));54v = (reduced << 2) | ParallelMath::RightShift(reduced, 4);55}5657void cvtt::Internal::S3TCComputer::QuantizeTo5Bits(MUInt15& v)58{59MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(249)) + ParallelMath::MakeUInt16(1024), 11));60v = (reduced << 3) | ParallelMath::RightShift(reduced, 2);61}6263void cvtt::Internal::S3TCComputer::QuantizeTo565(MUInt15 endPoint[3])64{65QuantizeTo5Bits(endPoint[0]);66QuantizeTo6Bits(endPoint[1]);67QuantizeTo5Bits(endPoint[2]);68}6970cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidFactorForSpan(const MSInt16& span)71{72return ParallelMath::Abs(ParallelMath::ToFloat(span)) * 0.03f;73}7475cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidDiff(const MUInt15& a, const MUInt15& b, const MFloat& d)76{77MFloat absDiff = ParallelMath::Abs(ParallelMath::ToFloat(ParallelMath::LosslessCast<MSInt16>::Cast(a) - ParallelMath::LosslessCast<MSInt16>::Cast(b)));78absDiff = absDiff + d;79return 
absDiff * absDiff;80}8182void cvtt::Internal::S3TCComputer::TestSingleColor(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], int range, const float* channelWeights,83MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, const ParallelMath::RoundTowardNearestForScope *rtn)84{85float channelWeightsSq[3];8687for (int ch = 0; ch < 3; ch++)88channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];8990MUInt15 totals[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };9192for (int px = 0; px < 16; px++)93{94for (int ch = 0; ch < 3; ch++)95totals[ch] = totals[ch] + pixels[px][ch];96}9798MUInt15 average[3];99for (int ch = 0; ch < 3; ch++)100average[ch] = ParallelMath::RightShift(totals[ch] + ParallelMath::MakeUInt15(8), 4);101102const Tables::S3TCSC::TableEntry* rbTable = NULL;103const Tables::S3TCSC::TableEntry* gTable = NULL;104if (flags & cvtt::Flags::S3TC_Paranoid)105{106if (range == 4)107{108rbTable = Tables::S3TCSC::g_singleColor5_3_p;109gTable = Tables::S3TCSC::g_singleColor6_3_p;110}111else112{113assert(range == 3);114rbTable = Tables::S3TCSC::g_singleColor5_2_p;115gTable = Tables::S3TCSC::g_singleColor6_2_p;116}117}118else119{120if (range == 4)121{122rbTable = Tables::S3TCSC::g_singleColor5_3;123gTable = Tables::S3TCSC::g_singleColor6_3;124}125else126{127assert(range == 3);128rbTable = Tables::S3TCSC::g_singleColor5_2;129gTable = Tables::S3TCSC::g_singleColor6_2;130}131}132133MUInt15 interpolated[3];134MUInt15 eps[2][3];135MSInt16 spans[3];136for (int i = 0; i < ParallelMath::ParallelSize; i++)137{138for (int ch = 0; ch < 3; ch++)139{140uint16_t avg = ParallelMath::Extract(average[ch], i);141const Tables::S3TCSC::TableEntry& tableEntry = ((ch == 1) ? 
gTable[avg] : rbTable[avg]);142ParallelMath::PutUInt15(eps[0][ch], i, tableEntry.m_min);143ParallelMath::PutUInt15(eps[1][ch], i, tableEntry.m_max);144ParallelMath::PutUInt15(interpolated[ch], i, tableEntry.m_actualColor);145ParallelMath::PutSInt16(spans[ch], i, tableEntry.m_span);146}147}148149MFloat error = ParallelMath::MakeFloatZero();150if (flags & cvtt::Flags::S3TC_Paranoid)151{152MFloat spanParanoidFactors[3];153for (int ch = 0; ch < 3; ch++)154spanParanoidFactors[ch] = ParanoidFactorForSpan(spans[ch]);155156for (int px = 0; px < 16; px++)157{158for (int ch = 0; ch < 3; ch++)159error = error + ParanoidDiff(interpolated[ch], pixels[px][ch], spanParanoidFactors[ch]) * channelWeightsSq[ch];160}161}162else163{164for (int px = 0; px < 16; px++)165{166for (int ch = 0; ch < 3; ch++)167error = error + ParallelMath::ToFloat(ParallelMath::SqDiffUInt8(interpolated[ch], pixels[px][ch])) * channelWeightsSq[ch];168}169}170171ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);172ParallelMath::Int16CompFlag better16 = ParallelMath::FloatFlagToInt16(better);173174if (ParallelMath::AnySet(better16))175{176bestError = ParallelMath::Min(bestError, error);177for (int epi = 0; epi < 2; epi++)178for (int ch = 0; ch < 3; ch++)179ParallelMath::ConditionalSet(bestEndpoints[epi][ch], better16, eps[epi][ch]);180181MUInt15 vindexes = ParallelMath::MakeUInt15(1);182for (int px = 0; px < 16; px++)183ParallelMath::ConditionalSet(bestIndexes[px], better16, vindexes);184185ParallelMath::ConditionalSet(bestRange, better16, ParallelMath::MakeUInt15(range));186}187}188189void cvtt::Internal::S3TCComputer::TestEndpoints(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], const MUInt15 unquantizedEndPoints[2][3], int range, const float* channelWeights,190MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, EndpointRefiner<3> *refiner, const 
ParallelMath::RoundTowardNearestForScope *rtn)191{192float channelWeightsSq[3];193194for (int ch = 0; ch < 3; ch++)195channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];196197MUInt15 endPoints[2][3];198199for (int ep = 0; ep < 2; ep++)200for (int ch = 0; ch < 3; ch++)201endPoints[ep][ch] = unquantizedEndPoints[ep][ch];202203QuantizeTo565(endPoints[0]);204QuantizeTo565(endPoints[1]);205206IndexSelector<3> selector;207selector.Init<false>(channelWeights, endPoints, range);208209MUInt15 indexes[16];210211MFloat paranoidFactors[3];212for (int ch = 0; ch < 3; ch++)213paranoidFactors[ch] = ParanoidFactorForSpan(ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[0][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[1][ch]));214215MFloat error = ParallelMath::MakeFloatZero();216AggregatedError<3> aggError;217for (int px = 0; px < 16; px++)218{219MUInt15 index = selector.SelectIndexLDR(floatPixels[px], rtn);220indexes[px] = index;221222if (refiner)223refiner->ContributeUnweightedPW(preWeightedPixels[px], index);224225MUInt15 reconstructed[3];226selector.ReconstructLDRPrecise(index, reconstructed);227228if (flags & Flags::S3TC_Paranoid)229{230for (int ch = 0; ch < 3; ch++)231error = error + ParanoidDiff(reconstructed[ch], pixels[px][ch], paranoidFactors[ch]) * channelWeightsSq[ch];232}233else234BCCommon::ComputeErrorLDR<3>(flags, reconstructed, pixels[px], aggError);235}236237if (!(flags & Flags::S3TC_Paranoid))238error = aggError.Finalize(flags, channelWeightsSq);239240ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);241242if (ParallelMath::AnySet(better))243{244ParallelMath::Int16CompFlag betterInt16 = ParallelMath::FloatFlagToInt16(better);245246ParallelMath::ConditionalSet(bestError, better, error);247248for (int ep = 0; ep < 2; ep++)249for (int ch = 0; ch < 3; ch++)250ParallelMath::ConditionalSet(bestEndpoints[ep][ch], betterInt16, endPoints[ep][ch]);251252for (int px = 0; px < 16; 
px++)253ParallelMath::ConditionalSet(bestIndexes[px], betterInt16, indexes[px]);254255ParallelMath::ConditionalSet(bestRange, betterInt16, ParallelMath::MakeUInt15(static_cast<uint16_t>(range)));256}257}258259void cvtt::Internal::S3TCComputer::TestCounts(uint32_t flags, const int *counts, int nCounts, const MUInt15 &numElements, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], bool alphaTest,260const MFloat floatSortedInputs[16][4], const MFloat preWeightedFloatSortedInputs[16][4], const float *channelWeights, MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange,261const ParallelMath::RoundTowardNearestForScope* rtn)262{263UNREFERENCED_PARAMETER(alphaTest);264UNREFERENCED_PARAMETER(flags);265266EndpointRefiner<3> refiner;267268refiner.Init(nCounts, channelWeights);269270bool escape = false;271int e = 0;272for (int i = 0; i < nCounts; i++)273{274for (int n = 0; n < counts[i]; n++)275{276ParallelMath::Int16CompFlag valid = ParallelMath::Less(ParallelMath::MakeUInt15(static_cast<uint16_t>(n)), numElements);277if (!ParallelMath::AnySet(valid))278{279escape = true;280break;281}282283if (ParallelMath::AllSet(valid))284refiner.ContributeUnweightedPW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)));285else286{287MFloat weight = ParallelMath::Select(ParallelMath::Int16FlagToFloat(valid), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(0.0f));288refiner.ContributePW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), weight);289}290}291292if (escape)293break;294}295296MUInt15 endPoints[2][3];297refiner.GetRefinedEndpointsLDR(endPoints, rtn);298299TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, nCounts, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, NULL, rtn);300}301302void cvtt::Internal::S3TCComputer::PackExplicitAlpha(uint32_t flags, const PixelBlockU8* inputs, 
int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride)303{304UNREFERENCED_PARAMETER(flags);305ParallelMath::RoundTowardNearestForScope rtn;306307float weights[1] = { 1.0f };308309MUInt15 pixels[16];310MFloat floatPixels[16];311312for (int px = 0; px < 16; px++)313{314ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);315floatPixels[px] = ParallelMath::ToFloat(pixels[px]);316}317318MUInt15 ep[2][1] = { { ParallelMath::MakeUInt15(0) },{ ParallelMath::MakeUInt15(255) } };319320IndexSelector<1> selector;321selector.Init<false>(weights, ep, 16);322323MUInt15 indexes[16];324325for (int px = 0; px < 16; px++)326indexes[px] = selector.SelectIndexLDR(&floatPixels[px], &rtn);327328for (int block = 0; block < ParallelMath::ParallelSize; block++)329{330for (int px = 0; px < 16; px += 2)331{332int index0 = ParallelMath::Extract(indexes[px], block);333int index1 = ParallelMath::Extract(indexes[px + 1], block);334335packedBlocks[px / 2] = static_cast<uint8_t>(index0 | (index1 << 4));336}337338packedBlocks += packedBlockStride;339}340}341342void cvtt::Internal::S3TCComputer::PackInterpolatedAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds)343{344if (maxTweakRounds < 1)345maxTweakRounds = 1;346347if (numRefineRounds < 1)348numRefineRounds = 1;349350ParallelMath::RoundTowardNearestForScope rtn;351352float oneWeight[1] = { 1.0f };353354MUInt15 pixels[16];355MFloat floatPixels[16];356357MUInt15 highTerminal = isSigned ? 
ParallelMath::MakeUInt15(254) : ParallelMath::MakeUInt15(255);358MUInt15 highTerminalMinusOne = highTerminal - ParallelMath::MakeUInt15(1);359360for (int px = 0; px < 16; px++)361{362ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);363364if (isSigned)365pixels[px] = ParallelMath::Min(pixels[px], highTerminal);366367floatPixels[px] = ParallelMath::ToFloat(pixels[px]);368}369370MUInt15 sortedPixels[16];371for (int px = 0; px < 16; px++)372sortedPixels[px] = pixels[px];373374for (int sortEnd = 15; sortEnd > 0; sortEnd--)375{376for (int sortOffset = 0; sortOffset < sortEnd; sortOffset++)377{378MUInt15 a = sortedPixels[sortOffset];379MUInt15 b = sortedPixels[sortOffset + 1];380381sortedPixels[sortOffset] = ParallelMath::Min(a, b);382sortedPixels[sortOffset + 1] = ParallelMath::Max(a, b);383}384}385386MUInt15 zero = ParallelMath::MakeUInt15(0);387MUInt15 one = ParallelMath::MakeUInt15(1);388389MUInt15 bestIsFullRange = zero;390MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);391MUInt15 bestEP[2] = { zero, zero };392MUInt15 bestIndexes[16] = {393zero, zero, zero, zero,394zero, zero, zero, zero,395zero, zero, zero, zero,396zero, zero, zero, zero397};398399// Full-precision400{401MUInt15 minEP = sortedPixels[0];402MUInt15 maxEP = sortedPixels[15];403404MFloat base[1] = { ParallelMath::ToFloat(minEP) };405MFloat offset[1] = { ParallelMath::ToFloat(maxEP - minEP) };406407UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);408409int numTweakRounds = BCCommon::TweakRoundsForRange(8);410if (numTweakRounds > maxTweakRounds)411numTweakRounds = maxTweakRounds;412413for (int tweak = 0; tweak < numTweakRounds; tweak++)414{415MUInt15 ep[2][1];416417ufep.FinishLDR(tweak, 8, ep[0], ep[1]);418419for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)420{421EndpointRefiner<1> refiner;422refiner.Init(8, oneWeight);423424if (isSigned)425for (int epi = 0; epi < 2; epi++)426ep[epi][0] = ParallelMath::Min(ep[epi][0], 
highTerminal);427428IndexSelector<1> indexSelector;429indexSelector.Init<false>(oneWeight, ep, 8);430431MUInt15 indexes[16];432433AggregatedError<1> aggError;434for (int px = 0; px < 16; px++)435{436MUInt15 index = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);437438MUInt15 reconstructedPixel;439440indexSelector.ReconstructLDRPrecise(index, &reconstructedPixel);441BCCommon::ComputeErrorLDR<1>(flags, &reconstructedPixel, &pixels[px], aggError);442443if (refinePass != numRefineRounds - 1)444refiner.ContributeUnweightedPW(&floatPixels[px], index);445446indexes[px] = index;447}448MFloat error = aggError.Finalize(flags | Flags::Uniform, oneWeight);449450ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);451ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);452453if (ParallelMath::AnySet(errorBetter16))454{455bestError = ParallelMath::Min(error, bestError);456ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, one);457for (int px = 0; px < 16; px++)458ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);459460for (int epi = 0; epi < 2; epi++)461ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);462}463464if (refinePass != numRefineRounds - 1)465refiner.GetRefinedEndpointsLDR(ep, &rtn);466}467}468}469470// Reduced precision with special endpoints471{472MUInt15 bestHeuristicMin = sortedPixels[0];473MUInt15 bestHeuristicMax = sortedPixels[15];474475ParallelMath::Int16CompFlag canTryClipping;476477// In reduced precision, we want try putting endpoints at the reserved indexes at the ends.478// The heuristic we use is to assign indexes to the end as long as they aren't off by more than half of the index range.479// This will usually not find anything, but it's cheap to check.480481{482MUInt15 largestPossibleRange = bestHeuristicMax - bestHeuristicMin; // Max: 255483MUInt15 lowestPossibleClearance = ParallelMath::Min(bestHeuristicMin, 
static_cast<MUInt15>(highTerminal - bestHeuristicMax));484485MUInt15 lowestPossibleClearanceTimes10 = (lowestPossibleClearance << 2) + (lowestPossibleClearance << 4);486canTryClipping = ParallelMath::LessOrEqual(lowestPossibleClearanceTimes10, largestPossibleRange);487}488489if (ParallelMath::AnySet(canTryClipping))490{491MUInt15 lowClearances[16];492MUInt15 highClearances[16];493MUInt15 bestSkipCount = ParallelMath::MakeUInt15(0);494495lowClearances[0] = highClearances[0] = ParallelMath::MakeUInt15(0);496497for (int px = 1; px < 16; px++)498{499lowClearances[px] = sortedPixels[px - 1];500highClearances[px] = highTerminal - sortedPixels[16 - px];501}502503for (uint16_t firstIndex = 0; firstIndex < 16; firstIndex++)504{505uint16_t numSkippedLow = firstIndex;506507MUInt15 lowClearance = lowClearances[firstIndex];508509for (uint16_t lastIndex = firstIndex; lastIndex < 16; lastIndex++)510{511uint16_t numSkippedHigh = 15 - lastIndex;512uint16_t numSkipped = numSkippedLow + numSkippedHigh;513514MUInt15 numSkippedV = ParallelMath::MakeUInt15(numSkipped);515516ParallelMath::Int16CompFlag areMoreSkipped = ParallelMath::Less(bestSkipCount, numSkippedV);517518if (!ParallelMath::AnySet(areMoreSkipped))519continue;520521MUInt15 clearance = ParallelMath::Max(highClearances[numSkippedHigh], lowClearance);522MUInt15 clearanceTimes10 = (clearance << 2) + (clearance << 4);523524MUInt15 range = sortedPixels[lastIndex] - sortedPixels[firstIndex];525526ParallelMath::Int16CompFlag isBetter = (areMoreSkipped & ParallelMath::LessOrEqual(clearanceTimes10, range));527ParallelMath::ConditionalSet(bestHeuristicMin, isBetter, sortedPixels[firstIndex]);528ParallelMath::ConditionalSet(bestHeuristicMax, isBetter, sortedPixels[lastIndex]);529}530}531}532533MUInt15 bestSimpleMin = one;534MUInt15 bestSimpleMax = highTerminalMinusOne;535536for (int px = 0; px < 16; px++)537{538ParallelMath::ConditionalSet(bestSimpleMin, ParallelMath::Less(zero, sortedPixels[15 - px]), sortedPixels[15 - 
px]);539ParallelMath::ConditionalSet(bestSimpleMax, ParallelMath::Less(sortedPixels[px], highTerminal), sortedPixels[px]);540}541542MUInt15 minEPs[2] = { bestSimpleMin, bestHeuristicMin };543MUInt15 maxEPs[2] = { bestSimpleMax, bestHeuristicMax };544545int minEPRange = 2;546if (ParallelMath::AllSet(ParallelMath::Equal(minEPs[0], minEPs[1])))547minEPRange = 1;548549int maxEPRange = 2;550if (ParallelMath::AllSet(ParallelMath::Equal(maxEPs[0], maxEPs[1])))551maxEPRange = 1;552553for (int minEPIndex = 0; minEPIndex < minEPRange; minEPIndex++)554{555for (int maxEPIndex = 0; maxEPIndex < maxEPRange; maxEPIndex++)556{557MFloat base[1] = { ParallelMath::ToFloat(minEPs[minEPIndex]) };558MFloat offset[1] = { ParallelMath::ToFloat(maxEPs[maxEPIndex] - minEPs[minEPIndex]) };559560UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);561562int numTweakRounds = BCCommon::TweakRoundsForRange(6);563if (numTweakRounds > maxTweakRounds)564numTweakRounds = maxTweakRounds;565566for (int tweak = 0; tweak < numTweakRounds; tweak++)567{568MUInt15 ep[2][1];569570ufep.FinishLDR(tweak, 8, ep[0], ep[1]);571572for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)573{574EndpointRefiner<1> refiner;575refiner.Init(6, oneWeight);576577if (isSigned)578for (int epi = 0; epi < 2; epi++)579ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);580581IndexSelector<1> indexSelector;582indexSelector.Init<false>(oneWeight, ep, 6);583584MUInt15 indexes[16];585MFloat error = ParallelMath::MakeFloatZero();586587for (int px = 0; px < 16; px++)588{589MUInt15 selectedIndex = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);590591MUInt15 reconstructedPixel;592593indexSelector.ReconstructLDRPrecise(selectedIndex, &reconstructedPixel);594595MFloat zeroError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &zero, &pixels[px], 1, oneWeight);596MFloat highTerminalError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &highTerminal, &pixels[px], 1, 
oneWeight);597MFloat selectedIndexError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &reconstructedPixel, &pixels[px], 1, oneWeight);598599MFloat bestPixelError = zeroError;600MUInt15 index = ParallelMath::MakeUInt15(6);601602ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(ParallelMath::Less(highTerminalError, bestPixelError)), ParallelMath::MakeUInt15(7));603bestPixelError = ParallelMath::Min(bestPixelError, highTerminalError);604605ParallelMath::FloatCompFlag selectedIndexBetter = ParallelMath::Less(selectedIndexError, bestPixelError);606607if (ParallelMath::AllSet(selectedIndexBetter))608{609if (refinePass != numRefineRounds - 1)610refiner.ContributeUnweightedPW(&floatPixels[px], selectedIndex);611}612else613{614MFloat refineWeight = ParallelMath::Select(selectedIndexBetter, ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloatZero());615616if (refinePass != numRefineRounds - 1)617refiner.ContributePW(&floatPixels[px], selectedIndex, refineWeight);618}619620ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(selectedIndexBetter), selectedIndex);621bestPixelError = ParallelMath::Min(bestPixelError, selectedIndexError);622623error = error + bestPixelError;624625indexes[px] = index;626}627628ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);629ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);630631if (ParallelMath::AnySet(errorBetter16))632{633bestError = ParallelMath::Min(error, bestError);634ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, zero);635for (int px = 0; px < 16; px++)636ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);637638for (int epi = 0; epi < 2; epi++)639ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);640}641642if (refinePass != numRefineRounds - 1)643refiner.GetRefinedEndpointsLDR(ep, &rtn);644}645}646}647}648}649650for (int block = 0; block < ParallelMath::ParallelSize; 
block++)651{652int ep0 = ParallelMath::Extract(bestEP[0], block);653int ep1 = ParallelMath::Extract(bestEP[1], block);654int isFullRange = ParallelMath::Extract(bestIsFullRange, block);655656if (isSigned)657{658ep0 -= 127;659ep1 -= 127;660661assert(ep0 >= -127 && ep0 <= 127);662assert(ep1 >= -127 && ep1 <= 127);663}664665666bool swapEndpoints = (isFullRange != 0) != (ep0 > ep1);667668if (swapEndpoints)669std::swap(ep0, ep1);670671uint16_t dumpBits = 0;672int dumpBitsOffset = 0;673int dumpByteOffset = 2;674packedBlocks[0] = static_cast<uint8_t>(ep0 & 0xff);675packedBlocks[1] = static_cast<uint8_t>(ep1 & 0xff);676677int maxValue = (isFullRange != 0) ? 7 : 5;678679for (int px = 0; px < 16; px++)680{681int index = ParallelMath::Extract(bestIndexes[px], block);682683if (swapEndpoints && index <= maxValue)684index = maxValue - index;685686if (index != 0)687{688if (index == maxValue)689index = 1;690else if (index < maxValue)691index++;692}693694assert(index >= 0 && index < 8);695696dumpBits |= static_cast<uint16_t>(index << dumpBitsOffset);697dumpBitsOffset += 3;698699if (dumpBitsOffset >= 8)700{701assert(dumpByteOffset < 8);702packedBlocks[dumpByteOffset] = static_cast<uint8_t>(dumpBits & 0xff);703dumpBits >>= 8;704dumpBitsOffset -= 8;705dumpByteOffset++;706}707}708709assert(dumpBitsOffset == 0);710assert(dumpByteOffset == 8);711712packedBlocks += packedBlockStride;713}714}715716void cvtt::Internal::S3TCComputer::PackRGB(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[4], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds)717{718ParallelMath::RoundTowardNearestForScope rtn;719720if (numRefineRounds < 1)721numRefineRounds = 1;722723if (maxTweakRounds < 1)724maxTweakRounds = 1;725726EndpointSelector<3, 8> endpointSelector;727728MUInt15 pixels[16][4];729MFloat floatPixels[16][4];730731MFloat preWeightedPixels[16][4];732733for (int px = 0; px < 16; 
px++)734{735for (int ch = 0; ch < 4; ch++)736ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);737}738739for (int px = 0; px < 16; px++)740{741for (int ch = 0; ch < 4; ch++)742floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);743}744745if (alphaTest)746{747MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(floor(alphaThreshold * 255.0f + 0.5f)));748749for (int px = 0; px < 16; px++)750{751ParallelMath::Int16CompFlag belowThreshold = ParallelMath::Less(pixels[px][3], threshold);752pixels[px][3] = ParallelMath::Select(belowThreshold, ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(255));753}754}755756BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);757758MUInt15 minAlpha = ParallelMath::MakeUInt15(255);759760for (int px = 0; px < 16; px++)761minAlpha = ParallelMath::Min(minAlpha, pixels[px][3]);762763MFloat pixelWeights[16];764for (int px = 0; px < 16; px++)765{766pixelWeights[px] = ParallelMath::MakeFloat(1.0f);767if (alphaTest)768{769ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));770771ParallelMath::ConditionalSet(pixelWeights[px], ParallelMath::Int16FlagToFloat(isTransparent), ParallelMath::MakeFloatZero());772}773}774775for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)776{777for (int px = 0; px < 16; px++)778endpointSelector.ContributePass(preWeightedPixels[px], pass, pixelWeights[px]);779780endpointSelector.FinishPass(pass);781}782783UnfinishedEndpoints<3> ufep = endpointSelector.GetEndpoints(channelWeights);784785MUInt15 bestEndpoints[2][3];786MUInt15 bestIndexes[16];787MUInt15 bestRange = ParallelMath::MakeUInt15(0);788MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);789790for (int px = 0; px < 16; px++)791bestIndexes[px] = ParallelMath::MakeUInt15(0);792793for (int ep = 0; ep < 2; ep++)794for (int ch = 0; ch < 3; ch++)795bestEndpoints[ep][ch] = ParallelMath::MakeUInt15(0);796797if (exhaustive)798{799MSInt16 
sortBins[16];800801{802// Compute an 11-bit index, change it to signed, stuff it in the high bits of the sort bins,803// and pack the original indexes into the low bits.804805MUInt15 sortEP[2][3];806ufep.FinishLDR(0, 11, sortEP[0], sortEP[1]);807808IndexSelector<3> sortSelector;809sortSelector.Init<false>(channelWeights, sortEP, 1 << 11);810811for (int16_t px = 0; px < 16; px++)812{813MSInt16 sortBin = ParallelMath::LosslessCast<MSInt16>::Cast(sortSelector.SelectIndexLDR(floatPixels[px], &rtn) << 4);814815if (alphaTest)816{817ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));818819ParallelMath::ConditionalSet(sortBin, isTransparent, ParallelMath::MakeSInt16(-16)); // 0xfff0820}821822sortBin = sortBin + ParallelMath::MakeSInt16(px);823824sortBins[px] = sortBin;825}826}827828// Sort bins829for (int sortEnd = 1; sortEnd < 16; sortEnd++)830{831for (int sortLoc = sortEnd; sortLoc > 0; sortLoc--)832{833MSInt16 a = sortBins[sortLoc];834MSInt16 b = sortBins[sortLoc - 1];835836sortBins[sortLoc] = ParallelMath::Max(a, b);837sortBins[sortLoc - 1] = ParallelMath::Min(a, b);838}839}840841MUInt15 firstElement = ParallelMath::MakeUInt15(0);842for (uint16_t e = 0; e < 16; e++)843{844ParallelMath::Int16CompFlag isInvalid = ParallelMath::Less(sortBins[e], ParallelMath::MakeSInt16(0));845ParallelMath::ConditionalSet(firstElement, isInvalid, ParallelMath::MakeUInt15(e + 1));846if (!ParallelMath::AnySet(isInvalid))847break;848}849850MUInt15 numElements = ParallelMath::MakeUInt15(16) - firstElement;851852MUInt15 sortedInputs[16][4];853MFloat floatSortedInputs[16][4];854MFloat pwFloatSortedInputs[16][4];855856for (int e = 0; e < 16; e++)857{858for (int ch = 0; ch < 4; ch++)859sortedInputs[e][ch] = ParallelMath::MakeUInt15(0);860}861862for (int block = 0; block < ParallelMath::ParallelSize; block++)863{864for (int e = ParallelMath::Extract(firstElement, block); e < 16; e++)865{866ParallelMath::ScalarUInt16 sortBin = 
ParallelMath::Extract(sortBins[e], block);867int originalIndex = (sortBin & 15);868869for (int ch = 0; ch < 4; ch++)870ParallelMath::PutUInt15(sortedInputs[15 - e][ch], block, ParallelMath::Extract(pixels[originalIndex][ch], block));871}872}873874for (int e = 0; e < 16; e++)875{876for (int ch = 0; ch < 4; ch++)877{878MFloat f = ParallelMath::ToFloat(sortedInputs[e][ch]);879floatSortedInputs[e][ch] = f;880pwFloatSortedInputs[e][ch] = f * channelWeights[ch];881}882}883884for (int n0 = 0; n0 <= 15; n0++)885{886int remainingFor1 = 16 - n0;887if (remainingFor1 == 16)888remainingFor1 = 15;889890for (int n1 = 0; n1 <= remainingFor1; n1++)891{892int remainingFor2 = 16 - n1 - n0;893if (remainingFor2 == 16)894remainingFor2 = 15;895896for (int n2 = 0; n2 <= remainingFor2; n2++)897{898int n3 = 16 - n2 - n1 - n0;899900if (n3 == 16)901continue;902903int counts[4] = { n0, n1, n2, n3 };904905TestCounts(flags, counts, 4, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);906}907}908}909910TestSingleColor(flags, pixels, floatPixels, 4, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);911912if (alphaTest)913{914for (int n0 = 0; n0 <= 15; n0++)915{916int remainingFor1 = 16 - n0;917if (remainingFor1 == 16)918remainingFor1 = 15;919920for (int n1 = 0; n1 <= remainingFor1; n1++)921{922int n2 = 16 - n1 - n0;923924if (n2 == 16)925continue;926927int counts[3] = { n0, n1, n2 };928929TestCounts(flags, counts, 3, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);930}931}932933TestSingleColor(flags, pixels, floatPixels, 3, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);934}935}936else937{938int minRange = alphaTest ? 
3 : 4;939940for (int range = minRange; range <= 4; range++)941{942int tweakRounds = BCCommon::TweakRoundsForRange(range);943if (tweakRounds > maxTweakRounds)944tweakRounds = maxTweakRounds;945946for (int tweak = 0; tweak < tweakRounds; tweak++)947{948MUInt15 endPoints[2][3];949950ufep.FinishLDR(tweak, range, endPoints[0], endPoints[1]);951952for (int refine = 0; refine < numRefineRounds; refine++)953{954EndpointRefiner<3> refiner;955refiner.Init(range, channelWeights);956957TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, range, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &refiner, &rtn);958959if (refine != numRefineRounds - 1)960refiner.GetRefinedEndpointsLDR(endPoints, &rtn);961}962}963}964}965966for (int block = 0; block < ParallelMath::ParallelSize; block++)967{968ParallelMath::ScalarUInt16 range = ParallelMath::Extract(bestRange, block);969assert(range == 3 || range == 4);970971ParallelMath::ScalarUInt16 compressedEP[2];972for (int ep = 0; ep < 2; ep++)973{974ParallelMath::ScalarUInt16 endPoint[3];975for (int ch = 0; ch < 3; ch++)976endPoint[ch] = ParallelMath::Extract(bestEndpoints[ep][ch], block);977978int compressed = (endPoint[0] & 0xf8) << 8;979compressed |= (endPoint[1] & 0xfc) << 3;980compressed |= (endPoint[2] & 0xf8) >> 3;981982compressedEP[ep] = static_cast<ParallelMath::ScalarUInt16>(compressed);983}984985int indexOrder[4];986987if (range == 4)988{989if (compressedEP[0] == compressedEP[1])990{991indexOrder[0] = 0;992indexOrder[1] = 0;993indexOrder[2] = 0;994indexOrder[3] = 0;995}996else if (compressedEP[0] < compressedEP[1])997{998std::swap(compressedEP[0], compressedEP[1]);999indexOrder[0] = 1;1000indexOrder[1] = 3;1001indexOrder[2] = 2;1002indexOrder[3] = 0;1003}1004else1005{1006indexOrder[0] = 0;1007indexOrder[1] = 2;1008indexOrder[2] = 3;1009indexOrder[3] = 1;1010}1011}1012else1013{1014assert(range == 3);10151016if (compressedEP[0] > compressedEP[1])1017{1018std::swap(compressedEP[0], 
compressedEP[1]);1019indexOrder[0] = 1;1020indexOrder[1] = 2;1021indexOrder[2] = 0;1022}1023else1024{1025indexOrder[0] = 0;1026indexOrder[1] = 2;1027indexOrder[2] = 1;1028}1029indexOrder[3] = 3;1030}10311032packedBlocks[0] = static_cast<uint8_t>(compressedEP[0] & 0xff);1033packedBlocks[1] = static_cast<uint8_t>((compressedEP[0] >> 8) & 0xff);1034packedBlocks[2] = static_cast<uint8_t>(compressedEP[1] & 0xff);1035packedBlocks[3] = static_cast<uint8_t>((compressedEP[1] >> 8) & 0xff);10361037for (int i = 0; i < 16; i += 4)1038{1039int packedIndexes = 0;1040for (int subi = 0; subi < 4; subi++)1041{1042ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[i + subi], block);1043packedIndexes |= (indexOrder[index] << (subi * 2));1044}10451046packedBlocks[4 + i / 4] = static_cast<uint8_t>(packedIndexes);1047}10481049packedBlocks += packedBlockStride;1050}1051}10521053#endif105410551056