419b342a9a
Make BC6 and BC7 CVTT faster while still having better quality than DXT5.
1054 lines
39 KiB
C++
1054 lines
39 KiB
C++
/*
Convection Texture Tools
Copyright (c) 2018-2019 Eric Lasota

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject
to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

-------------------------------------------------------------------------------------

Portions based on DirectX Texture Library (DirectXTex)

Copyright (c) Microsoft Corporation. All rights reserved.
Licensed under the MIT License.

http://go.microsoft.com/fwlink/?LinkId=248926
*/
|
|
#include "ConvectionKernels_Config.h"
|
|
|
|
#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
|
|
|
|
#include "ConvectionKernels_S3TC.h"
|
|
|
|
#include "ConvectionKernels_AggregatedError.h"
|
|
#include "ConvectionKernels_BCCommon.h"
|
|
#include "ConvectionKernels_EndpointRefiner.h"
|
|
#include "ConvectionKernels_EndpointSelector.h"
|
|
#include "ConvectionKernels_IndexSelector.h"
|
|
#include "ConvectionKernels_UnfinishedEndpoints.h"
|
|
#include "ConvectionKernels_S3TC_SingleColor.h"
|
|
|
|
// Resets a running "best error" accumulator to FLT_MAX in every lane, so that
// any smaller error computed later compares as an improvement.
void cvtt::Internal::S3TCComputer::Init(MFloat& error)
{
    MFloat worstError = ParallelMath::MakeFloat(FLT_MAX);
    error = worstError;
}
|
|
|
|
// Rounds a channel value to 6 bits and expands it back in place.
//
// The 6-bit code is (v * 253 + 512) >> 10.  It is then widened again by
// replicating its top bits into the low bits: (code << 2) | (code >> 4).
// NOTE(review): presumably v is an 8-bit value (0..255) - confirm with callers.
void cvtt::Internal::S3TCComputer::QuantizeTo6Bits(MUInt15& v)
{
    MUInt15 scale = ParallelMath::MakeUInt15(253);

    MUInt15 sixBitCode = ParallelMath::LosslessCast<MUInt15>::Cast(
        ParallelMath::RightShift(ParallelMath::CompactMultiply(v, scale) + ParallelMath::MakeUInt16(512), 10));

    MUInt15 highBits = (sixBitCode << 2);
    MUInt15 replicatedLowBits = ParallelMath::RightShift(sixBitCode, 4);

    v = highBits | replicatedLowBits;
}
|
|
|
|
// Rounds a channel value to 5 bits and expands it back in place.
//
// The 5-bit code is (v * 249 + 1024) >> 11.  It is then widened again by
// replicating its top bits into the low bits: (code << 3) | (code >> 2).
// NOTE(review): presumably v is an 8-bit value (0..255) - confirm with callers.
void cvtt::Internal::S3TCComputer::QuantizeTo5Bits(MUInt15& v)
{
    MUInt15 scale = ParallelMath::MakeUInt15(249);

    MUInt15 fiveBitCode = ParallelMath::LosslessCast<MUInt15>::Cast(
        ParallelMath::RightShift(ParallelMath::CompactMultiply(v, scale) + ParallelMath::MakeUInt16(1024), 11));

    MUInt15 highBits = (fiveBitCode << 3);
    MUInt15 replicatedLowBits = ParallelMath::RightShift(fiveBitCode, 2);

    v = highBits | replicatedLowBits;
}
|
|
|
|
// Quantizes a three-channel endpoint in place: channels 0 and 2 go through the
// 5-bit quantizer and channel 1 through the 6-bit quantizer.
void cvtt::Internal::S3TCComputer::QuantizeTo565(MUInt15 endPoint[3])
{
    for (int ch = 0; ch < 3; ch++)
    {
        if (ch == 1)
            QuantizeTo6Bits(endPoint[ch]);
        else
            QuantizeTo5Bits(endPoint[ch]);
    }
}
|
|
|
|
// Returns 3% of the absolute endpoint span, per lane.  Callers pass the result
// to ParanoidDiff as the extra margin added to each absolute difference.
cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidFactorForSpan(const MSInt16& span)
{
    MFloat spanMagnitude = ParallelMath::Abs(ParallelMath::ToFloat(span));
    return spanMagnitude * 0.03f;
}
|
|
|
|
// Computes (|a - b| + d)^2 per lane: the absolute difference between two
// channel values is padded by the margin d before being squared.
cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidDiff(const MUInt15& a, const MUInt15& b, const MFloat& d)
{
    // Subtract in signed 16-bit so that a < b does not wrap around.
    MSInt16 signedA = ParallelMath::LosslessCast<MSInt16>::Cast(a);
    MSInt16 signedB = ParallelMath::LosslessCast<MSInt16>::Cast(b);

    MFloat paddedDiff = ParallelMath::Abs(ParallelMath::ToFloat(signedA - signedB));
    paddedDiff = paddedDiff + d;

    return paddedDiff * paddedDiff;
}
|
|
|
|
void cvtt::Internal::S3TCComputer::TestSingleColor(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], int range, const float* channelWeights,
|
|
MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, const ParallelMath::RoundTowardNearestForScope *rtn)
|
|
{
|
|
float channelWeightsSq[3];
|
|
|
|
for (int ch = 0; ch < 3; ch++)
|
|
channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
|
|
|
|
MUInt15 totals[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
|
|
|
|
for (int px = 0; px < 16; px++)
|
|
{
|
|
for (int ch = 0; ch < 3; ch++)
|
|
totals[ch] = totals[ch] + pixels[px][ch];
|
|
}
|
|
|
|
MUInt15 average[3];
|
|
for (int ch = 0; ch < 3; ch++)
|
|
average[ch] = ParallelMath::RightShift(totals[ch] + ParallelMath::MakeUInt15(8), 4);
|
|
|
|
const Tables::S3TCSC::TableEntry* rbTable = NULL;
|
|
const Tables::S3TCSC::TableEntry* gTable = NULL;
|
|
if (flags & cvtt::Flags::S3TC_Paranoid)
|
|
{
|
|
if (range == 4)
|
|
{
|
|
rbTable = Tables::S3TCSC::g_singleColor5_3_p;
|
|
gTable = Tables::S3TCSC::g_singleColor6_3_p;
|
|
}
|
|
else
|
|
{
|
|
assert(range == 3);
|
|
rbTable = Tables::S3TCSC::g_singleColor5_2_p;
|
|
gTable = Tables::S3TCSC::g_singleColor6_2_p;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (range == 4)
|
|
{
|
|
rbTable = Tables::S3TCSC::g_singleColor5_3;
|
|
gTable = Tables::S3TCSC::g_singleColor6_3;
|
|
}
|
|
else
|
|
{
|
|
assert(range == 3);
|
|
rbTable = Tables::S3TCSC::g_singleColor5_2;
|
|
gTable = Tables::S3TCSC::g_singleColor6_2;
|
|
}
|
|
}
|
|
|
|
MUInt15 interpolated[3];
|
|
MUInt15 eps[2][3];
|
|
MSInt16 spans[3];
|
|
for (int i = 0; i < ParallelMath::ParallelSize; i++)
|
|
{
|
|
for (int ch = 0; ch < 3; ch++)
|
|
{
|
|
uint16_t avg = ParallelMath::Extract(average[ch], i);
|
|
const Tables::S3TCSC::TableEntry& tableEntry = ((ch == 1) ? gTable[avg] : rbTable[avg]);
|
|
ParallelMath::PutUInt15(eps[0][ch], i, tableEntry.m_min);
|
|
ParallelMath::PutUInt15(eps[1][ch], i, tableEntry.m_max);
|
|
ParallelMath::PutUInt15(interpolated[ch], i, tableEntry.m_actualColor);
|
|
ParallelMath::PutSInt16(spans[ch], i, tableEntry.m_span);
|
|
}
|
|
}
|
|
|
|
MFloat error = ParallelMath::MakeFloatZero();
|
|
if (flags & cvtt::Flags::S3TC_Paranoid)
|
|
{
|
|
MFloat spanParanoidFactors[3];
|
|
for (int ch = 0; ch < 3; ch++)
|
|
spanParanoidFactors[ch] = ParanoidFactorForSpan(spans[ch]);
|
|
|
|
for (int px = 0; px < 16; px++)
|
|
{
|
|
for (int ch = 0; ch < 3; ch++)
|
|
error = error + ParanoidDiff(interpolated[ch], pixels[px][ch], spanParanoidFactors[ch]) * channelWeightsSq[ch];
|
|
}
|
|
}
|
|
else
|
|
{
|
|
for (int px = 0; px < 16; px++)
|
|
{
|
|
for (int ch = 0; ch < 3; ch++)
|
|
error = error + ParallelMath::ToFloat(ParallelMath::SqDiffUInt8(interpolated[ch], pixels[px][ch])) * channelWeightsSq[ch];
|
|
}
|
|
}
|
|
|
|
ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
|
|
ParallelMath::Int16CompFlag better16 = ParallelMath::FloatFlagToInt16(better);
|
|
|
|
if (ParallelMath::AnySet(better16))
|
|
{
|
|
bestError = ParallelMath::Min(bestError, error);
|
|
for (int epi = 0; epi < 2; epi++)
|
|
for (int ch = 0; ch < 3; ch++)
|
|
ParallelMath::ConditionalSet(bestEndpoints[epi][ch], better16, eps[epi][ch]);
|
|
|
|
MUInt15 vindexes = ParallelMath::MakeUInt15(1);
|
|
for (int px = 0; px < 16; px++)
|
|
ParallelMath::ConditionalSet(bestIndexes[px], better16, vindexes);
|
|
|
|
ParallelMath::ConditionalSet(bestRange, better16, ParallelMath::MakeUInt15(range));
|
|
}
|
|
}
|
|
|
|
void cvtt::Internal::S3TCComputer::TestEndpoints(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], const MUInt15 unquantizedEndPoints[2][3], int range, const float* channelWeights,
|
|
MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, EndpointRefiner<3> *refiner, const ParallelMath::RoundTowardNearestForScope *rtn)
|
|
{
|
|
float channelWeightsSq[3];
|
|
|
|
for (int ch = 0; ch < 3; ch++)
|
|
channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
|
|
|
|
MUInt15 endPoints[2][3];
|
|
|
|
for (int ep = 0; ep < 2; ep++)
|
|
for (int ch = 0; ch < 3; ch++)
|
|
endPoints[ep][ch] = unquantizedEndPoints[ep][ch];
|
|
|
|
QuantizeTo565(endPoints[0]);
|
|
QuantizeTo565(endPoints[1]);
|
|
|
|
IndexSelector<3> selector;
|
|
selector.Init<false>(channelWeights, endPoints, range);
|
|
|
|
MUInt15 indexes[16];
|
|
|
|
MFloat paranoidFactors[3];
|
|
for (int ch = 0; ch < 3; ch++)
|
|
paranoidFactors[ch] = ParanoidFactorForSpan(ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[0][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[1][ch]));
|
|
|
|
MFloat error = ParallelMath::MakeFloatZero();
|
|
AggregatedError<3> aggError;
|
|
for (int px = 0; px < 16; px++)
|
|
{
|
|
MUInt15 index = selector.SelectIndexLDR(floatPixels[px], rtn);
|
|
indexes[px] = index;
|
|
|
|
if (refiner)
|
|
refiner->ContributeUnweightedPW(preWeightedPixels[px], index);
|
|
|
|
MUInt15 reconstructed[3];
|
|
selector.ReconstructLDRPrecise(index, reconstructed);
|
|
|
|
if (flags & Flags::S3TC_Paranoid)
|
|
{
|
|
for (int ch = 0; ch < 3; ch++)
|
|
error = error + ParanoidDiff(reconstructed[ch], pixels[px][ch], paranoidFactors[ch]) * channelWeightsSq[ch];
|
|
}
|
|
else
|
|
BCCommon::ComputeErrorLDR<3>(flags, reconstructed, pixels[px], aggError);
|
|
}
|
|
|
|
if (!(flags & Flags::S3TC_Paranoid))
|
|
error = aggError.Finalize(flags, channelWeightsSq);
|
|
|
|
ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
|
|
|
|
if (ParallelMath::AnySet(better))
|
|
{
|
|
ParallelMath::Int16CompFlag betterInt16 = ParallelMath::FloatFlagToInt16(better);
|
|
|
|
ParallelMath::ConditionalSet(bestError, better, error);
|
|
|
|
for (int ep = 0; ep < 2; ep++)
|
|
for (int ch = 0; ch < 3; ch++)
|
|
ParallelMath::ConditionalSet(bestEndpoints[ep][ch], betterInt16, endPoints[ep][ch]);
|
|
|
|
for (int px = 0; px < 16; px++)
|
|
ParallelMath::ConditionalSet(bestIndexes[px], betterInt16, indexes[px]);
|
|
|
|
ParallelMath::ConditionalSet(bestRange, betterInt16, ParallelMath::MakeUInt15(static_cast<uint16_t>(range)));
|
|
}
|
|
}
|
|
|
|
void cvtt::Internal::S3TCComputer::TestCounts(uint32_t flags, const int *counts, int nCounts, const MUInt15 &numElements, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], bool alphaTest,
|
|
const MFloat floatSortedInputs[16][4], const MFloat preWeightedFloatSortedInputs[16][4], const float *channelWeights, MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange,
|
|
const ParallelMath::RoundTowardNearestForScope* rtn)
|
|
{
|
|
UNREFERENCED_PARAMETER(alphaTest);
|
|
UNREFERENCED_PARAMETER(flags);
|
|
|
|
EndpointRefiner<3> refiner;
|
|
|
|
refiner.Init(nCounts, channelWeights);
|
|
|
|
bool escape = false;
|
|
int e = 0;
|
|
for (int i = 0; i < nCounts; i++)
|
|
{
|
|
for (int n = 0; n < counts[i]; n++)
|
|
{
|
|
ParallelMath::Int16CompFlag valid = ParallelMath::Less(ParallelMath::MakeUInt15(static_cast<uint16_t>(n)), numElements);
|
|
if (!ParallelMath::AnySet(valid))
|
|
{
|
|
escape = true;
|
|
break;
|
|
}
|
|
|
|
if (ParallelMath::AllSet(valid))
|
|
refiner.ContributeUnweightedPW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)));
|
|
else
|
|
{
|
|
MFloat weight = ParallelMath::Select(ParallelMath::Int16FlagToFloat(valid), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(0.0f));
|
|
refiner.ContributePW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), weight);
|
|
}
|
|
}
|
|
|
|
if (escape)
|
|
break;
|
|
}
|
|
|
|
MUInt15 endPoints[2][3];
|
|
refiner.GetRefinedEndpointsLDR(endPoints, rtn);
|
|
|
|
TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, nCounts, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, NULL, rtn);
|
|
}
|
|
|
|
void cvtt::Internal::S3TCComputer::PackExplicitAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride)
|
|
{
|
|
UNREFERENCED_PARAMETER(flags);
|
|
ParallelMath::RoundTowardNearestForScope rtn;
|
|
|
|
float weights[1] = { 1.0f };
|
|
|
|
MUInt15 pixels[16];
|
|
MFloat floatPixels[16];
|
|
|
|
for (int px = 0; px < 16; px++)
|
|
{
|
|
ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
|
|
floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
|
|
}
|
|
|
|
MUInt15 ep[2][1] = { { ParallelMath::MakeUInt15(0) },{ ParallelMath::MakeUInt15(255) } };
|
|
|
|
IndexSelector<1> selector;
|
|
selector.Init<false>(weights, ep, 16);
|
|
|
|
MUInt15 indexes[16];
|
|
|
|
for (int px = 0; px < 16; px++)
|
|
indexes[px] = selector.SelectIndexLDR(&floatPixels[px], &rtn);
|
|
|
|
for (int block = 0; block < ParallelMath::ParallelSize; block++)
|
|
{
|
|
for (int px = 0; px < 16; px += 2)
|
|
{
|
|
int index0 = ParallelMath::Extract(indexes[px], block);
|
|
int index1 = ParallelMath::Extract(indexes[px + 1], block);
|
|
|
|
packedBlocks[px / 2] = static_cast<uint8_t>(index0 | (index1 << 4));
|
|
}
|
|
|
|
packedBlocks += packedBlockStride;
|
|
}
|
|
}
|
|
|
|
void cvtt::Internal::S3TCComputer::PackInterpolatedAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds)
|
|
{
|
|
if (maxTweakRounds < 1)
|
|
maxTweakRounds = 1;
|
|
|
|
if (numRefineRounds < 1)
|
|
numRefineRounds = 1;
|
|
|
|
ParallelMath::RoundTowardNearestForScope rtn;
|
|
|
|
float oneWeight[1] = { 1.0f };
|
|
|
|
MUInt15 pixels[16];
|
|
MFloat floatPixels[16];
|
|
|
|
MUInt15 highTerminal = isSigned ? ParallelMath::MakeUInt15(254) : ParallelMath::MakeUInt15(255);
|
|
MUInt15 highTerminalMinusOne = highTerminal - ParallelMath::MakeUInt15(1);
|
|
|
|
for (int px = 0; px < 16; px++)
|
|
{
|
|
ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
|
|
|
|
if (isSigned)
|
|
pixels[px] = ParallelMath::Min(pixels[px], highTerminal);
|
|
|
|
floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
|
|
}
|
|
|
|
MUInt15 sortedPixels[16];
|
|
for (int px = 0; px < 16; px++)
|
|
sortedPixels[px] = pixels[px];
|
|
|
|
for (int sortEnd = 15; sortEnd > 0; sortEnd--)
|
|
{
|
|
for (int sortOffset = 0; sortOffset < sortEnd; sortOffset++)
|
|
{
|
|
MUInt15 a = sortedPixels[sortOffset];
|
|
MUInt15 b = sortedPixels[sortOffset + 1];
|
|
|
|
sortedPixels[sortOffset] = ParallelMath::Min(a, b);
|
|
sortedPixels[sortOffset + 1] = ParallelMath::Max(a, b);
|
|
}
|
|
}
|
|
|
|
MUInt15 zero = ParallelMath::MakeUInt15(0);
|
|
MUInt15 one = ParallelMath::MakeUInt15(1);
|
|
|
|
MUInt15 bestIsFullRange = zero;
|
|
MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
|
|
MUInt15 bestEP[2] = { zero, zero };
|
|
MUInt15 bestIndexes[16] = {
|
|
zero, zero, zero, zero,
|
|
zero, zero, zero, zero,
|
|
zero, zero, zero, zero,
|
|
zero, zero, zero, zero
|
|
};
|
|
|
|
// Full-precision
|
|
{
|
|
MUInt15 minEP = sortedPixels[0];
|
|
MUInt15 maxEP = sortedPixels[15];
|
|
|
|
MFloat base[1] = { ParallelMath::ToFloat(minEP) };
|
|
MFloat offset[1] = { ParallelMath::ToFloat(maxEP - minEP) };
|
|
|
|
UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
|
|
|
|
int numTweakRounds = BCCommon::TweakRoundsForRange(8);
|
|
if (numTweakRounds > maxTweakRounds)
|
|
numTweakRounds = maxTweakRounds;
|
|
|
|
for (int tweak = 0; tweak < numTweakRounds; tweak++)
|
|
{
|
|
MUInt15 ep[2][1];
|
|
|
|
ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
|
|
|
|
for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
|
|
{
|
|
EndpointRefiner<1> refiner;
|
|
refiner.Init(8, oneWeight);
|
|
|
|
if (isSigned)
|
|
for (int epi = 0; epi < 2; epi++)
|
|
ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
|
|
|
|
IndexSelector<1> indexSelector;
|
|
indexSelector.Init<false>(oneWeight, ep, 8);
|
|
|
|
MUInt15 indexes[16];
|
|
|
|
AggregatedError<1> aggError;
|
|
for (int px = 0; px < 16; px++)
|
|
{
|
|
MUInt15 index = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
|
|
|
|
MUInt15 reconstructedPixel;
|
|
|
|
indexSelector.ReconstructLDRPrecise(index, &reconstructedPixel);
|
|
BCCommon::ComputeErrorLDR<1>(flags, &reconstructedPixel, &pixels[px], aggError);
|
|
|
|
if (refinePass != numRefineRounds - 1)
|
|
refiner.ContributeUnweightedPW(&floatPixels[px], index);
|
|
|
|
indexes[px] = index;
|
|
}
|
|
MFloat error = aggError.Finalize(flags | Flags::Uniform, oneWeight);
|
|
|
|
ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
|
|
ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
|
|
|
|
if (ParallelMath::AnySet(errorBetter16))
|
|
{
|
|
bestError = ParallelMath::Min(error, bestError);
|
|
ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, one);
|
|
for (int px = 0; px < 16; px++)
|
|
ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
|
|
|
|
for (int epi = 0; epi < 2; epi++)
|
|
ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
|
|
}
|
|
|
|
if (refinePass != numRefineRounds - 1)
|
|
refiner.GetRefinedEndpointsLDR(ep, &rtn);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Reduced precision with special endpoints
|
|
{
|
|
MUInt15 bestHeuristicMin = sortedPixels[0];
|
|
MUInt15 bestHeuristicMax = sortedPixels[15];
|
|
|
|
ParallelMath::Int16CompFlag canTryClipping;
|
|
|
|
// In reduced precision, we want try putting endpoints at the reserved indexes at the ends.
|
|
// The heuristic we use is to assign indexes to the end as long as they aren't off by more than half of the index range.
|
|
// This will usually not find anything, but it's cheap to check.
|
|
|
|
{
|
|
MUInt15 largestPossibleRange = bestHeuristicMax - bestHeuristicMin; // Max: 255
|
|
MUInt15 lowestPossibleClearance = ParallelMath::Min(bestHeuristicMin, static_cast<MUInt15>(highTerminal - bestHeuristicMax));
|
|
|
|
MUInt15 lowestPossibleClearanceTimes10 = (lowestPossibleClearance << 2) + (lowestPossibleClearance << 4);
|
|
canTryClipping = ParallelMath::LessOrEqual(lowestPossibleClearanceTimes10, largestPossibleRange);
|
|
}
|
|
|
|
if (ParallelMath::AnySet(canTryClipping))
|
|
{
|
|
MUInt15 lowClearances[16];
|
|
MUInt15 highClearances[16];
|
|
MUInt15 bestSkipCount = ParallelMath::MakeUInt15(0);
|
|
|
|
lowClearances[0] = highClearances[0] = ParallelMath::MakeUInt15(0);
|
|
|
|
for (int px = 1; px < 16; px++)
|
|
{
|
|
lowClearances[px] = sortedPixels[px - 1];
|
|
highClearances[px] = highTerminal - sortedPixels[16 - px];
|
|
}
|
|
|
|
for (uint16_t firstIndex = 0; firstIndex < 16; firstIndex++)
|
|
{
|
|
uint16_t numSkippedLow = firstIndex;
|
|
|
|
MUInt15 lowClearance = lowClearances[firstIndex];
|
|
|
|
for (uint16_t lastIndex = firstIndex; lastIndex < 16; lastIndex++)
|
|
{
|
|
uint16_t numSkippedHigh = 15 - lastIndex;
|
|
uint16_t numSkipped = numSkippedLow + numSkippedHigh;
|
|
|
|
MUInt15 numSkippedV = ParallelMath::MakeUInt15(numSkipped);
|
|
|
|
ParallelMath::Int16CompFlag areMoreSkipped = ParallelMath::Less(bestSkipCount, numSkippedV);
|
|
|
|
if (!ParallelMath::AnySet(areMoreSkipped))
|
|
continue;
|
|
|
|
MUInt15 clearance = ParallelMath::Max(highClearances[numSkippedHigh], lowClearance);
|
|
MUInt15 clearanceTimes10 = (clearance << 2) + (clearance << 4);
|
|
|
|
MUInt15 range = sortedPixels[lastIndex] - sortedPixels[firstIndex];
|
|
|
|
ParallelMath::Int16CompFlag isBetter = (areMoreSkipped & ParallelMath::LessOrEqual(clearanceTimes10, range));
|
|
ParallelMath::ConditionalSet(bestHeuristicMin, isBetter, sortedPixels[firstIndex]);
|
|
ParallelMath::ConditionalSet(bestHeuristicMax, isBetter, sortedPixels[lastIndex]);
|
|
}
|
|
}
|
|
}
|
|
|
|
MUInt15 bestSimpleMin = one;
|
|
MUInt15 bestSimpleMax = highTerminalMinusOne;
|
|
|
|
for (int px = 0; px < 16; px++)
|
|
{
|
|
ParallelMath::ConditionalSet(bestSimpleMin, ParallelMath::Less(zero, sortedPixels[15 - px]), sortedPixels[15 - px]);
|
|
ParallelMath::ConditionalSet(bestSimpleMax, ParallelMath::Less(sortedPixels[px], highTerminal), sortedPixels[px]);
|
|
}
|
|
|
|
MUInt15 minEPs[2] = { bestSimpleMin, bestHeuristicMin };
|
|
MUInt15 maxEPs[2] = { bestSimpleMax, bestHeuristicMax };
|
|
|
|
int minEPRange = 2;
|
|
if (ParallelMath::AllSet(ParallelMath::Equal(minEPs[0], minEPs[1])))
|
|
minEPRange = 1;
|
|
|
|
int maxEPRange = 2;
|
|
if (ParallelMath::AllSet(ParallelMath::Equal(maxEPs[0], maxEPs[1])))
|
|
maxEPRange = 1;
|
|
|
|
for (int minEPIndex = 0; minEPIndex < minEPRange; minEPIndex++)
|
|
{
|
|
for (int maxEPIndex = 0; maxEPIndex < maxEPRange; maxEPIndex++)
|
|
{
|
|
MFloat base[1] = { ParallelMath::ToFloat(minEPs[minEPIndex]) };
|
|
MFloat offset[1] = { ParallelMath::ToFloat(maxEPs[maxEPIndex] - minEPs[minEPIndex]) };
|
|
|
|
UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
|
|
|
|
int numTweakRounds = BCCommon::TweakRoundsForRange(6);
|
|
if (numTweakRounds > maxTweakRounds)
|
|
numTweakRounds = maxTweakRounds;
|
|
|
|
for (int tweak = 0; tweak < numTweakRounds; tweak++)
|
|
{
|
|
MUInt15 ep[2][1];
|
|
|
|
ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
|
|
|
|
for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
|
|
{
|
|
EndpointRefiner<1> refiner;
|
|
refiner.Init(6, oneWeight);
|
|
|
|
if (isSigned)
|
|
for (int epi = 0; epi < 2; epi++)
|
|
ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
|
|
|
|
IndexSelector<1> indexSelector;
|
|
indexSelector.Init<false>(oneWeight, ep, 6);
|
|
|
|
MUInt15 indexes[16];
|
|
MFloat error = ParallelMath::MakeFloatZero();
|
|
|
|
for (int px = 0; px < 16; px++)
|
|
{
|
|
MUInt15 selectedIndex = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
|
|
|
|
MUInt15 reconstructedPixel;
|
|
|
|
indexSelector.ReconstructLDRPrecise(selectedIndex, &reconstructedPixel);
|
|
|
|
MFloat zeroError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &zero, &pixels[px], 1, oneWeight);
|
|
MFloat highTerminalError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &highTerminal, &pixels[px], 1, oneWeight);
|
|
MFloat selectedIndexError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &reconstructedPixel, &pixels[px], 1, oneWeight);
|
|
|
|
MFloat bestPixelError = zeroError;
|
|
MUInt15 index = ParallelMath::MakeUInt15(6);
|
|
|
|
ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(ParallelMath::Less(highTerminalError, bestPixelError)), ParallelMath::MakeUInt15(7));
|
|
bestPixelError = ParallelMath::Min(bestPixelError, highTerminalError);
|
|
|
|
ParallelMath::FloatCompFlag selectedIndexBetter = ParallelMath::Less(selectedIndexError, bestPixelError);
|
|
|
|
if (ParallelMath::AllSet(selectedIndexBetter))
|
|
{
|
|
if (refinePass != numRefineRounds - 1)
|
|
refiner.ContributeUnweightedPW(&floatPixels[px], selectedIndex);
|
|
}
|
|
else
|
|
{
|
|
MFloat refineWeight = ParallelMath::Select(selectedIndexBetter, ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloatZero());
|
|
|
|
if (refinePass != numRefineRounds - 1)
|
|
refiner.ContributePW(&floatPixels[px], selectedIndex, refineWeight);
|
|
}
|
|
|
|
ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(selectedIndexBetter), selectedIndex);
|
|
bestPixelError = ParallelMath::Min(bestPixelError, selectedIndexError);
|
|
|
|
error = error + bestPixelError;
|
|
|
|
indexes[px] = index;
|
|
}
|
|
|
|
ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
|
|
ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
|
|
|
|
if (ParallelMath::AnySet(errorBetter16))
|
|
{
|
|
bestError = ParallelMath::Min(error, bestError);
|
|
ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, zero);
|
|
for (int px = 0; px < 16; px++)
|
|
ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
|
|
|
|
for (int epi = 0; epi < 2; epi++)
|
|
ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
|
|
}
|
|
|
|
if (refinePass != numRefineRounds - 1)
|
|
refiner.GetRefinedEndpointsLDR(ep, &rtn);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (int block = 0; block < ParallelMath::ParallelSize; block++)
|
|
{
|
|
int ep0 = ParallelMath::Extract(bestEP[0], block);
|
|
int ep1 = ParallelMath::Extract(bestEP[1], block);
|
|
int isFullRange = ParallelMath::Extract(bestIsFullRange, block);
|
|
|
|
if (isSigned)
|
|
{
|
|
ep0 -= 127;
|
|
ep1 -= 127;
|
|
|
|
assert(ep0 >= -127 && ep0 <= 127);
|
|
assert(ep1 >= -127 && ep1 <= 127);
|
|
}
|
|
|
|
|
|
bool swapEndpoints = (isFullRange != 0) != (ep0 > ep1);
|
|
|
|
if (swapEndpoints)
|
|
std::swap(ep0, ep1);
|
|
|
|
uint16_t dumpBits = 0;
|
|
int dumpBitsOffset = 0;
|
|
int dumpByteOffset = 2;
|
|
packedBlocks[0] = static_cast<uint8_t>(ep0 & 0xff);
|
|
packedBlocks[1] = static_cast<uint8_t>(ep1 & 0xff);
|
|
|
|
int maxValue = (isFullRange != 0) ? 7 : 5;
|
|
|
|
for (int px = 0; px < 16; px++)
|
|
{
|
|
int index = ParallelMath::Extract(bestIndexes[px], block);
|
|
|
|
if (swapEndpoints && index <= maxValue)
|
|
index = maxValue - index;
|
|
|
|
if (index != 0)
|
|
{
|
|
if (index == maxValue)
|
|
index = 1;
|
|
else if (index < maxValue)
|
|
index++;
|
|
}
|
|
|
|
assert(index >= 0 && index < 8);
|
|
|
|
dumpBits |= static_cast<uint16_t>(index << dumpBitsOffset);
|
|
dumpBitsOffset += 3;
|
|
|
|
if (dumpBitsOffset >= 8)
|
|
{
|
|
assert(dumpByteOffset < 8);
|
|
packedBlocks[dumpByteOffset] = static_cast<uint8_t>(dumpBits & 0xff);
|
|
dumpBits >>= 8;
|
|
dumpBitsOffset -= 8;
|
|
dumpByteOffset++;
|
|
}
|
|
}
|
|
|
|
assert(dumpBitsOffset == 0);
|
|
assert(dumpByteOffset == 8);
|
|
|
|
packedBlocks += packedBlockStride;
|
|
}
|
|
}
|
|
|
|
void cvtt::Internal::S3TCComputer::PackRGB(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[4], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds)
|
|
{
|
|
ParallelMath::RoundTowardNearestForScope rtn;
|
|
|
|
if (numRefineRounds < 1)
|
|
numRefineRounds = 1;
|
|
|
|
if (maxTweakRounds < 1)
|
|
maxTweakRounds = 1;
|
|
|
|
EndpointSelector<3, 8> endpointSelector;
|
|
|
|
MUInt15 pixels[16][4];
|
|
MFloat floatPixels[16][4];
|
|
|
|
MFloat preWeightedPixels[16][4];
|
|
|
|
for (int px = 0; px < 16; px++)
|
|
{
|
|
for (int ch = 0; ch < 4; ch++)
|
|
ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
|
|
}
|
|
|
|
for (int px = 0; px < 16; px++)
|
|
{
|
|
for (int ch = 0; ch < 4; ch++)
|
|
floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
|
|
}
|
|
|
|
if (alphaTest)
|
|
{
|
|
MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(floor(alphaThreshold * 255.0f + 0.5f)));
|
|
|
|
for (int px = 0; px < 16; px++)
|
|
{
|
|
ParallelMath::Int16CompFlag belowThreshold = ParallelMath::Less(pixels[px][3], threshold);
|
|
pixels[px][3] = ParallelMath::Select(belowThreshold, ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(255));
|
|
}
|
|
}
|
|
|
|
BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);
|
|
|
|
MUInt15 minAlpha = ParallelMath::MakeUInt15(255);
|
|
|
|
for (int px = 0; px < 16; px++)
|
|
minAlpha = ParallelMath::Min(minAlpha, pixels[px][3]);
|
|
|
|
MFloat pixelWeights[16];
|
|
for (int px = 0; px < 16; px++)
|
|
{
|
|
pixelWeights[px] = ParallelMath::MakeFloat(1.0f);
|
|
if (alphaTest)
|
|
{
|
|
ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
|
|
|
|
ParallelMath::ConditionalSet(pixelWeights[px], ParallelMath::Int16FlagToFloat(isTransparent), ParallelMath::MakeFloatZero());
|
|
}
|
|
}
|
|
|
|
for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
|
|
{
|
|
for (int px = 0; px < 16; px++)
|
|
endpointSelector.ContributePass(preWeightedPixels[px], pass, pixelWeights[px]);
|
|
|
|
endpointSelector.FinishPass(pass);
|
|
}
|
|
|
|
UnfinishedEndpoints<3> ufep = endpointSelector.GetEndpoints(channelWeights);
|
|
|
|
MUInt15 bestEndpoints[2][3];
|
|
MUInt15 bestIndexes[16];
|
|
MUInt15 bestRange = ParallelMath::MakeUInt15(0);
|
|
MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
|
|
|
|
for (int px = 0; px < 16; px++)
|
|
bestIndexes[px] = ParallelMath::MakeUInt15(0);
|
|
|
|
for (int ep = 0; ep < 2; ep++)
|
|
for (int ch = 0; ch < 3; ch++)
|
|
bestEndpoints[ep][ch] = ParallelMath::MakeUInt15(0);
|
|
|
|
if (exhaustive)
|
|
{
|
|
MSInt16 sortBins[16];
|
|
|
|
{
|
|
// Compute an 11-bit index, change it to signed, stuff it in the high bits of the sort bins,
|
|
// and pack the original indexes into the low bits.
|
|
|
|
MUInt15 sortEP[2][3];
|
|
ufep.FinishLDR(0, 11, sortEP[0], sortEP[1]);
|
|
|
|
IndexSelector<3> sortSelector;
|
|
sortSelector.Init<false>(channelWeights, sortEP, 1 << 11);
|
|
|
|
for (int16_t px = 0; px < 16; px++)
|
|
{
|
|
MSInt16 sortBin = ParallelMath::LosslessCast<MSInt16>::Cast(sortSelector.SelectIndexLDR(floatPixels[px], &rtn) << 4);
|
|
|
|
if (alphaTest)
|
|
{
|
|
ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
|
|
|
|
ParallelMath::ConditionalSet(sortBin, isTransparent, ParallelMath::MakeSInt16(-16)); // 0xfff0
|
|
}
|
|
|
|
sortBin = sortBin + ParallelMath::MakeSInt16(px);
|
|
|
|
sortBins[px] = sortBin;
|
|
}
|
|
}
|
|
|
|
// Sort bins
|
|
for (int sortEnd = 1; sortEnd < 16; sortEnd++)
|
|
{
|
|
for (int sortLoc = sortEnd; sortLoc > 0; sortLoc--)
|
|
{
|
|
MSInt16 a = sortBins[sortLoc];
|
|
MSInt16 b = sortBins[sortLoc - 1];
|
|
|
|
sortBins[sortLoc] = ParallelMath::Max(a, b);
|
|
sortBins[sortLoc - 1] = ParallelMath::Min(a, b);
|
|
}
|
|
}
|
|
|
|
MUInt15 firstElement = ParallelMath::MakeUInt15(0);
|
|
for (uint16_t e = 0; e < 16; e++)
|
|
{
|
|
ParallelMath::Int16CompFlag isInvalid = ParallelMath::Less(sortBins[e], ParallelMath::MakeSInt16(0));
|
|
ParallelMath::ConditionalSet(firstElement, isInvalid, ParallelMath::MakeUInt15(e + 1));
|
|
if (!ParallelMath::AnySet(isInvalid))
|
|
break;
|
|
}
|
|
|
|
MUInt15 numElements = ParallelMath::MakeUInt15(16) - firstElement;
|
|
|
|
MUInt15 sortedInputs[16][4];
|
|
MFloat floatSortedInputs[16][4];
|
|
MFloat pwFloatSortedInputs[16][4];
|
|
|
|
for (int e = 0; e < 16; e++)
|
|
{
|
|
for (int ch = 0; ch < 4; ch++)
|
|
sortedInputs[e][ch] = ParallelMath::MakeUInt15(0);
|
|
}
|
|
|
|
for (int block = 0; block < ParallelMath::ParallelSize; block++)
|
|
{
|
|
for (int e = ParallelMath::Extract(firstElement, block); e < 16; e++)
|
|
{
|
|
ParallelMath::ScalarUInt16 sortBin = ParallelMath::Extract(sortBins[e], block);
|
|
int originalIndex = (sortBin & 15);
|
|
|
|
for (int ch = 0; ch < 4; ch++)
|
|
ParallelMath::PutUInt15(sortedInputs[15 - e][ch], block, ParallelMath::Extract(pixels[originalIndex][ch], block));
|
|
}
|
|
}
|
|
|
|
for (int e = 0; e < 16; e++)
|
|
{
|
|
for (int ch = 0; ch < 4; ch++)
|
|
{
|
|
MFloat f = ParallelMath::ToFloat(sortedInputs[e][ch]);
|
|
floatSortedInputs[e][ch] = f;
|
|
pwFloatSortedInputs[e][ch] = f * channelWeights[ch];
|
|
}
|
|
}
|
|
|
|
for (int n0 = 0; n0 <= 15; n0++)
|
|
{
|
|
int remainingFor1 = 16 - n0;
|
|
if (remainingFor1 == 16)
|
|
remainingFor1 = 15;
|
|
|
|
for (int n1 = 0; n1 <= remainingFor1; n1++)
|
|
{
|
|
int remainingFor2 = 16 - n1 - n0;
|
|
if (remainingFor2 == 16)
|
|
remainingFor2 = 15;
|
|
|
|
for (int n2 = 0; n2 <= remainingFor2; n2++)
|
|
{
|
|
int n3 = 16 - n2 - n1 - n0;
|
|
|
|
if (n3 == 16)
|
|
continue;
|
|
|
|
int counts[4] = { n0, n1, n2, n3 };
|
|
|
|
TestCounts(flags, counts, 4, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
|
|
}
|
|
}
|
|
}
|
|
|
|
TestSingleColor(flags, pixels, floatPixels, 4, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
|
|
|
|
if (alphaTest)
|
|
{
|
|
for (int n0 = 0; n0 <= 15; n0++)
|
|
{
|
|
int remainingFor1 = 16 - n0;
|
|
if (remainingFor1 == 16)
|
|
remainingFor1 = 15;
|
|
|
|
for (int n1 = 0; n1 <= remainingFor1; n1++)
|
|
{
|
|
int n2 = 16 - n1 - n0;
|
|
|
|
if (n2 == 16)
|
|
continue;
|
|
|
|
int counts[3] = { n0, n1, n2 };
|
|
|
|
TestCounts(flags, counts, 3, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
|
|
}
|
|
}
|
|
|
|
TestSingleColor(flags, pixels, floatPixels, 3, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
int minRange = alphaTest ? 3 : 4;
|
|
|
|
for (int range = minRange; range <= 4; range++)
|
|
{
|
|
int tweakRounds = BCCommon::TweakRoundsForRange(range);
|
|
if (tweakRounds > maxTweakRounds)
|
|
tweakRounds = maxTweakRounds;
|
|
|
|
for (int tweak = 0; tweak < tweakRounds; tweak++)
|
|
{
|
|
MUInt15 endPoints[2][3];
|
|
|
|
ufep.FinishLDR(tweak, range, endPoints[0], endPoints[1]);
|
|
|
|
for (int refine = 0; refine < numRefineRounds; refine++)
|
|
{
|
|
EndpointRefiner<3> refiner;
|
|
refiner.Init(range, channelWeights);
|
|
|
|
TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, range, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &refiner, &rtn);
|
|
|
|
if (refine != numRefineRounds - 1)
|
|
refiner.GetRefinedEndpointsLDR(endPoints, &rtn);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (int block = 0; block < ParallelMath::ParallelSize; block++)
|
|
{
|
|
ParallelMath::ScalarUInt16 range = ParallelMath::Extract(bestRange, block);
|
|
assert(range == 3 || range == 4);
|
|
|
|
ParallelMath::ScalarUInt16 compressedEP[2];
|
|
for (int ep = 0; ep < 2; ep++)
|
|
{
|
|
ParallelMath::ScalarUInt16 endPoint[3];
|
|
for (int ch = 0; ch < 3; ch++)
|
|
endPoint[ch] = ParallelMath::Extract(bestEndpoints[ep][ch], block);
|
|
|
|
int compressed = (endPoint[0] & 0xf8) << 8;
|
|
compressed |= (endPoint[1] & 0xfc) << 3;
|
|
compressed |= (endPoint[2] & 0xf8) >> 3;
|
|
|
|
compressedEP[ep] = static_cast<ParallelMath::ScalarUInt16>(compressed);
|
|
}
|
|
|
|
int indexOrder[4];
|
|
|
|
if (range == 4)
|
|
{
|
|
if (compressedEP[0] == compressedEP[1])
|
|
{
|
|
indexOrder[0] = 0;
|
|
indexOrder[1] = 0;
|
|
indexOrder[2] = 0;
|
|
indexOrder[3] = 0;
|
|
}
|
|
else if (compressedEP[0] < compressedEP[1])
|
|
{
|
|
std::swap(compressedEP[0], compressedEP[1]);
|
|
indexOrder[0] = 1;
|
|
indexOrder[1] = 3;
|
|
indexOrder[2] = 2;
|
|
indexOrder[3] = 0;
|
|
}
|
|
else
|
|
{
|
|
indexOrder[0] = 0;
|
|
indexOrder[1] = 2;
|
|
indexOrder[2] = 3;
|
|
indexOrder[3] = 1;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
assert(range == 3);
|
|
|
|
if (compressedEP[0] > compressedEP[1])
|
|
{
|
|
std::swap(compressedEP[0], compressedEP[1]);
|
|
indexOrder[0] = 1;
|
|
indexOrder[1] = 2;
|
|
indexOrder[2] = 0;
|
|
}
|
|
else
|
|
{
|
|
indexOrder[0] = 0;
|
|
indexOrder[1] = 2;
|
|
indexOrder[2] = 1;
|
|
}
|
|
indexOrder[3] = 3;
|
|
}
|
|
|
|
packedBlocks[0] = static_cast<uint8_t>(compressedEP[0] & 0xff);
|
|
packedBlocks[1] = static_cast<uint8_t>((compressedEP[0] >> 8) & 0xff);
|
|
packedBlocks[2] = static_cast<uint8_t>(compressedEP[1] & 0xff);
|
|
packedBlocks[3] = static_cast<uint8_t>((compressedEP[1] >> 8) & 0xff);
|
|
|
|
for (int i = 0; i < 16; i += 4)
|
|
{
|
|
int packedIndexes = 0;
|
|
for (int subi = 0; subi < 4; subi++)
|
|
{
|
|
ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[i + subi], block);
|
|
packedIndexes |= (indexOrder[index] << (subi * 2));
|
|
}
|
|
|
|
packedBlocks[4 + i / 4] = static_cast<uint8_t>(packedIndexes);
|
|
}
|
|
|
|
packedBlocks += packedBlockStride;
|
|
}
|
|
}
|
|
|
|
#endif
|