419b342a9a
Make BC6 and BC7 CVTT faster while still having better quality than DXT5.
104 lines
4.5 KiB
C++
104 lines
4.5 KiB
C++
#pragma once
|
|
#ifndef __CVTT_BCCOMMON_H__
|
|
#define __CVTT_BCCOMMON_H__
|
|
|
|
#include "ConvectionKernels_AggregatedError.h"
|
|
#include "ConvectionKernels_ParallelMath.h"
|
|
|
|
namespace cvtt
|
|
{
|
|
namespace Internal
|
|
{
|
|
class BCCommon
|
|
{
|
|
public:
|
|
typedef ParallelMath::Float MFloat;
|
|
typedef ParallelMath::UInt16 MUInt16;
|
|
typedef ParallelMath::UInt15 MUInt15;
|
|
typedef ParallelMath::AInt16 MAInt16;
|
|
typedef ParallelMath::SInt16 MSInt16;
|
|
typedef ParallelMath::SInt32 MSInt32;
|
|
|
|
static int TweakRoundsForRange(int range);
|
|
|
|
template<int TVectorSize>
|
|
static void ComputeErrorLDR(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], int numRealChannels, AggregatedError<TVectorSize> &aggError)
|
|
{
|
|
for (int ch = 0; ch < numRealChannels; ch++)
|
|
aggError.Add(ParallelMath::SqDiffUInt8(reconstructed[ch], original[ch]), ch);
|
|
}
|
|
|
|
template<int TVectorSize>
|
|
static void ComputeErrorLDR(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], AggregatedError<TVectorSize> &aggError)
|
|
{
|
|
ComputeErrorLDR<TVectorSize>(flags, reconstructed, original, TVectorSize, aggError);
|
|
}
|
|
|
|
template<int TVectorSize>
|
|
static MFloat ComputeErrorLDRSimple(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], int numRealChannels, const float *channelWeightsSq)
|
|
{
|
|
AggregatedError<TVectorSize> aggError;
|
|
ComputeErrorLDR<TVectorSize>(flags, reconstructed, original, numRealChannels, aggError);
|
|
return aggError.Finalize(flags, channelWeightsSq);
|
|
}
|
|
|
|
template<int TVectorSize>
|
|
static MFloat ComputeErrorHDRFast(uint32_t flags, const MSInt16 reconstructed[TVectorSize], const MSInt16 original[TVectorSize], const float channelWeightsSq[TVectorSize])
|
|
{
|
|
MFloat error = ParallelMath::MakeFloatZero();
|
|
if (flags & Flags::Uniform)
|
|
{
|
|
for (int ch = 0; ch < TVectorSize; ch++)
|
|
error = error + ParallelMath::SqDiffSInt16(reconstructed[ch], original[ch]);
|
|
}
|
|
else
|
|
{
|
|
for (int ch = 0; ch < TVectorSize; ch++)
|
|
error = error + ParallelMath::SqDiffSInt16(reconstructed[ch], original[ch]) * ParallelMath::MakeFloat(channelWeightsSq[ch]);
|
|
}
|
|
|
|
return error;
|
|
}
|
|
|
|
template<int TVectorSize>
|
|
static MFloat ComputeErrorHDRSlow(uint32_t flags, const MSInt16 reconstructed[TVectorSize], const MSInt16 original[TVectorSize], const float channelWeightsSq[TVectorSize])
|
|
{
|
|
MFloat error = ParallelMath::MakeFloatZero();
|
|
if (flags & Flags::Uniform)
|
|
{
|
|
for (int ch = 0; ch < TVectorSize; ch++)
|
|
error = error + ParallelMath::SqDiff2CL(reconstructed[ch], original[ch]);
|
|
}
|
|
else
|
|
{
|
|
for (int ch = 0; ch < TVectorSize; ch++)
|
|
error = error + ParallelMath::SqDiff2CL(reconstructed[ch], original[ch]) * ParallelMath::MakeFloat(channelWeightsSq[ch]);
|
|
}
|
|
|
|
return error;
|
|
}
|
|
|
|
template<int TChannelCount>
|
|
static void PreWeightPixelsLDR(MFloat preWeightedPixels[16][TChannelCount], const MUInt15 pixels[16][TChannelCount], const float channelWeights[TChannelCount])
|
|
{
|
|
for (int px = 0; px < 16; px++)
|
|
{
|
|
for (int ch = 0; ch < TChannelCount; ch++)
|
|
preWeightedPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]) * channelWeights[ch];
|
|
}
|
|
}
|
|
|
|
template<int TChannelCount>
|
|
static void PreWeightPixelsHDR(MFloat preWeightedPixels[16][TChannelCount], const MSInt16 pixels[16][TChannelCount], const float channelWeights[TChannelCount])
|
|
{
|
|
for (int px = 0; px < 16; px++)
|
|
{
|
|
for (int ch = 0; ch < TChannelCount; ch++)
|
|
preWeightedPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]) * channelWeights[ch];
|
|
}
|
|
}
|
|
};
|
|
}
|
|
}
|
|
|
|
#endif
|