virtualx-engine/modules/betsy/bc1.glsl

#[versions]

standard = "";
dithered = "#define BC1_DITHER";

#[compute]
#version 450

#include "CrossPlatformSettings_piece_all.glsl"
#include "UavCrossPlatform_piece_all.glsl"

// Largest finite 32-bit float. Used as the starting sentinel (+/-) for the
// min/max projection search in OptimizeColorsBlock().
#define FLT_MAX 340282346638528859811704183484516925440.0f

// Source image; main() fetches a 4x4 texel block from mip 0 per invocation.
layout(binding = 0) uniform sampler2D srcTex;
// Destination image; main() stores one texel per invocation whose .x holds the
// two packed 16-bit endpoints and whose .y holds the 32-bit index mask.
layout(binding = 1, rg32ui) uniform restrict writeonly uimage2D dstTexture;

// Single-color match tables, indexed by a channel value in [0; 255].
// For each entry, component [0] is used for the "max" endpoint and component
// [1] for the "min" endpoint. c_oMatch5 serves the red and blue channels,
// c_oMatch6 serves the green channel.
// NOTE(review): the table contents are filled in by the host side - not visible here.
layout(std430, binding = 2) readonly restrict buffer globalBuffer {
	float2 c_oMatch5[256];
	float2 c_oMatch6[256];
};

layout(push_constant, std430) uniform Params {
	// Upper bound on the number of refinement passes main() runs per block.
	uint p_numRefinements;
	// Not read anywhere in this shader; presumably pads the push constant
	// block to 16 bytes - confirm against the host-side struct.
	uint p_padding[3];
}
params;

// 8x8x1 invocations per workgroup; every invocation encodes one 4x4 block.
layout(local_size_x = 8, //
		local_size_y = 8, //
		local_size_z = 1) in;

/// Expands a packed RGB565 value (carried in a float) into an RGB triplet
/// in the [0; 255] range.
float3 rgb565to888(float rgb565) {
	// Peel the three bit fields apart with float arithmetic:
	// red sits above bit 11, green occupies bits 5..10, blue bits 0..4.
	const float red5 = floor(rgb565 / 2048.0f);
	const float green6 = floor(mod(rgb565, 2048.0f) / 32.0f);
	const float blue5 = floor(mod(rgb565, 32.0f));

	// The mathematically exact 565 -> 888 expansion would be:
	//		rgb = floor( rgb * ( 255.0f / float3( 31.0f, 63.0f, 31.0f ) ) + 0.5f )
	//
	// stb_dxt uses a slightly different mapping instead:
	//		rb = floor( rb * ( 256 / 32 + 8 / 32 ) );
	//		g  = floor( g  * ( 256 / 64 + 4 / 64 ) );
	//
	// The reason is unclear. It may be how S3TC specifies decoding, see:
	//		http://www.ludicon.com/castano/blog/2009/03/gpu-dxt-decompression/
	// or it may simply be cheap to implement with integer shifts.
	//
	// We stick to stb_dxt's mapping just in case. Both give almost the same
	// result (off by 1 or -1 for a very few values), and for an
	// 888 -> 565 -> 888 round trip the difference perhaps does not matter
	// because the values end up mapping to the original number.
	const float3 fields = float3(red5, green6, blue5);
	return floor(fields * float3(8.25f, 4.0625f, 8.25f));
}

/// Quantizes an RGB color in the [0; 255] range to 5/6/5 bits per channel and
/// packs it into a single float laid out as r * 2048 + g * 32 + b.
float rgb888to565(float3 rgbValue) {
	// Round every channel to its nearest 5-, 6- and 5-bit level respectively.
	const float3 channelLevels = float3(31.0f, 63.0f, 31.0f);
	const float3 quantized = floor(rgbValue * channelLevels / 255.0f + 0.5f);

	return quantized.r * 2048.0f + quantized.g * 32.0f + quantized.b;
}

// Returns the color located one third of the way from a to b.
// The rounding flavour is selected at compile time by STB_DXT_USE_ROUNDING_BIAS.
float3 lerp13(float3 a, float3 b) {
#ifdef STB_DXT_USE_ROUNDING_BIAS
	// Biased variant: round the offset from a to the nearest integer.
	const float3 delta = b - a;
	return a + floor(delta * (1.0f / 3.0f) + 0.5f);
#else
	// Unbiased variant: floor of the 2:1 weighted average.
	const float3 weightedSum = 2.0f * a + b;
	return floor(weightedSum / 3.0f);
#endif
}

/// Builds the 4-entry palette described by two RGB565 endpoints.
/// Entries 0 and 1 are the decoded endpoints c0 and c1; entry 2 lies one
/// third of the way from c0 to c1 and entry 3 one third of the way from c1 to c0.
void EvalColors(out float3 colors[4], float c0, float c1) {
	const float3 endpoint0 = rgb565to888(c0);
	const float3 endpoint1 = rgb565to888(c1);

	colors[0] = endpoint0;
	colors[1] = endpoint1;
	colors[2] = lerp13(endpoint0, endpoint1);
	colors[3] = lerp13(endpoint1, endpoint0);
}

/** Chooses the two block endpoints. (Clever code, part 1)
	Estimates the principal axis of the block's colors with a few power
	iterations over the covariance matrix, then takes the pixels with the
	smallest and largest projection onto that axis as endpoints.
@param srcPixelsBlock
	The 16 pixels of the block, each packed with packUnorm4x8
@param outMinEndp16 [out]
	Minimum endpoint, in RGB565
@param outMaxEndp16 [out]
	Maximum endpoint, in RGB565
*/
void OptimizeColorsBlock(const uint srcPixelsBlock[16], out float outMinEndp16, out float outMaxEndp16) {
	// Accumulate the channel sums and the bounding box while still in unorm range.
	const float3 firstTexel = unpackUnorm4x8(srcPixelsBlock[0]).xyz;
	float3 colorSum = firstTexel;
	float3 boxMin = firstTexel;
	float3 boxMax = firstTexel;
	for (int px = 1; px < 16; ++px) {
		const float3 texel = unpackUnorm4x8(srcPixelsBlock[px]).xyz;
		colorSum += texel;
		boxMin = min(boxMin, texel);
		boxMax = max(boxMax, texel);
	}

	// Move to the [0; 255] range; the mean is snapped to a whole number.
	const float3 meanColor = round(colorSum * 255.0f / 16.0f);
	boxMax *= 255.0f;
	boxMin *= 255.0f;

	// Upper triangle of the symmetric 3x3 covariance matrix.
	float covRR = 0.0f;
	float covRG = 0.0f;
	float covRB = 0.0f;
	float covGG = 0.0f;
	float covGB = 0.0f;
	float covBB = 0.0f;

	for (int px = 0; px < 16; ++px) {
		const float3 texel255 = unpackUnorm4x8(srcPixelsBlock[px]).xyz * 255.0f;
		const float3 delta = texel255 - meanColor;

		covRR += delta.r * delta.r;
		covRG += delta.r * delta.g;
		covRB += delta.r * delta.b;
		covGG += delta.g * delta.g;
		covGB += delta.g * delta.b;
		covBB += delta.b * delta.b;
	}

	// Scale the matrix down before running the power iteration.
	covRR /= 255.0f;
	covRG /= 255.0f;
	covRB /= 255.0f;
	covGG /= 255.0f;
	covGB /= 255.0f;
	covBB /= 255.0f;

	// Power iteration, seeded with the bounding box diagonal.
	float3 axis = boxMax - boxMin;

	const int nIterPower = 4;
	for (int iter = 0; iter < nIterPower; ++iter) {
		const float nextR = axis.r * covRR + axis.g * covRG + axis.b * covRB;
		const float nextG = axis.r * covRG + axis.g * covGG + axis.b * covGB;
		const float nextB = axis.r * covRB + axis.g * covGB + axis.b * covBB;

		axis = float3(nextR, nextG, nextB);
	}

	const float magn = max3(abs(axis.r), abs(axis.g), abs(axis.b));
	float3 projDir;

	if (magn < 4.0f) {
		// Axis too short to be meaningful: project onto luminance instead
		// (JPEG YCbCr luma coefficients, scaled by 1000).
		projDir = float3(299.0f, 587.0f, 114.0f);
	} else {
		// Rescale so the largest component has magnitude 512.
		projDir = trunc(axis * (512.0f / magn));
	}

	// The endpoints are the pixels with the extreme projections onto projDir.
	float3 lowColor, highColor;
	float lowestDot = FLT_MAX;
	float highestDot = -FLT_MAX;
	for (int px = 0; px < 16; ++px) {
		const float3 texel255 = unpackUnorm4x8(srcPixelsBlock[px]).xyz * 255.0f;
		const float projection = dot(texel255, projDir);

		if (projection < lowestDot) {
			lowestDot = projection;
			lowColor = texel255;
		}

		if (projection > highestDot) {
			highestDot = projection;
			highColor = texel255;
		}
	}

	outMinEndp16 = rgb888to565(lowColor);
	outMaxEndp16 = rgb888to565(highColor);
}

// The color matching function.
//
// Assigns each of the 16 pixels one of the 4 palette entries in 'color' and
// returns the result as a 32-bit mask holding 2 bits per pixel, with pixel i
// stored in bits [2 * i; 2 * i + 1].
//
// srcPixelsBlock: the 16 pixels of the block, each packed with packUnorm4x8.
// color: palette as produced by EvalColors (two endpoints + two interpolated colors).
//
// When BC1_DITHER is defined, the projection error of every pixel is diffused
// to its neighbours (Floyd-Steinberg weights 7, 3, 5, 1 out of 16) before the
// palette entry is picked.
uint MatchColorsBlock(const uint srcPixelsBlock[16], float3 color[4]) {
	uint mask = 0u;
	float3 dir = color[0] - color[1];
	float stops[4];

	// Position of every palette entry along the endpoint-to-endpoint direction.
	for (int i = 0; i < 4; ++i)
		stops[i] = dot(color[i], dir);

	// think of the colors as arranged on a line; project point onto that line, then choose
	// next color out of available ones. we compute the crossover points for "best color in top
	// half"/"best in bottom half" and then the same inside that subinterval.
	//
	// relying on this 1d approximation isn't always optimal in terms of euclidean distance,
	// but it's very close and a lot faster.
	// http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html

	// Midpoints between neighbouring stops; a projection below c0Point maps to
	// index 1, below halfPoint to index 3, below c3Point to index 2, else index 0.
	float c0Point = trunc((stops[1] + stops[3]) * 0.5f);
	float halfPoint = trunc((stops[3] + stops[2]) * 0.5f);
	float c3Point = trunc((stops[2] + stops[0]) * 0.5f);

#ifndef BC1_DITHER
	// the version without dithering is straightforward
	// Walk the pixels from last to first so that pixel 0 ends up in the lowest bits.
	for (uint i = 16u; i-- > 0u;) {
		const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;

		const float dotValue = dot(currColor, dir);
		mask <<= 2u;

		if (dotValue < halfPoint)
			mask |= ((dotValue < c0Point) ? 1u : 3u);
		else
			mask |= ((dotValue < c3Point) ? 2u : 0u);
	}
#else
	// with floyd-steinberg dithering
	// ep1 collects the errors of the row being processed, ep2 holds the errors
	// of the previous row (one component per column).
	float4 ep1 = float4(0, 0, 0, 0);
	float4 ep2 = float4(0, 0, 0, 0);

	// The diffused error is kept scaled by 16, so scale the thresholds to match.
	c0Point *= 16.0f;
	halfPoint *= 16.0f;
	c3Point *= 16.0f;

	for (uint y = 0u; y < 4u; ++y) {
		float ditherDot;
		uint lmask, step;

		float3 currColor;
		float dotValue;

		// Column 0
		currColor = unpackUnorm4x8(srcPixelsBlock[y * 4 + 0]).xyz * 255.0f;
		dotValue = dot(currColor, dir);

		ditherDot = (dotValue * 16.0f) + (3 * ep2[1] + 5 * ep2[0]);
		if (ditherDot < halfPoint)
			step = (ditherDot < c0Point) ? 1u : 3u;
		else
			step = (ditherDot < c3Point) ? 2u : 0u;
		ep1[0] = dotValue - stops[step];
		lmask = step;

		// Column 1
		currColor = unpackUnorm4x8(srcPixelsBlock[y * 4 + 1]).xyz * 255.0f;
		dotValue = dot(currColor, dir);

		ditherDot = (dotValue * 16.0f) + (7 * ep1[0] + 3 * ep2[2] + 5 * ep2[1] + ep2[0]);
		if (ditherDot < halfPoint)
			step = (ditherDot < c0Point) ? 1u : 3u;
		else
			step = (ditherDot < c3Point) ? 2u : 0u;
		ep1[1] = dotValue - stops[step];
		lmask |= step << 2u;

		// Column 2
		currColor = unpackUnorm4x8(srcPixelsBlock[y * 4 + 2]).xyz * 255.0f;
		dotValue = dot(currColor, dir);

		ditherDot = (dotValue * 16.0f) + (7 * ep1[1] + 3 * ep2[3] + 5 * ep2[2] + ep2[1]);
		if (ditherDot < halfPoint)
			step = (ditherDot < c0Point) ? 1u : 3u;
		else
			step = (ditherDot < c3Point) ? 2u : 0u;
		ep1[2] = dotValue - stops[step];
		lmask |= step << 4u;

		// Column 3. This must read the row's fourth pixel; reading column 2
		// again would leave the last column unsampled.
		currColor = unpackUnorm4x8(srcPixelsBlock[y * 4 + 3]).xyz * 255.0f;
		dotValue = dot(currColor, dir);

		ditherDot = (dotValue * 16.0f) + (7 * ep1[2] + 5 * ep2[3] + ep2[2]);
		if (ditherDot < halfPoint)
			step = (ditherDot < c0Point) ? 1u : 3u;
		else
			step = (ditherDot < c3Point) ? 2u : 0u;
		ep1[3] = dotValue - stops[step];
		lmask |= step << 6u;

		// Each row contributes 8 bits (4 pixels x 2 bits).
		mask |= lmask << (y * 8u);
		{
			float4 tmp = ep1;
			ep1 = ep2;
			ep2 = tmp;
		} // swap
	}
#endif

	return mask;
}

// The refinement function. (Clever code, part 2)
// Tries to optimize colors to suit block contents better.
// (By solving a least squares system via normal equations+Cramer's rule)
//
// srcPixelsBlock: the 16 pixels of the block, each packed with packUnorm4x8.
// mask: current per-pixel palette indices (2 bits per pixel, pixel 0 in the lowest bits).
// inOutMinEndp16 / inOutMaxEndp16: current endpoints in RGB565; overwritten
// with the refined endpoints.
//
// Returns true when at least one endpoint changed.
bool RefineBlock(const uint srcPixelsBlock[16], uint mask, inout float inOutMinEndp16,
		inout float inOutMaxEndp16) {
	float newMin16, newMax16;
	const float oldMin = inOutMinEndp16;
	const float oldMax = inOutMaxEndp16;

	if ((mask ^ (mask << 2u)) < 4u) // all pixels have the same index?
	{
		// yes, linear system would be singular; solve using optimal
		// single-color match on average color
		// (the 8 / 255 seed turns the floor below into a round-to-nearest of sum / 16)
		float3 rgbVal = float3(8.0f / 255.0f, 8.0f / 255.0f, 8.0f / 255.0f);
		for (int i = 0; i < 16; ++i)
			rgbVal += unpackUnorm4x8(srcPixelsBlock[i]).xyz;

		rgbVal = floor(rgbVal * (255.0f / 16.0f));

		newMax16 = c_oMatch5[uint(rgbVal.r)][0] * 2048.0f + //
				c_oMatch6[uint(rgbVal.g)][0] * 32.0f + //
				c_oMatch5[uint(rgbVal.b)][0];
		newMin16 = c_oMatch5[uint(rgbVal.r)][1] * 2048.0f + //
				c_oMatch6[uint(rgbVal.g)][1] * 32.0f + //
				c_oMatch5[uint(rgbVal.b)][1];
	} else {
		// Weight (in thirds) of the first endpoint for each palette index;
		// the second endpoint's weight is w2 = 3 - w1.
		const float w1Tab[4] = { 3, 0, 2, 1 };
		// For each index: w1 * w1 * 65536 + w2 * w2 * 256 + w1 * w2.
		const float prods[4] = { 589824.0f, 2304.0f, 262402.0f, 66562.0f };
		// ^some magic to save a lot of multiplies in the accumulating loop...
		// (precomputed products of weights for least squares system, accumulated inside one 32-bit
		// register)

		float akku = 0.0f;
		uint cm = mask;
		float3 at1 = float3(0, 0, 0);
		float3 at2 = float3(0, 0, 0);
		for (int i = 0; i < 16; ++i, cm >>= 2u) {
			const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;

			const uint step = cm & 3u;
			const float w1 = w1Tab[step];
			akku += prods[step];
			at1 += currColor * w1;
			at2 += currColor;
		}

		// at2 so far is the plain color sum; turn it into sum( color * w2 ).
		at2 = 3.0f * at2 - at1;

		// extract solutions and decide solvability
		// The three sums are packed at multiples of 2^16, 2^8 and 2^0, so they
		// have to be unpacked with exactly those powers of two.
		const float xx = floor(akku / 65536.0f);
		const float yy = floor(mod(akku, 65536.0f) / 256.0f);
		const float xy = mod(akku, 256.0f);

		// Reciprocal of the determinant, pre-multiplied by the factors that map
		// the solution to 5-bit (red/blue) and 6-bit (green) ranges.
		float2 f_rb_g;
		f_rb_g.x = 3.0f * 31.0f / 255.0f / (xx * yy - xy * xy);
		f_rb_g.y = f_rb_g.x * 63.0f / 31.0f;

		// solve.
		const float3 newMaxVal = clamp(floor((at1 * yy - at2 * xy) * f_rb_g.xyx + 0.5f),
				float3(0.0f, 0.0f, 0.0f), float3(31, 63, 31));
		newMax16 = newMaxVal.x * 2048.0f + newMaxVal.y * 32.0f + newMaxVal.z;

		const float3 newMinVal = clamp(floor((at2 * xx - at1 * xy) * f_rb_g.xyx + 0.5f),
				float3(0.0f, 0.0f, 0.0f), float3(31, 63, 31));
		newMin16 = newMinVal.x * 2048.0f + newMinVal.y * 32.0f + newMinVal.z;
	}

	inOutMinEndp16 = newMin16;
	inOutMaxEndp16 = newMax16;

	return oldMin != newMin16 || oldMax != newMax16;
}

#ifdef BC1_DITHER
/// Snaps an 888 color (full [0; 255] range) onto the set of colors that are
/// representable in 565, by converting 888 -> 565 -> 888.
/// Input outside [0; 255] is clamped first.
float3 quant(float3 srcValue) {
	const float3 scaleTo565 = float3(31.0f / 255.0f, 63.0f / 255.0f, 31.0f / 255.0f);
	const float3 scaleTo888 = float3(8.25f, 4.0625f, 8.25f);

	const float3 clamped = clamp(srcValue, 0.0f, 255.0f);
	// 888 -> 565, rounding to the nearest level
	const float3 levels565 = floor(clamped * scaleTo565 + 0.5f);
	// 565 -> 888, same expansion factors as rgb565to888
	return floor(levels565 * scaleTo888);
}

/// Produces a dithered copy of a 4x4 block: every pixel is quantized to a
/// 565-representable color and its quantization error is spread to the
/// following pixels with the weights 7, 3, 5 and 1 (out of 16).
/// srcPixBlck: the 16 source pixels, packed with packUnorm4x8.
/// dthPixBlck: receives the 16 dithered pixels, packed with packUnorm4x8 (alpha set to 1).
void DitherBlock(const uint srcPixBlck[16], out uint dthPixBlck[16]) {
	// Per-column RGB errors: errCurr for the row being written, errPrev for the row above.
	float3 errCurr[4] = { float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0) };
	float3 errPrev[4] = { float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0) };

	for (uint rowStart = 0u; rowStart < 16u; rowStart += 4u) {
		float3 original, quantized, carried;

		// Column 0: only receives error from the row above.
		original = unpackUnorm4x8(srcPixBlck[rowStart + 0u]).xyz * 255.0f;
		carried = trunc((3 * errPrev[1] + 5 * errPrev[0]) * (1.0f / 16.0f));
		quantized = quant(original + carried);
		errCurr[0] = original - quantized;
		dthPixBlck[rowStart + 0u] = packUnorm4x8(float4(quantized * (1.0f / 255.0f), 1.0f));

		// Column 1
		original = unpackUnorm4x8(srcPixBlck[rowStart + 1u]).xyz * 255.0f;
		carried = trunc((7 * errCurr[0] + 3 * errPrev[2] + 5 * errPrev[1] + errPrev[0]) * (1.0f / 16.0f));
		quantized = quant(original + carried);
		errCurr[1] = original - quantized;
		dthPixBlck[rowStart + 1u] = packUnorm4x8(float4(quantized * (1.0f / 255.0f), 1.0f));

		// Column 2
		original = unpackUnorm4x8(srcPixBlck[rowStart + 2u]).xyz * 255.0f;
		carried = trunc((7 * errCurr[1] + 3 * errPrev[3] + 5 * errPrev[2] + errPrev[1]) * (1.0f / 16.0f));
		quantized = quant(original + carried);
		errCurr[2] = original - quantized;
		dthPixBlck[rowStart + 2u] = packUnorm4x8(float4(quantized * (1.0f / 255.0f), 1.0f));

		// Column 3: there is no upper-right neighbour to take error from.
		original = unpackUnorm4x8(srcPixBlck[rowStart + 3u]).xyz * 255.0f;
		carried = trunc((7 * errCurr[2] + 5 * errPrev[3] + errPrev[2]) * (1.0f / 16.0f));
		quantized = quant(original + carried);
		errCurr[3] = original - quantized;
		dthPixBlck[rowStart + 3u] = packUnorm4x8(float4(quantized * (1.0f / 255.0f), 1.0f));

		// The row just written becomes the "row above" of the next iteration.
		for (uint col = 0u; col < 4u; ++col) {
			const float3 held = errCurr[col];
			errCurr[col] = errPrev[col];
			errPrev[col] = held;
		}
	}
}
#endif

// Entry point: every invocation encodes the 4x4 texel block whose block
// coordinate is gl_GlobalInvocationID.xy and writes the resulting 64-bit
// block (two 16-bit endpoints + 32-bit index mask) to dstTexture.
void main() {
	uint srcPixelsBlock[16];

	bool bAllColorsEqual = true;

	// Load the whole 4x4 block
	const uint2 pixelsToLoadBase = gl_GlobalInvocationID.xy << 2u;
	for (uint i = 0u; i < 16u; ++i) {
		// Row-major order: i & 3 is the column, i >> 2 is the row.
		const uint2 pixelsToLoad = pixelsToLoadBase + uint2(i & 0x03u, i >> 2u);
		const float3 srcPixels0 = OGRE_Load2D(srcTex, int2(pixelsToLoad), 0).xyz;
		srcPixelsBlock[i] = packUnorm4x8(float4(srcPixels0, 1.0f));
		bAllColorsEqual = bAllColorsEqual && srcPixelsBlock[0] == srcPixelsBlock[i];
	}

	float maxEndp16, minEndp16;
	uint mask = 0u;

	if (bAllColorsEqual) {
		// Uniform block: look the endpoints up in the single-color match tables.
		// The 8-bit channel values are read straight from the packed word
		// (packUnorm4x8 stores x in the lowest byte) instead of going through
		// a float round trip that would be truncated on conversion to uint.
		const uint packedColor = srcPixelsBlock[0];
		const uint3 rgbVal = uint3(packedColor & 0xFFu, (packedColor >> 8u) & 0xFFu,
				(packedColor >> 16u) & 0xFFu);
		// Index 2 for every pixel.
		mask = 0xAAAAAAAAu;
		maxEndp16 =
				c_oMatch5[rgbVal.r][0] * 2048.0f + c_oMatch6[rgbVal.g][0] * 32.0f + c_oMatch5[rgbVal.b][0];
		minEndp16 =
				c_oMatch5[rgbVal.r][1] * 2048.0f + c_oMatch6[rgbVal.g][1] * 32.0f + c_oMatch5[rgbVal.b][1];
	} else {
#ifdef BC1_DITHER
		uint ditherPixelsBlock[16];
		// first step: compute dithered version for PCA if desired
		DitherBlock(srcPixelsBlock, ditherPixelsBlock);
#else
		// Without dithering the endpoint search works on the source pixels directly.
#define ditherPixelsBlock srcPixelsBlock
#endif

		// second step: pca+map along principal axis
		OptimizeColorsBlock(ditherPixelsBlock, minEndp16, maxEndp16);
		if (minEndp16 != maxEndp16) {
			float3 colors[4];
			EvalColors(colors, maxEndp16, minEndp16); // Note min/max are inverted
			mask = MatchColorsBlock(srcPixelsBlock, colors);
		}

		// third step: refine (multiple times if requested)
		// Stops early once the index mask no longer changes or the endpoints collapse.
		bool bStopRefinement = false;
		for (uint i = 0u; i < params.p_numRefinements && !bStopRefinement; ++i) {
			const uint lastMask = mask;

			if (RefineBlock(ditherPixelsBlock, mask, minEndp16, maxEndp16)) {
				if (minEndp16 != maxEndp16) {
					float3 colors[4];
					EvalColors(colors, maxEndp16, minEndp16); // Note min/max are inverted
					mask = MatchColorsBlock(srcPixelsBlock, colors);
				} else {
					mask = 0u;
					bStopRefinement = true;
				}
			}

			bStopRefinement = mask == lastMask || bStopRefinement;
		}
	}

	// write the color block
	// Make sure the larger endpoint is stored first. Flipping the low bit of
	// every 2-bit index exchanges indices 0 <-> 1 and 2 <-> 3 to compensate.
	if (maxEndp16 < minEndp16) {
		const float tmpValue = minEndp16;
		minEndp16 = maxEndp16;
		maxEndp16 = tmpValue;
		mask ^= 0x55555555u;
	}

	// Low 16 bits: max endpoint, high 16 bits: min endpoint, second word: indices.
	uint2 outputBytes;
	outputBytes.x = uint(maxEndp16) | (uint(minEndp16) << 16u);
	outputBytes.y = mask;

	uint2 dstUV = gl_GlobalInvocationID.xy;
	imageStore(dstTexture, int2(dstUV), uint4(outputBytes.xy, 0u, 0u));
}
Betsy: Add caching and BC1 compression support 2024-08-20 15:14:48 +02:00			`#[versions]`

			`standard = "";`
			`dithered = "#define BC1_DITHER";`

			`#[compute]`
			`#version 450`

			`#include "CrossPlatformSettings_piece_all.glsl"`
			`#include "UavCrossPlatform_piece_all.glsl"`

			`#define FLT_MAX 340282346638528859811704183484516925440.0f`

			`layout(binding = 0) uniform sampler2D srcTex;`
			`layout(binding = 1, rg32ui) uniform restrict writeonly uimage2D dstTexture;`

			`layout(std430, binding = 2) readonly restrict buffer globalBuffer {`
			`float2 c_oMatch5[256];`
			`float2 c_oMatch6[256];`
			`};`

			`layout(push_constant, std430) uniform Params {`
			`uint p_numRefinements;`
			`uint p_padding[3];`
			`}`
			`params;`

			`layout(local_size_x = 8, //`
			`local_size_y = 8, //`
			`local_size_z = 1) in;`

			`float3 rgb565to888(float rgb565) {`
			`float3 retVal;`
			`retVal.x = floor(rgb565 / 2048.0f);`
			`retVal.y = floor(mod(rgb565, 2048.0f) / 32.0f);`
			`retVal.z = floor(mod(rgb565, 32.0f));`

			`// This is the correct 565 to 888 conversion:`
			`// rgb = floor( rgb * ( 255.0f / float3( 31.0f, 63.0f, 31.0f ) ) + 0.5f )`
			`//`
			`// However stb_dxt follows a different one:`
			`// rb = floor( rb * ( 256 / 32 + 8 / 32 ) );`
			`// g = floor( g * ( 256 / 64 + 4 / 64 ) );`
			`//`
			`// I'm not sure exactly why but it's possible this is how the S3TC specifies it should be decoded`
			`// It's quite possible this is the reason:`
			`// http://www.ludicon.com/castano/blog/2009/03/gpu-dxt-decompression/`
			`//`
			`// Or maybe it's just because it's cheap to do with integer shifts.`
			`// Anyway, we follow stb_dxt's conversion just in case`
			`// (gives almost the same result, with 1 or -1 of difference for a very few values)`
			`//`
			`// Perhaps when we make 888 -> 565 -> 888 it doesn't matter`
			`// because they end up mapping to the original number`

			`return floor(retVal * float3(8.25f, 4.0625f, 8.25f));`
			`}`

			`float rgb888to565(float3 rgbValue) {`
			`rgbValue.rb = floor(rgbValue.rb * 31.0f / 255.0f + 0.5f);`
			`rgbValue.g = floor(rgbValue.g * 63.0f / 255.0f + 0.5f);`

			`return rgbValue.r * 2048.0f + rgbValue.g * 32.0f + rgbValue.b;`
			`}`

			`// linear interpolation at 1/3 point between a and b, using desired rounding type`
			`float3 lerp13(float3 a, float3 b) {`
			`#ifdef STB_DXT_USE_ROUNDING_BIAS`
			`// with rounding bias`
			`return a + floor((b - a) * (1.0f / 3.0f) + 0.5f);`
			`#else`
			`// without rounding bias`
			`return floor((2.0f * a + b) / 3.0f);`
			`#endif`
			`}`

			`/// Unpacks a block of 4 colors from two 16-bit endpoints`
			`void EvalColors(out float3 colors[4], float c0, float c1) {`
			`colors[0] = rgb565to888(c0);`
			`colors[1] = rgb565to888(c1);`
			`colors[2] = lerp13(colors[0], colors[1]);`
			`colors[3] = lerp13(colors[1], colors[0]);`
			`}`

			`/** The color optimization function. (Clever code, part 1)`
			`@param outMinEndp16 [out]`
			`Minimum endpoint, in RGB565`
			`@param outMaxEndp16 [out]`
			`Maximum endpoint, in RGB565`
			`*/`
			`void OptimizeColorsBlock(const uint srcPixelsBlock[16], out float outMinEndp16, out float outMaxEndp16) {`
			`// determine color distribution`
			`float3 avgColor;`
			`float3 minColor;`
			`float3 maxColor;`

			`avgColor = minColor = maxColor = unpackUnorm4x8(srcPixelsBlock[0]).xyz;`
			`for (int i = 1; i < 16; ++i) {`
			`const float3 currColorUnorm = unpackUnorm4x8(srcPixelsBlock[i]).xyz;`
			`avgColor += currColorUnorm;`
			`minColor = min(minColor, currColorUnorm);`
			`maxColor = max(maxColor, currColorUnorm);`
			`}`

			`avgColor = round(avgColor * 255.0f / 16.0f);`
			`maxColor *= 255.0f;`
			`minColor *= 255.0f;`

			`// determine covariance matrix`
			`float cov[6];`
			`for (int i = 0; i < 6; ++i)`
			`cov[i] = 0;`

			`for (int i = 0; i < 16; ++i) {`
			`const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;`
			`float3 rgbDiff = currColor - avgColor;`

			`cov[0] += rgbDiff.r * rgbDiff.r;`
			`cov[1] += rgbDiff.r * rgbDiff.g;`
			`cov[2] += rgbDiff.r * rgbDiff.b;`
			`cov[3] += rgbDiff.g * rgbDiff.g;`
			`cov[4] += rgbDiff.g * rgbDiff.b;`
			`cov[5] += rgbDiff.b * rgbDiff.b;`
			`}`

			`// convert covariance matrix to float, find principal axis via power iter`
			`for (int i = 0; i < 6; ++i)`
			`cov[i] /= 255.0f;`

			`float3 vF = maxColor - minColor;`

			`const int nIterPower = 4;`
			`for (int iter = 0; iter < nIterPower; ++iter) {`
			`const float r = vF.r * cov[0] + vF.g * cov[1] + vF.b * cov[2];`
			`const float g = vF.r * cov[1] + vF.g * cov[3] + vF.b * cov[4];`
			`const float b = vF.r * cov[2] + vF.g * cov[4] + vF.b * cov[5];`

			`vF.r = r;`
			`vF.g = g;`
			`vF.b = b;`
			`}`

			`float magn = max3(abs(vF.r), abs(vF.g), abs(vF.b));`
			`float3 v;`

			`if (magn < 4.0f) { // too small, default to luminance`
			`v.r = 299.0f; // JPEG YCbCr luma coefs, scaled by 1000.`
			`v.g = 587.0f;`
			`v.b = 114.0f;`
			`} else {`
			`v = trunc(vF * (512.0f / magn));`
			`}`

			`// Pick colors at extreme points`
			`float3 minEndpoint, maxEndpoint;`
			`float minDot = FLT_MAX;`
			`float maxDot = -FLT_MAX;`
			`for (int i = 0; i < 16; ++i) {`
			`const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;`
			`const float dotValue = dot(currColor, v);`

			`if (dotValue < minDot) {`
			`minDot = dotValue;`
			`minEndpoint = currColor;`
			`}`

			`if (dotValue > maxDot) {`
			`maxDot = dotValue;`
			`maxEndpoint = currColor;`
			`}`
			`}`

			`outMinEndp16 = rgb888to565(minEndpoint);`
			`outMaxEndp16 = rgb888to565(maxEndpoint);`
			`}`

			`// The color matching function`
			`uint MatchColorsBlock(const uint srcPixelsBlock[16], float3 color[4]) {`
			`uint mask = 0u;`
			`float3 dir = color[0] - color[1];`
			`float stops[4];`

			`for (int i = 0; i < 4; ++i)`
			`stops[i] = dot(color[i], dir);`

			`// think of the colors as arranged on a line; project point onto that line, then choose`
			`// next color out of available ones. we compute the crossover points for "best color in top`
			`// half"/"best in bottom half" and then the same inside that subinterval.`
			`//`
			`// relying on this 1d approximation isn't always optimal in terms of euclidean distance,`
			`// but it's very close and a lot faster.`
			`// http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html`

			`float c0Point = trunc((stops[1] + stops[3]) * 0.5f);`
			`float halfPoint = trunc((stops[3] + stops[2]) * 0.5f);`
			`float c3Point = trunc((stops[2] + stops[0]) * 0.5f);`

			`#ifndef BC1_DITHER`
			`// the version without dithering is straightforward`
			`for (uint i = 16u; i-- > 0u;) {`
			`const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;`

			`const float dotValue = dot(currColor, dir);`
			`mask <<= 2u;`

			`if (dotValue < halfPoint)`
			`mask \|= ((dotValue < c0Point) ? 1u : 3u);`
			`else`
			`mask \|= ((dotValue < c3Point) ? 2u : 0u);`
			`}`
			`#else`
			`// with floyd-steinberg dithering`
			`float4 ep1 = float4(0, 0, 0, 0);`
			`float4 ep2 = float4(0, 0, 0, 0);`

			`c0Point *= 16.0f;`
			`halfPoint *= 16.0f;`
			`c3Point *= 16.0f;`

			`for (uint y = 0u; y < 4u; ++y) {`
			`float ditherDot;`
			`uint lmask, step;`

			`float3 currColor;`
			`float dotValue;`

			`currColor = unpackUnorm4x8(srcPixelsBlock[y * 4 + 0]).xyz * 255.0f;`
			`dotValue = dot(currColor, dir);`

			`ditherDot = (dotValue * 16.0f) + (3 * ep2[1] + 5 * ep2[0]);`
			`if (ditherDot < halfPoint)`
			`step = (ditherDot < c0Point) ? 1u : 3u;`
			`else`
			`step = (ditherDot < c3Point) ? 2u : 0u;`
			`ep1[0] = dotValue - stops[step];`
			`lmask = step;`

			`currColor = unpackUnorm4x8(srcPixelsBlock[y * 4 + 1]).xyz * 255.0f;`
			`dotValue = dot(currColor, dir);`

			`ditherDot = (dotValue * 16.0f) + (7 * ep1[0] + 3 * ep2[2] + 5 * ep2[1] + ep2[0]);`
			`if (ditherDot < halfPoint)`
			`step = (ditherDot < c0Point) ? 1u : 3u;`
			`else`
			`step = (ditherDot < c3Point) ? 2u : 0u;`
			`ep1[1] = dotValue - stops[step];`
			`lmask \|= step << 2u;`

			`currColor = unpackUnorm4x8(srcPixelsBlock[y * 4 + 2]).xyz * 255.0f;`
			`dotValue = dot(currColor, dir);`

			`ditherDot = (dotValue * 16.0f) + (7 * ep1[1] + 3 * ep2[3] + 5 * ep2[2] + ep2[1]);`
			`if (ditherDot < halfPoint)`
			`step = (ditherDot < c0Point) ? 1u : 3u;`
			`else`
			`step = (ditherDot < c3Point) ? 2u : 0u;`
			`ep1[2] = dotValue - stops[step];`
			`lmask \|= step << 4u;`

			`currColor = unpackUnorm4x8(srcPixelsBlock[y * 4 + 2]).xyz * 255.0f;`
			`dotValue = dot(currColor, dir);`

			`ditherDot = (dotValue * 16.0f) + (7 * ep1[2] + 5 * ep2[3] + ep2[2]);`
			`if (ditherDot < halfPoint)`
			`step = (ditherDot < c0Point) ? 1u : 3u;`
			`else`
			`step = (ditherDot < c3Point) ? 2u : 0u;`
			`ep1[3] = dotValue - stops[step];`
			`lmask \|= step << 6u;`

			`mask \|= lmask << (y * 8u);`
			`{`
			`float4 tmp = ep1;`
			`ep1 = ep2;`
			`ep2 = tmp;`
			`} // swap`
			`}`
			`#endif`

			`return mask;`
			`}`

			`// The refinement function. (Clever code, part 2)`
			`// Tries to optimize colors to suit block contents better.`
			`// (By solving a least squares system via normal equations+Cramer's rule)`
			`bool RefineBlock(const uint srcPixelsBlock[16], uint mask, inout float inOutMinEndp16,`
			`inout float inOutMaxEndp16) {`
			`float newMin16, newMax16;`
			`const float oldMin = inOutMinEndp16;`
			`const float oldMax = inOutMaxEndp16;`

			`if ((mask ^ (mask << 2u)) < 4u) // all pixels have the same index?`
			`{`
			`// yes, linear system would be singular; solve using optimal`
			`// single-color match on average color`
			`float3 rgbVal = float3(8.0f / 255.0f, 8.0f / 255.0f, 8.0f / 255.0f);`
			`for (int i = 0; i < 16; ++i)`
			`rgbVal += unpackUnorm4x8(srcPixelsBlock[i]).xyz;`

			`rgbVal = floor(rgbVal * (255.0f / 16.0f));`

			`newMax16 = c_oMatch5[uint(rgbVal.r)][0] * 2048.0f + //`
			`c_oMatch6[uint(rgbVal.g)][0] * 32.0f + //`
			`c_oMatch5[uint(rgbVal.b)][0];`
			`newMin16 = c_oMatch5[uint(rgbVal.r)][1] * 2048.0f + //`
			`c_oMatch6[uint(rgbVal.g)][1] * 32.0f + //`
			`c_oMatch5[uint(rgbVal.b)][1];`
			`} else {`
			`const float w1Tab[4] = { 3, 0, 2, 1 };`
			`const float prods[4] = { 589824.0f, 2304.0f, 262402.0f, 66562.0f };`
			`// ^some magic to save a lot of multiplies in the accumulating loop...`
			`// (precomputed products of weights for least squares system, accumulated inside one 32-bit`
			`// register)`

			`float akku = 0.0f;`
			`uint cm = mask;`
			`float3 at1 = float3(0, 0, 0);`
			`float3 at2 = float3(0, 0, 0);`
			`for (int i = 0; i < 16; ++i, cm >>= 2u) {`
			`const float3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;`

			`const uint step = cm & 3u;`
			`const float w1 = w1Tab[step];`
			`akku += prods[step];`
			`at1 += currColor * w1;`
			`at2 += currColor;`
			`}`

			`at2 = 3.0f * at2 - at1;`

			`// extract solutions and decide solvability`
			`const float xx = floor(akku / 65535.0f);`
			`const float yy = floor(mod(akku, 65535.0f) / 256.0f);`
			`const float xy = mod(akku, 256.0f);`

			`float2 f_rb_g;`
			`f_rb_g.x = 3.0f * 31.0f / 255.0f / (xx * yy - xy * xy);`
			`f_rb_g.y = f_rb_g.x * 63.0f / 31.0f;`

			`// solve.`
			`const float3 newMaxVal = clamp(floor((at1 * yy - at2 * xy) * f_rb_g.xyx + 0.5f),`
			`float3(0.0f, 0.0f, 0.0f), float3(31, 63, 31));`
			`newMax16 = newMaxVal.x * 2048.0f + newMaxVal.y * 32.0f + newMaxVal.z;`

			`const float3 newMinVal = clamp(floor((at2 * xx - at1 * xy) * f_rb_g.xyx + 0.5f),`
			`float3(0.0f, 0.0f, 0.0f), float3(31, 63, 31));`
			`newMin16 = newMinVal.x * 2048.0f + newMinVal.y * 32.0f + newMinVal.z;`
			`}`

			`inOutMinEndp16 = newMin16;`
			`inOutMaxEndp16 = newMax16;`

			`return oldMin != newMin16 \|\| oldMax != newMax16;`
			`}`

			`#ifdef BC1_DITHER`
			`/// Quantizes 'srcValue' which is originally in 888 (full range),`
			`/// converting it to 565 and then back to 888 (quantized)`
			`float3 quant(float3 srcValue) {`
			`srcValue = clamp(srcValue, 0.0f, 255.0f);`
			`// Convert 888 -> 565`
			`srcValue = floor(srcValue * float3(31.0f / 255.0f, 63.0f / 255.0f, 31.0f / 255.0f) + 0.5f);`
			`// Convert 565 -> 888 back`
			`srcValue = floor(srcValue * float3(8.25f, 4.0625f, 8.25f));`

			`return srcValue;`
			`}`

			`void DitherBlock(const uint srcPixBlck[16], out uint dthPixBlck[16]) {`
			`float3 ep1[4] = { float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0) };`
			`float3 ep2[4] = { float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0) };`

			`for (uint y = 0u; y < 16u; y += 4u) {`
			`float3 srcPixel, dithPixel;`

			`srcPixel = unpackUnorm4x8(srcPixBlck[y + 0u]).xyz * 255.0f;`
			`dithPixel = quant(srcPixel + trunc((3 * ep2[1] + 5 * ep2[0]) * (1.0f / 16.0f)));`
			`ep1[0] = srcPixel - dithPixel;`
			`dthPixBlck[y + 0u] = packUnorm4x8(float4(dithPixel * (1.0f / 255.0f), 1.0f));`

			`srcPixel = unpackUnorm4x8(srcPixBlck[y + 1u]).xyz * 255.0f;`
			`dithPixel = quant(`
			`srcPixel + trunc((7 * ep1[0] + 3 * ep2[2] + 5 * ep2[1] + ep2[0]) * (1.0f / 16.0f)));`
			`ep1[1] = srcPixel - dithPixel;`
			`dthPixBlck[y + 1u] = packUnorm4x8(float4(dithPixel * (1.0f / 255.0f), 1.0f));`

			`srcPixel = unpackUnorm4x8(srcPixBlck[y + 2u]).xyz * 255.0f;`
			`dithPixel = quant(`
			`srcPixel + trunc((7 * ep1[1] + 3 * ep2[3] + 5 * ep2[2] + ep2[1]) * (1.0f / 16.0f)));`
			`ep1[2] = srcPixel - dithPixel;`
			`dthPixBlck[y + 2u] = packUnorm4x8(float4(dithPixel * (1.0f / 255.0f), 1.0f));`

			`srcPixel = unpackUnorm4x8(srcPixBlck[y + 3u]).xyz * 255.0f;`
			`dithPixel = quant(srcPixel + trunc((7 * ep1[2] + 5 * ep2[3] + ep2[2]) * (1.0f / 16.0f)));`
			`ep1[3] = srcPixel - dithPixel;`
			`dthPixBlck[y + 3u] = packUnorm4x8(float4(dithPixel * (1.0f / 255.0f), 1.0f));`

			`// swap( ep1, ep2 )`
			`for (uint i = 0u; i < 4u; ++i) {`
			`float3 tmp = ep1[i];`
			`ep1[i] = ep2[i];`
			`ep2[i] = tmp;`
			`}`
			`}`
			`}`
			`#endif`

			`void main() {`
			`uint srcPixelsBlock[16];`

			`bool bAllColorsEqual = true;`

			`// Load the whole 4x4 block`
			`const uint2 pixelsToLoadBase = gl_GlobalInvocationID.xy << 2u;`
			`for (uint i = 0u; i < 16u; ++i) {`
			`const uint2 pixelsToLoad = pixelsToLoadBase + uint2(i & 0x03u, i >> 2u);`
			`const float3 srcPixels0 = OGRE_Load2D(srcTex, int2(pixelsToLoad), 0).xyz;`
			`srcPixelsBlock[i] = packUnorm4x8(float4(srcPixels0, 1.0f));`
			`bAllColorsEqual = bAllColorsEqual && srcPixelsBlock[0] == srcPixelsBlock[i];`
			`}`

			`float maxEndp16, minEndp16;`
			`uint mask = 0u;`

			`if (bAllColorsEqual) {`
			`const uint3 rgbVal = uint3(unpackUnorm4x8(srcPixelsBlock[0]).xyz * 255.0f);`
			`mask = 0xAAAAAAAAu;`
			`maxEndp16 =`
			`c_oMatch5[rgbVal.r][0] * 2048.0f + c_oMatch6[rgbVal.g][0] * 32.0f + c_oMatch5[rgbVal.b][0];`
			`minEndp16 =`
			`c_oMatch5[rgbVal.r][1] * 2048.0f + c_oMatch6[rgbVal.g][1] * 32.0f + c_oMatch5[rgbVal.b][1];`
			`} else {`
			`#ifdef BC1_DITHER`
			`uint ditherPixelsBlock[16];`
			`// first step: compute dithered version for PCA if desired`
			`DitherBlock(srcPixelsBlock, ditherPixelsBlock);`
			`#else`
			`#define ditherPixelsBlock srcPixelsBlock`
			`#endif`

			`// second step: pca+map along principal axis`
			`OptimizeColorsBlock(ditherPixelsBlock, minEndp16, maxEndp16);`
			`if (minEndp16 != maxEndp16) {`
			`float3 colors[4];`
			`EvalColors(colors, maxEndp16, minEndp16); // Note min/max are inverted`
			`mask = MatchColorsBlock(srcPixelsBlock, colors);`
			`}`

			`// third step: refine (multiple times if requested)`
			`bool bStopRefinement = false;`
			`for (uint i = 0u; i < params.p_numRefinements && !bStopRefinement; ++i) {`
			`const uint lastMask = mask;`

			`if (RefineBlock(ditherPixelsBlock, mask, minEndp16, maxEndp16)) {`
			`if (minEndp16 != maxEndp16) {`
			`float3 colors[4];`
			`EvalColors(colors, maxEndp16, minEndp16); // Note min/max are inverted`
			`mask = MatchColorsBlock(srcPixelsBlock, colors);`
			`} else {`
			`mask = 0u;`
			`bStopRefinement = true;`
			`}`
			`}`

			`bStopRefinement = mask == lastMask \|\| bStopRefinement;`
			`}`
			`}`

			`// write the color block`
			`if (maxEndp16 < minEndp16) {`
			`const float tmpValue = minEndp16;`
			`minEndp16 = maxEndp16;`
			`maxEndp16 = tmpValue;`
			`mask ^= 0x55555555u;`
			`}`

			`uint2 outputBytes;`
			`outputBytes.x = uint(maxEndp16) \| (uint(minEndp16) << 16u);`
			`outputBytes.y = mask;`

			`uint2 dstUV = gl_GlobalInvocationID.xy;`
			`imageStore(dstTexture, int2(dstUV), uint4(outputBytes.xy, 0u, 0u));`
			`}`