virtualx-engine/thirdparty/amd-fsr2/shaders/ffx_spd.h

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

937 lines
37 KiB
C++
Raw Normal View History

2024-02-18 21:31:05 +01:00
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifdef FFX_CPU
FFX_STATIC void SpdSetup(FfxUInt32x2 dispatchThreadGroupCountXY, // CPU side: dispatch thread group count xy
FfxUInt32x2 workGroupOffset, // GPU side: pass in as constant
FfxUInt32x2 numWorkGroupsAndMips, // GPU side: pass in as constant
FfxUInt32x4 rectInfo, // left, top, width, height
FfxInt32 mips) // optional: if -1, calculate based on rect width and height
{
workGroupOffset[0] = rectInfo[0] / 64; // rectInfo[0] = left
workGroupOffset[1] = rectInfo[1] / 64; // rectInfo[1] = top
FfxUInt32 endIndexX = (rectInfo[0] + rectInfo[2] - 1) / 64; // rectInfo[0] = left, rectInfo[2] = width
FfxUInt32 endIndexY = (rectInfo[1] + rectInfo[3] - 1) / 64; // rectInfo[1] = top, rectInfo[3] = height
dispatchThreadGroupCountXY[0] = endIndexX + 1 - workGroupOffset[0];
dispatchThreadGroupCountXY[1] = endIndexY + 1 - workGroupOffset[1];
numWorkGroupsAndMips[0] = (dispatchThreadGroupCountXY[0]) * (dispatchThreadGroupCountXY[1]);
if (mips >= 0)
{
numWorkGroupsAndMips[1] = FfxUInt32(mips);
}
else
{
// calculate based on rect width and height
FfxUInt32 resolution = ffxMax(rectInfo[2], rectInfo[3]);
numWorkGroupsAndMips[1] = FfxUInt32((ffxMin(floor(log2(FfxFloat32(resolution))), FfxFloat32(12))));
}
}
FFX_STATIC void SpdSetup(FfxUInt32x2 dispatchThreadGroupCountXY, // CPU side: dispatch thread group count xy
FfxUInt32x2 workGroupOffset, // GPU side: pass in as constant
FfxUInt32x2 numWorkGroupsAndMips, // GPU side: pass in as constant
FfxUInt32x4 rectInfo) // left, top, width, height
{
SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo, -1);
}
#endif // #ifdef FFX_CPU
//==============================================================================================================================
// NON-PACKED VERSION
//==============================================================================================================================
#ifdef FFX_GPU
#ifdef SPD_PACKED_ONLY
// Avoid compiler error
FfxFloat32x4 SpdLoadSourceImage(FfxInt32x2 p, FfxUInt32 slice)
{
return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
}
FfxFloat32x4 SpdLoad(FfxInt32x2 p, FfxUInt32 slice)
{
return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
}
void SpdStore(FfxInt32x2 p, FfxFloat32x4 value, FfxUInt32 mip, FfxUInt32 slice)
{
}
FfxFloat32x4 SpdLoadIntermediate(FfxUInt32 x, FfxUInt32 y)
{
return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
}
void SpdStoreIntermediate(FfxUInt32 x, FfxUInt32 y, FfxFloat32x4 value)
{
}
FfxFloat32x4 SpdReduce4(FfxFloat32x4 v0, FfxFloat32x4 v1, FfxFloat32x4 v2, FfxFloat32x4 v3)
{
return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
}
#endif // #ifdef SPD_PACKED_ONLY
//_____________________________________________________________/\_______________________________________________________________
#if defined(FFX_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
#extension GL_KHR_shader_subgroup_quad:require
#endif
void SpdWorkgroupShuffleBarrier()
{
#ifdef FFX_GLSL
barrier();
#endif
#ifdef FFX_HLSL
GroupMemoryBarrierWithGroupSync();
#endif
}
// Only last active workgroup should proceed
bool SpdExitWorkgroup(FfxUInt32 numWorkGroups, FfxUInt32 localInvocationIndex, FfxUInt32 slice)
{
// global atomic counter
if (localInvocationIndex == 0)
{
SpdIncreaseAtomicCounter(slice);
}
SpdWorkgroupShuffleBarrier();
return (SpdGetAtomicCounter() != (numWorkGroups - 1));
}
// User defined: FfxFloat32x4 SpdReduce4(FfxFloat32x4 v0, FfxFloat32x4 v1, FfxFloat32x4 v2, FfxFloat32x4 v3);
FfxFloat32x4 SpdReduceQuad(FfxFloat32x4 v)
{
#if defined(FFX_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
FfxFloat32x4 v0 = v;
FfxFloat32x4 v1 = subgroupQuadSwapHorizontal(v);
FfxFloat32x4 v2 = subgroupQuadSwapVertical(v);
FfxFloat32x4 v3 = subgroupQuadSwapDiagonal(v);
return SpdReduce4(v0, v1, v2, v3);
#elif defined(FFX_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
// requires SM6.0
FfxUInt32 quad = WaveGetLaneIndex() & (~0x3);
FfxFloat32x4 v0 = v;
FfxFloat32x4 v1 = WaveReadLaneAt(v, quad | 1);
FfxFloat32x4 v2 = WaveReadLaneAt(v, quad | 2);
FfxFloat32x4 v3 = WaveReadLaneAt(v, quad | 3);
return SpdReduce4(v0, v1, v2, v3);
/*
// if SM6.0 is not available, you can use the AMD shader intrinsics
// the AMD shader intrinsics are available in AMD GPU Services (AGS) library:
// https://gpuopen.com/amd-gpu-services-ags-library/
// works for DX11
FfxFloat32x4 v0 = v;
FfxFloat32x4 v1;
v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
FfxFloat32x4 v2;
v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
FfxFloat32x4 v3;
v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
return SpdReduce4(v0, v1, v2, v3);
*/
#endif
return v;
}
FfxFloat32x4 SpdReduceIntermediate(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3)
{
FfxFloat32x4 v0 = SpdLoadIntermediate(i0.x, i0.y);
FfxFloat32x4 v1 = SpdLoadIntermediate(i1.x, i1.y);
FfxFloat32x4 v2 = SpdLoadIntermediate(i2.x, i2.y);
FfxFloat32x4 v3 = SpdLoadIntermediate(i3.x, i3.y);
return SpdReduce4(v0, v1, v2, v3);
}
FfxFloat32x4 SpdReduceLoad4(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
{
FfxFloat32x4 v0 = SpdLoad(FfxInt32x2(i0), slice);
FfxFloat32x4 v1 = SpdLoad(FfxInt32x2(i1), slice);
FfxFloat32x4 v2 = SpdLoad(FfxInt32x2(i2), slice);
FfxFloat32x4 v3 = SpdLoad(FfxInt32x2(i3), slice);
return SpdReduce4(v0, v1, v2, v3);
}
FfxFloat32x4 SpdReduceLoad4(FfxUInt32x2 base, FfxUInt32 slice)
{
return SpdReduceLoad4(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice);
}
FfxFloat32x4 SpdReduceLoadSourceImage4(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
{
FfxFloat32x4 v0 = SpdLoadSourceImage(FfxInt32x2(i0), slice);
FfxFloat32x4 v1 = SpdLoadSourceImage(FfxInt32x2(i1), slice);
FfxFloat32x4 v2 = SpdLoadSourceImage(FfxInt32x2(i2), slice);
FfxFloat32x4 v3 = SpdLoadSourceImage(FfxInt32x2(i3), slice);
return SpdReduce4(v0, v1, v2, v3);
}
FfxFloat32x4 SpdReduceLoadSourceImage(FfxUInt32x2 base, FfxUInt32 slice)
{
#ifdef SPD_LINEAR_SAMPLER
return SpdLoadSourceImage(FfxInt32x2(base), slice);
#else
return SpdReduceLoadSourceImage4(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice);
#endif
}
void SpdDownsampleMips_0_1_Intrinsics(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
FfxFloat32x4 v[4];
FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
v[0] = SpdReduceLoadSourceImage(tex, slice);
SpdStore(pix, v[0], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
v[1] = SpdReduceLoadSourceImage(tex, slice);
SpdStore(pix, v[1], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
v[2] = SpdReduceLoadSourceImage(tex, slice);
SpdStore(pix, v[2], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
v[3] = SpdReduceLoadSourceImage(tex, slice);
SpdStore(pix, v[3], 0, slice);
if (mip <= 1)
return;
v[0] = SpdReduceQuad(v[0]);
v[1] = SpdReduceQuad(v[1]);
v[2] = SpdReduceQuad(v[2]);
v[3] = SpdReduceQuad(v[3]);
if ((localInvocationIndex % 4) == 0)
{
SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2), v[0], 1, slice);
SpdStoreIntermediate(x / 2, y / 2, v[0]);
SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2), v[1], 1, slice);
SpdStoreIntermediate(x / 2 + 8, y / 2, v[1]);
SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2 + 8), v[2], 1, slice);
SpdStoreIntermediate(x / 2, y / 2 + 8, v[2]);
SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2 + 8), v[3], 1, slice);
SpdStoreIntermediate(x / 2 + 8, y / 2 + 8, v[3]);
}
}
void SpdDownsampleMips_0_1_LDS(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
FfxFloat32x4 v[4];
FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
v[0] = SpdReduceLoadSourceImage(tex, slice);
SpdStore(pix, v[0], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
v[1] = SpdReduceLoadSourceImage(tex, slice);
SpdStore(pix, v[1], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
v[2] = SpdReduceLoadSourceImage(tex, slice);
SpdStore(pix, v[2], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
v[3] = SpdReduceLoadSourceImage(tex, slice);
SpdStore(pix, v[3], 0, slice);
if (mip <= 1)
return;
for (FfxUInt32 i = 0; i < 4; i++)
{
SpdStoreIntermediate(x, y, v[i]);
SpdWorkgroupShuffleBarrier();
if (localInvocationIndex < 64)
{
v[i] = SpdReduceIntermediate(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice);
}
SpdWorkgroupShuffleBarrier();
}
if (localInvocationIndex < 64)
{
SpdStoreIntermediate(x + 0, y + 0, v[0]);
SpdStoreIntermediate(x + 8, y + 0, v[1]);
SpdStoreIntermediate(x + 0, y + 8, v[2]);
SpdStoreIntermediate(x + 8, y + 8, v[3]);
}
}
void SpdDownsampleMips_0_1(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
SpdDownsampleMips_0_1_LDS(x, y, workGroupID, localInvocationIndex, mip, slice);
#else
SpdDownsampleMips_0_1_Intrinsics(x, y, workGroupID, localInvocationIndex, mip, slice);
#endif
}
void SpdDownsampleMip_2(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
if (localInvocationIndex < 64)
{
FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
SpdStore(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x, y), v, mip, slice);
// store to LDS, try to reduce bank conflicts
// x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
// ...
// x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
SpdStoreIntermediate(x * 2 + y % 2, y * 2, v);
}
#else
FfxFloat32x4 v = SpdLoadIntermediate(x, y);
v = SpdReduceQuad(v);
// quad index 0 stores result
if (localInvocationIndex % 4 == 0)
{
SpdStore(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
SpdStoreIntermediate(x + (y / 2) % 2, y, v);
}
#endif
}
void SpdDownsampleMip_3(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
if (localInvocationIndex < 16)
{
// x 0 x 0
// 0 0 0 0
// 0 x 0 x
// 0 0 0 0
FfxFloat32x4 v =
SpdReduceIntermediate(FfxUInt32x2(x * 4 + 0 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 2 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 0 + 1, y * 4 + 2), FfxUInt32x2(x * 4 + 2 + 1, y * 4 + 2));
SpdStore(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x, y), v, mip, slice);
// store to LDS
// x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0
// ...
// 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0
// ...
// 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x
// ...
SpdStoreIntermediate(x * 4 + y, y * 4, v);
}
#else
if (localInvocationIndex < 64)
{
FfxFloat32x4 v = SpdLoadIntermediate(x * 2 + y % 2, y * 2);
v = SpdReduceQuad(v);
// quad index 0 stores result
if (localInvocationIndex % 4 == 0)
{
SpdStore(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
SpdStoreIntermediate(x * 2 + y / 2, y * 2, v);
}
}
#endif
}
void SpdDownsampleMip_4(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
if (localInvocationIndex < 4)
{
// x 0 0 0 x 0 0 0
// ...
// 0 x 0 0 0 x 0 0
FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(x * 8 + 0 + 0 + y * 2, y * 8 + 0),
FfxUInt32x2(x * 8 + 4 + 0 + y * 2, y * 8 + 0),
FfxUInt32x2(x * 8 + 0 + 1 + y * 2, y * 8 + 4),
FfxUInt32x2(x * 8 + 4 + 1 + y * 2, y * 8 + 4));
SpdStore(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x, y), v, mip, slice);
// store to LDS
// x x x x 0 ...
// 0 ...
SpdStoreIntermediate(x + y * 2, 0, v);
}
#else
if (localInvocationIndex < 16)
{
FfxFloat32x4 v = SpdLoadIntermediate(x * 4 + y, y * 4);
v = SpdReduceQuad(v);
// quad index 0 stores result
if (localInvocationIndex % 4 == 0)
{
SpdStore(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
SpdStoreIntermediate(x / 2 + y, 0, v);
}
}
#endif
}
void SpdDownsampleMip_5(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
if (localInvocationIndex < 1)
{
// x x x x 0 ...
// 0 ...
FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(0, 0), FfxUInt32x2(1, 0), FfxUInt32x2(2, 0), FfxUInt32x2(3, 0));
SpdStore(FfxInt32x2(workGroupID.xy), v, mip, slice);
}
#else
if (localInvocationIndex < 4)
{
FfxFloat32x4 v = SpdLoadIntermediate(localInvocationIndex, 0);
v = SpdReduceQuad(v);
// quad index 0 stores result
if (localInvocationIndex % 4 == 0)
{
SpdStore(FfxInt32x2(workGroupID.xy), v, mip, slice);
}
}
#endif
}
void SpdDownsampleMips_6_7(FfxUInt32 x, FfxUInt32 y, FfxUInt32 mips, FfxUInt32 slice)
{
FfxInt32x2 tex = FfxInt32x2(x * 4 + 0, y * 4 + 0);
FfxInt32x2 pix = FfxInt32x2(x * 2 + 0, y * 2 + 0);
FfxFloat32x4 v0 = SpdReduceLoad4(tex, slice);
SpdStore(pix, v0, 6, slice);
tex = FfxInt32x2(x * 4 + 2, y * 4 + 0);
pix = FfxInt32x2(x * 2 + 1, y * 2 + 0);
FfxFloat32x4 v1 = SpdReduceLoad4(tex, slice);
SpdStore(pix, v1, 6, slice);
tex = FfxInt32x2(x * 4 + 0, y * 4 + 2);
pix = FfxInt32x2(x * 2 + 0, y * 2 + 1);
FfxFloat32x4 v2 = SpdReduceLoad4(tex, slice);
SpdStore(pix, v2, 6, slice);
tex = FfxInt32x2(x * 4 + 2, y * 4 + 2);
pix = FfxInt32x2(x * 2 + 1, y * 2 + 1);
FfxFloat32x4 v3 = SpdReduceLoad4(tex, slice);
SpdStore(pix, v3, 6, slice);
if (mips <= 7)
return;
// no barrier needed, working on values only from the same thread
FfxFloat32x4 v = SpdReduce4(v0, v1, v2, v3);
SpdStore(FfxInt32x2(x, y), v, 7, slice);
SpdStoreIntermediate(x, y, v);
}
void SpdDownsampleNextFour(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 baseMip, FfxUInt32 mips, FfxUInt32 slice)
{
if (mips <= baseMip)
return;
SpdWorkgroupShuffleBarrier();
SpdDownsampleMip_2(x, y, workGroupID, localInvocationIndex, baseMip, slice);
if (mips <= baseMip + 1)
return;
SpdWorkgroupShuffleBarrier();
SpdDownsampleMip_3(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice);
if (mips <= baseMip + 2)
return;
SpdWorkgroupShuffleBarrier();
SpdDownsampleMip_4(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice);
if (mips <= baseMip + 3)
return;
SpdWorkgroupShuffleBarrier();
SpdDownsampleMip_5(workGroupID, localInvocationIndex, baseMip + 3, slice);
}
void SpdDownsample(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice)
{
FfxUInt32x2 sub_xy = ffxRemapForWaveReduction(localInvocationIndex % 64);
FfxUInt32 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2);
FfxUInt32 y = sub_xy.y + 8 * ((localInvocationIndex >> 7));
SpdDownsampleMips_0_1(x, y, workGroupID, localInvocationIndex, mips, slice);
SpdDownsampleNextFour(x, y, workGroupID, localInvocationIndex, 2, mips, slice);
if (mips <= 6)
return;
if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice))
return;
SpdResetAtomicCounter(slice);
// After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels.
SpdDownsampleMips_6_7(x, y, mips, slice);
SpdDownsampleNextFour(x, y, FfxUInt32x2(0, 0), localInvocationIndex, 8, mips, slice);
}
void SpdDownsample(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice, FfxUInt32x2 workGroupOffset)
{
SpdDownsample(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//==============================================================================================================================
// PACKED VERSION
//==============================================================================================================================
#if FFX_HALF
#ifdef FFX_GLSL
#extension GL_EXT_shader_subgroup_extended_types_float16:require
#endif
FfxFloat16x4 SpdReduceQuadH(FfxFloat16x4 v)
{
#if defined(FFX_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
FfxFloat16x4 v0 = v;
FfxFloat16x4 v1 = subgroupQuadSwapHorizontal(v);
FfxFloat16x4 v2 = subgroupQuadSwapVertical(v);
FfxFloat16x4 v3 = subgroupQuadSwapDiagonal(v);
return SpdReduce4H(v0, v1, v2, v3);
#elif defined(FFX_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
// requires SM6.0
FfxUInt32 quad = WaveGetLaneIndex() & (~0x3);
FfxFloat16x4 v0 = v;
FfxFloat16x4 v1 = WaveReadLaneAt(v, quad | 1);
FfxFloat16x4 v2 = WaveReadLaneAt(v, quad | 2);
FfxFloat16x4 v3 = WaveReadLaneAt(v, quad | 3);
return SpdReduce4H(v0, v1, v2, v3);
/*
// if SM6.0 is not available, you can use the AMD shader intrinsics
// the AMD shader intrinsics are available in AMD GPU Services (AGS) library:
// https://gpuopen.com/amd-gpu-services-ags-library/
// works for DX11
FfxFloat16x4 v0 = v;
FfxFloat16x4 v1;
v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
FfxFloat16x4 v2;
v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
FfxFloat16x4 v3;
v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
return SpdReduce4H(v0, v1, v2, v3);
*/
#endif
return FfxFloat16x4(0.0, 0.0, 0.0, 0.0);
}
FfxFloat16x4 SpdReduceIntermediateH(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3)
{
FfxFloat16x4 v0 = SpdLoadIntermediateH(i0.x, i0.y);
FfxFloat16x4 v1 = SpdLoadIntermediateH(i1.x, i1.y);
FfxFloat16x4 v2 = SpdLoadIntermediateH(i2.x, i2.y);
FfxFloat16x4 v3 = SpdLoadIntermediateH(i3.x, i3.y);
return SpdReduce4H(v0, v1, v2, v3);
}
FfxFloat16x4 SpdReduceLoad4H(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
{
FfxFloat16x4 v0 = SpdLoadH(FfxInt32x2(i0), slice);
FfxFloat16x4 v1 = SpdLoadH(FfxInt32x2(i1), slice);
FfxFloat16x4 v2 = SpdLoadH(FfxInt32x2(i2), slice);
FfxFloat16x4 v3 = SpdLoadH(FfxInt32x2(i3), slice);
return SpdReduce4H(v0, v1, v2, v3);
}
FfxFloat16x4 SpdReduceLoad4H(FfxUInt32x2 base, FfxUInt32 slice)
{
return SpdReduceLoad4H(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice);
}
FfxFloat16x4 SpdReduceLoadSourceImage4H(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
{
FfxFloat16x4 v0 = SpdLoadSourceImageH(FfxInt32x2(i0), slice);
FfxFloat16x4 v1 = SpdLoadSourceImageH(FfxInt32x2(i1), slice);
FfxFloat16x4 v2 = SpdLoadSourceImageH(FfxInt32x2(i2), slice);
FfxFloat16x4 v3 = SpdLoadSourceImageH(FfxInt32x2(i3), slice);
return SpdReduce4H(v0, v1, v2, v3);
}
FfxFloat16x4 SpdReduceLoadSourceImageH(FfxUInt32x2 base, FfxUInt32 slice)
{
#ifdef SPD_LINEAR_SAMPLER
return SpdLoadSourceImageH(FfxInt32x2(base), slice);
#else
return SpdReduceLoadSourceImage4H(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice);
#endif
}
void SpdDownsampleMips_0_1_IntrinsicsH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice)
{
FfxFloat16x4 v[4];
FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
v[0] = SpdReduceLoadSourceImageH(tex, slice);
SpdStoreH(pix, v[0], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
v[1] = SpdReduceLoadSourceImageH(tex, slice);
SpdStoreH(pix, v[1], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
v[2] = SpdReduceLoadSourceImageH(tex, slice);
SpdStoreH(pix, v[2], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
v[3] = SpdReduceLoadSourceImageH(tex, slice);
SpdStoreH(pix, v[3], 0, slice);
if (mips <= 1)
return;
v[0] = SpdReduceQuadH(v[0]);
v[1] = SpdReduceQuadH(v[1]);
v[2] = SpdReduceQuadH(v[2]);
v[3] = SpdReduceQuadH(v[3]);
if ((localInvocationIndex % 4) == 0)
{
SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2), v[0], 1, slice);
SpdStoreIntermediateH(x / 2, y / 2, v[0]);
SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2), v[1], 1, slice);
SpdStoreIntermediateH(x / 2 + 8, y / 2, v[1]);
SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2 + 8), v[2], 1, slice);
SpdStoreIntermediateH(x / 2, y / 2 + 8, v[2]);
SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2 + 8), v[3], 1, slice);
SpdStoreIntermediateH(x / 2 + 8, y / 2 + 8, v[3]);
}
}
void SpdDownsampleMips_0_1_LDSH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice)
{
FfxFloat16x4 v[4];
FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
v[0] = SpdReduceLoadSourceImageH(tex, slice);
SpdStoreH(pix, v[0], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
v[1] = SpdReduceLoadSourceImageH(tex, slice);
SpdStoreH(pix, v[1], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
v[2] = SpdReduceLoadSourceImageH(tex, slice);
SpdStoreH(pix, v[2], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
v[3] = SpdReduceLoadSourceImageH(tex, slice);
SpdStoreH(pix, v[3], 0, slice);
if (mips <= 1)
return;
for (FfxInt32 i = 0; i < 4; i++)
{
SpdStoreIntermediateH(x, y, v[i]);
SpdWorkgroupShuffleBarrier();
if (localInvocationIndex < 64)
{
v[i] = SpdReduceIntermediateH(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice);
}
SpdWorkgroupShuffleBarrier();
}
if (localInvocationIndex < 64)
{
SpdStoreIntermediateH(x + 0, y + 0, v[0]);
SpdStoreIntermediateH(x + 8, y + 0, v[1]);
SpdStoreIntermediateH(x + 0, y + 8, v[2]);
SpdStoreIntermediateH(x + 8, y + 8, v[3]);
}
}
void SpdDownsampleMips_0_1H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
SpdDownsampleMips_0_1_LDSH(x, y, workGroupID, localInvocationIndex, mips, slice);
#else
SpdDownsampleMips_0_1_IntrinsicsH(x, y, workGroupID, localInvocationIndex, mips, slice);
#endif
}
void SpdDownsampleMip_2H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
if (localInvocationIndex < 64)
{
FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
SpdStoreH(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x, y), v, mip, slice);
// store to LDS, try to reduce bank conflicts
// x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
// ...
// x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
SpdStoreIntermediateH(x * 2 + y % 2, y * 2, v);
}
#else
FfxFloat16x4 v = SpdLoadIntermediateH(x, y);
v = SpdReduceQuadH(v);
// quad index 0 stores result
if (localInvocationIndex % 4 == 0)
{
SpdStoreH(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
SpdStoreIntermediateH(x + (y / 2) % 2, y, v);
}
#endif
}
void SpdDownsampleMip_3H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
if (localInvocationIndex < 16)
{
// x 0 x 0
// 0 0 0 0
// 0 x 0 x
// 0 0 0 0
FfxFloat16x4 v =
SpdReduceIntermediateH(FfxUInt32x2(x * 4 + 0 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 2 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 0 + 1, y * 4 + 2), FfxUInt32x2(x * 4 + 2 + 1, y * 4 + 2));
SpdStoreH(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x, y), v, mip, slice);
// store to LDS
// x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0
// ...
// 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0
// ...
// 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x
// ...
SpdStoreIntermediateH(x * 4 + y, y * 4, v);
}
#else
if (localInvocationIndex < 64)
{
FfxFloat16x4 v = SpdLoadIntermediateH(x * 2 + y % 2, y * 2);
v = SpdReduceQuadH(v);
// quad index 0 stores result
if (localInvocationIndex % 4 == 0)
{
SpdStoreH(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
SpdStoreIntermediateH(x * 2 + y / 2, y * 2, v);
}
}
#endif
}
void SpdDownsampleMip_4H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
if (localInvocationIndex < 4)
{
// x 0 0 0 x 0 0 0
// ...
// 0 x 0 0 0 x 0 0
FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(x * 8 + 0 + 0 + y * 2, y * 8 + 0),
FfxUInt32x2(x * 8 + 4 + 0 + y * 2, y * 8 + 0),
FfxUInt32x2(x * 8 + 0 + 1 + y * 2, y * 8 + 4),
FfxUInt32x2(x * 8 + 4 + 1 + y * 2, y * 8 + 4));
SpdStoreH(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x, y), v, mip, slice);
// store to LDS
// x x x x 0 ...
// 0 ...
SpdStoreIntermediateH(x + y * 2, 0, v);
}
#else
if (localInvocationIndex < 16)
{
FfxFloat16x4 v = SpdLoadIntermediateH(x * 4 + y, y * 4);
v = SpdReduceQuadH(v);
// quad index 0 stores result
if (localInvocationIndex % 4 == 0)
{
SpdStoreH(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
SpdStoreIntermediateH(x / 2 + y, 0, v);
}
}
#endif
}
void SpdDownsampleMip_5H(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
if (localInvocationIndex < 1)
{
// x x x x 0 ...
// 0 ...
FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(0, 0), FfxUInt32x2(1, 0), FfxUInt32x2(2, 0), FfxUInt32x2(3, 0));
SpdStoreH(FfxInt32x2(workGroupID.xy), v, mip, slice);
}
#else
if (localInvocationIndex < 4)
{
FfxFloat16x4 v = SpdLoadIntermediateH(localInvocationIndex, 0);
v = SpdReduceQuadH(v);
// quad index 0 stores result
if (localInvocationIndex % 4 == 0)
{
SpdStoreH(FfxInt32x2(workGroupID.xy), v, mip, slice);
}
}
#endif
}
void SpdDownsampleMips_6_7H(FfxUInt32 x, FfxUInt32 y, FfxUInt32 mips, FfxUInt32 slice)
{
FfxInt32x2 tex = FfxInt32x2(x * 4 + 0, y * 4 + 0);
FfxInt32x2 pix = FfxInt32x2(x * 2 + 0, y * 2 + 0);
FfxFloat16x4 v0 = SpdReduceLoad4H(tex, slice);
SpdStoreH(pix, v0, 6, slice);
tex = FfxInt32x2(x * 4 + 2, y * 4 + 0);
pix = FfxInt32x2(x * 2 + 1, y * 2 + 0);
FfxFloat16x4 v1 = SpdReduceLoad4H(tex, slice);
SpdStoreH(pix, v1, 6, slice);
tex = FfxInt32x2(x * 4 + 0, y * 4 + 2);
pix = FfxInt32x2(x * 2 + 0, y * 2 + 1);
FfxFloat16x4 v2 = SpdReduceLoad4H(tex, slice);
SpdStoreH(pix, v2, 6, slice);
tex = FfxInt32x2(x * 4 + 2, y * 4 + 2);
pix = FfxInt32x2(x * 2 + 1, y * 2 + 1);
FfxFloat16x4 v3 = SpdReduceLoad4H(tex, slice);
SpdStoreH(pix, v3, 6, slice);
if (mips < 8)
return;
// no barrier needed, working on values only from the same thread
FfxFloat16x4 v = SpdReduce4H(v0, v1, v2, v3);
SpdStoreH(FfxInt32x2(x, y), v, 7, slice);
SpdStoreIntermediateH(x, y, v);
}
void SpdDownsampleNextFourH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 baseMip, FfxUInt32 mips, FfxUInt32 slice)
{
if (mips <= baseMip)
return;
SpdWorkgroupShuffleBarrier();
SpdDownsampleMip_2H(x, y, workGroupID, localInvocationIndex, baseMip, slice);
if (mips <= baseMip + 1)
return;
SpdWorkgroupShuffleBarrier();
SpdDownsampleMip_3H(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice);
if (mips <= baseMip + 2)
return;
SpdWorkgroupShuffleBarrier();
SpdDownsampleMip_4H(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice);
if (mips <= baseMip + 3)
return;
SpdWorkgroupShuffleBarrier();
SpdDownsampleMip_5H(workGroupID, localInvocationIndex, baseMip + 3, slice);
}
void SpdDownsampleH(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice)
{
FfxUInt32x2 sub_xy = ffxRemapForWaveReduction(localInvocationIndex % 64);
FfxUInt32 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2);
FfxUInt32 y = sub_xy.y + 8 * ((localInvocationIndex >> 7));
SpdDownsampleMips_0_1H(x, y, workGroupID, localInvocationIndex, mips, slice);
SpdDownsampleNextFourH(x, y, workGroupID, localInvocationIndex, 2, mips, slice);
if (mips < 7)
return;
if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice))
return;
SpdResetAtomicCounter(slice);
// After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels.
SpdDownsampleMips_6_7H(x, y, mips, slice);
SpdDownsampleNextFourH(x, y, FfxUInt32x2(0, 0), localInvocationIndex, 8, mips, slice);
}
void SpdDownsampleH(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice, FfxUInt32x2 workGroupOffset)
{
SpdDownsampleH(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice);
}
#endif // #if FFX_HALF
#endif // #ifdef FFX_GPU