From 95fd45d9a5d0a0efe074bc40ff854d9d008ed7d9 Mon Sep 17 00:00:00 2001 From: BlueCube3310 <53150244+BlueCube3310@users.noreply.github.com> Date: Sat, 21 Sep 2024 22:12:28 +0200 Subject: [PATCH] Betsy: Implement BC4 compression --- core/io/image.cpp | 2 +- doc/classes/ProjectSettings.xml | 4 +- modules/betsy/SCsub | 1 + modules/betsy/bc4.glsl | 151 +++++++++++++++++++++++++ modules/betsy/image_compress_betsy.cpp | 74 ++++++++---- modules/betsy/image_compress_betsy.h | 7 ++ thirdparty/README.md | 2 +- 7 files changed, 218 insertions(+), 23 deletions(-) create mode 100644 modules/betsy/bc4.glsl diff --git a/core/io/image.cpp b/core/io/image.cpp index aa391b77dd4..d782af931f0 100644 --- a/core/io/image.cpp +++ b/core/io/image.cpp @@ -2751,7 +2751,7 @@ Error Image::compress_from_channels(CompressMode p_mode, UsedChannels p_channels case COMPRESS_S3TC: { // BC3 is unsupported currently. - if ((p_channels == USED_CHANNELS_RGB || p_channels == USED_CHANNELS_L) && _image_compress_bc_rd_func) { + if ((p_channels == USED_CHANNELS_R || p_channels == USED_CHANNELS_RGB || p_channels == USED_CHANNELS_L) && _image_compress_bc_rd_func) { Error result = _image_compress_bc_rd_func(this, p_channels); // If the image was compressed successfully, we return here. If not, we fall back to the default compression scheme. diff --git a/doc/classes/ProjectSettings.xml b/doc/classes/ProjectSettings.xml index 758e98ad857..65863c79cc1 100644 --- a/doc/classes/ProjectSettings.xml +++ b/doc/classes/ProjectSettings.xml @@ -2903,8 +2903,8 @@ If [code]true[/code], the texture importer will utilize the GPU for compressing textures, improving the import time of large images. - [b]Note:[/b] This setting requires either Vulkan or D3D12 available as a rendering backend. - [b]Note:[/b] Currently this only affects BC1 and BC6H compression, which are used on Desktop and Console for fully opaque and HDR images respectively. 
+ [b]Note:[/b] This setting only works on devices that support Vulkan, D3D12, or Metal as a rendering backend. + [b]Note:[/b] Currently this only affects certain compressed formats (BC1, BC4, and BC6H), all of which are exclusive to desktop platforms and consoles. If [code]true[/code], the texture importer will import VRAM-compressed textures using the Ericsson Texture Compression 2 algorithm for lower quality textures and normal maps and Adaptable Scalable Texture Compression algorithm for high quality textures (in 4×4 block size). diff --git a/modules/betsy/SCsub b/modules/betsy/SCsub index ed5dcbf58b7..bd800526b82 100644 --- a/modules/betsy/SCsub +++ b/modules/betsy/SCsub @@ -5,6 +5,7 @@ Import("env_modules") env_betsy = env_modules.Clone() env_betsy.GLSL_HEADER("bc6h.glsl") env_betsy.GLSL_HEADER("bc1.glsl") +env_betsy.GLSL_HEADER("bc4.glsl") env_betsy.Depends(Glob("*.glsl.gen.h"), ["#glsl_builders.py"]) # Thirdparty source files diff --git a/modules/betsy/bc4.glsl b/modules/betsy/bc4.glsl new file mode 100644 index 00000000000..b7a5f6a6867 --- /dev/null +++ b/modules/betsy/bc4.glsl @@ -0,0 +1,151 @@ +#[versions] + +unsigned = ""; +signed = "#define SNORM"; + +#[compute] +#version 450 + +#include "CrossPlatformSettings_piece_all.glsl" +#include "UavCrossPlatform_piece_all.glsl" + +#VERSION_DEFINES + +shared float2 g_minMaxValues[4u * 4u * 4u]; +shared uint2 g_mask[4u * 4u]; + +layout(binding = 0) uniform sampler2D srcTex; +layout(binding = 1, rg32ui) uniform restrict writeonly uimage2D dstTexture; + +layout(push_constant, std430) uniform Params { + uint p_channelIdx; + uint p_padding[3]; +} +params; + +layout(local_size_x = 4, // + local_size_y = 4, // + local_size_z = 4) in; + +/// Each block is 16 pixels +/// Each thread works on 4 pixels +/// Therefore each block needs 4 threads, generating 8 masks +/// At the end these 8 masks get merged into 2 and results written to output +/// +/// **Q: Why 4 pixels per thread? Why not 1 pixel per thread? 
Why not 2? Why not 16?** +/// +/// A: It's a sweet spot. +/// - Very short threads cannot fill expensive GPUs with enough work (dispatch bound) +/// - Lots of threads means lots of synchronization (e.g. evaluating min/max, merging masks) +/// overhead, and also more LDS usage which reduces occupancy. +/// - Long threads (e.g. 1 thread per block) miss parallelism opportunities +void main() { + float minVal, maxVal; + float4 srcPixel; + + const uint blockThreadId = gl_LocalInvocationID.x; + + const uint2 pixelsToLoadBase = gl_GlobalInvocationID.yz << 2u; + + for (uint i = 0u; i < 4u; ++i) { + const uint2 pixelsToLoad = pixelsToLoadBase + uint2(i, blockThreadId); + + const float4 value = OGRE_Load2D(srcTex, int2(pixelsToLoad), 0).xyzw; + srcPixel[i] = params.p_channelIdx == 0 ? value.x : (params.p_channelIdx == 1 ? value.y : value.w); + srcPixel[i] *= 255.0f; + } + + minVal = min3(srcPixel.x, srcPixel.y, srcPixel.z); + maxVal = max3(srcPixel.x, srcPixel.y, srcPixel.z); + minVal = min(minVal, srcPixel.w); + maxVal = max(maxVal, srcPixel.w); + + const uint minMaxIdxBase = (gl_LocalInvocationID.z << 4u) + (gl_LocalInvocationID.y << 2u); + const uint maskIdxBase = (gl_LocalInvocationID.z << 2u) + gl_LocalInvocationID.y; + + g_minMaxValues[minMaxIdxBase + blockThreadId] = float2(minVal, maxVal); + g_mask[maskIdxBase] = uint2(0u, 0u); + + memoryBarrierShared(); + barrier(); + + // Have all 4 threads in the block grab the min/max value by comparing what all 4 threads uploaded + for (uint i = 0u; i < 4u; ++i) { + minVal = min(g_minMaxValues[minMaxIdxBase + i].x, minVal); + maxVal = max(g_minMaxValues[minMaxIdxBase + i].y, maxVal); + } + + // determine bias and emit color indices + // given the choice of maxVal/minVal, these indices are optimal: + // http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/ + float dist = maxVal - minVal; + float dist4 = dist * 4.0f; + float dist2 = dist * 2.0f; + float bias = (dist < 8) ? 
(dist - 1) : (trunc(dist * 0.5f) + 2); + bias -= minVal * 7; + + uint mask0 = 0u, mask1 = 0u; + + for (uint i = 0u; i < 4u; ++i) { + float a = srcPixel[i] * 7.0f + bias; + + int ind = 0; + + // select index. this is a "linear scale" lerp factor between 0 (val=min) and 7 (val=max). + if (a >= dist4) { + ind = 4; + a -= dist4; + } + + if (a >= dist2) { + ind += 2; + a -= dist2; + } + + if (a >= dist) + ind += 1; + + // turn linear scale into DXT index (0/1 are extremal pts) + ind = -ind & 7; + ind ^= (2 > ind) ? 1 : 0; + + // write index + const uint bits = 16u + ((blockThreadId << 2u) + i) * 3u; + if (bits < 32u) { + mask0 |= uint(ind) << bits; + if (bits + 3u > 32u) { + mask1 |= uint(ind) >> (32u - bits); + } + } else { + mask1 |= uint(ind) << (bits - 32u); + } + } + + if (mask0 != 0u) + atomicOr(g_mask[maskIdxBase].x, mask0); + if (mask1 != 0u) + atomicOr(g_mask[maskIdxBase].y, mask1); + + memoryBarrierShared(); + barrier(); + + if (blockThreadId == 0u) { + // Save data + uint2 outputBytes; + +#ifdef SNORM + outputBytes.x = + packSnorm4x8(float4(maxVal * (1.0f / 255.0f) * 2.0f - 1.0f, + minVal * (1.0f / 255.0f) * 2.0f - 1.0f, 0.0f, 0.0f)); +#else + outputBytes.x = packUnorm4x8( + float4(maxVal * (1.0f / 255.0f), minVal * (1.0f / 255.0f), 0.0f, 0.0f)); +#endif + + outputBytes.x |= g_mask[maskIdxBase].x; + outputBytes.y = g_mask[maskIdxBase].y; + + uint2 dstUV = gl_GlobalInvocationID.yz; + imageStore(dstTexture, int2(dstUV), uint4(outputBytes.xy, 0u, 0u)); + } +} diff --git a/modules/betsy/image_compress_betsy.cpp b/modules/betsy/image_compress_betsy.cpp index 7b4d8b3dfb4..6bfe01f65cc 100644 --- a/modules/betsy/image_compress_betsy.cpp +++ b/modules/betsy/image_compress_betsy.cpp @@ -35,6 +35,7 @@ #include "betsy_bc1.h" #include "bc1.glsl.gen.h" +#include "bc4.glsl.gen.h" #include "bc6h.glsl.gen.h" static Mutex betsy_mutex; @@ -165,6 +166,10 @@ static String get_shader_name(BetsyFormat p_format) { case BETSY_FORMAT_BC3: return "BC3"; + case BETSY_FORMAT_BC4_SIGNED: 
+ case BETSY_FORMAT_BC4_UNSIGNED: + return "BC4"; + case BETSY_FORMAT_BC6_SIGNED: case BETSY_FORMAT_BC6_UNSIGNED: return "BC6"; @@ -202,6 +207,12 @@ Error BetsyCompressor::_compress(BetsyFormat p_format, Image *r_img) { dest_format = Image::FORMAT_DXT1; break; + case BETSY_FORMAT_BC4_UNSIGNED: + version = "unsigned"; + dst_rd_format = RD::DATA_FORMAT_R32G32_UINT; + dest_format = Image::FORMAT_RGTC_R; + break; + case BETSY_FORMAT_BC6_SIGNED: version = "signed"; dst_rd_format = RD::DATA_FORMAT_R32G32B32A32_UINT; @@ -235,8 +246,13 @@ Error BetsyCompressor::_compress(BetsyFormat p_format, Image *r_img) { err = source->parse_versions_from_text(bc1_shader_glsl); break; - case BETSY_FORMAT_BC6_UNSIGNED: + case BETSY_FORMAT_BC4_SIGNED: + case BETSY_FORMAT_BC4_UNSIGNED: + err = source->parse_versions_from_text(bc4_shader_glsl); + break; + case BETSY_FORMAT_BC6_SIGNED: + case BETSY_FORMAT_BC6_UNSIGNED: err = source->parse_versions_from_text(bc6h_shader_glsl); break; @@ -430,26 +446,45 @@ Error BetsyCompressor::_compress(BetsyFormat p_format, Image *r_img) { compress_rd->compute_list_bind_compute_pipeline(compute_list, shader.pipeline); compress_rd->compute_list_bind_uniform_set(compute_list, uniform_set, 0); - if (dest_format == Image::FORMAT_BPTC_RGBFU || dest_format == Image::FORMAT_BPTC_RGBF) { - BC6PushConstant push_constant; - push_constant.sizeX = 1.0f / width; - push_constant.sizeY = 1.0f / height; - push_constant.padding[0] = 0; - push_constant.padding[1] = 0; + switch (dest_format) { + case Image::FORMAT_BPTC_RGBFU: + case Image::FORMAT_BPTC_RGBF: { + BC6PushConstant push_constant; + push_constant.sizeX = 1.0f / width; + push_constant.sizeY = 1.0f / height; + push_constant.padding[0] = 0; + push_constant.padding[1] = 0; - compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC6PushConstant)); + compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC6PushConstant)); + 
compress_rd->compute_list_dispatch(compute_list, get_next_multiple(width, 32) / 32, get_next_multiple(height, 32) / 32, 1); + } break; - } else { - BC1PushConstant push_constant; - push_constant.num_refines = 2; - push_constant.padding[0] = 0; - push_constant.padding[1] = 0; - push_constant.padding[2] = 0; + case Image::FORMAT_DXT1: { + BC1PushConstant push_constant; + push_constant.num_refines = 2; + push_constant.padding[0] = 0; + push_constant.padding[1] = 0; + push_constant.padding[2] = 0; - compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC1PushConstant)); + compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC1PushConstant)); + compress_rd->compute_list_dispatch(compute_list, get_next_multiple(width, 32) / 32, get_next_multiple(height, 32) / 32, 1); + } break; + + case Image::FORMAT_RGTC_R: { + BC4PushConstant push_constant; + push_constant.channel_idx = 0; + push_constant.padding[0] = 0; + push_constant.padding[1] = 0; + push_constant.padding[2] = 0; + + compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC4PushConstant)); + compress_rd->compute_list_dispatch(compute_list, 1, get_next_multiple(width, 16) / 16, get_next_multiple(height, 16) / 16); + } break; + + default: { + } break; } - compress_rd->compute_list_dispatch(compute_list, get_next_multiple(width, 32) / 32, get_next_multiple(height, 32) / 32, 1); compress_rd->compute_list_end(); compress_rd->submit(); @@ -511,13 +546,14 @@ Error _betsy_compress_s3tc(Image *r_img, Image::UsedChannels p_channels) { switch (p_channels) { case Image::USED_CHANNELS_RGB: - result = betsy->compress(BETSY_FORMAT_BC1_DITHER, r_img); - break; - case Image::USED_CHANNELS_L: result = betsy->compress(BETSY_FORMAT_BC1, r_img); break; + case Image::USED_CHANNELS_R: + result = betsy->compress(BETSY_FORMAT_BC4_UNSIGNED, r_img); + break; + default: break; } diff --git a/modules/betsy/image_compress_betsy.h 
b/modules/betsy/image_compress_betsy.h index 70e4ae85edb..4e0bf0538f1 100644 --- a/modules/betsy/image_compress_betsy.h +++ b/modules/betsy/image_compress_betsy.h @@ -50,6 +50,8 @@ enum BetsyFormat { BETSY_FORMAT_BC1, BETSY_FORMAT_BC1_DITHER, BETSY_FORMAT_BC3, + BETSY_FORMAT_BC4_SIGNED, + BETSY_FORMAT_BC4_UNSIGNED, BETSY_FORMAT_BC6_SIGNED, BETSY_FORMAT_BC6_UNSIGNED, }; @@ -65,6 +67,11 @@ struct BC1PushConstant { uint32_t padding[3]; }; +struct BC4PushConstant { + uint32_t channel_idx; + uint32_t padding[3]; +}; + void free_device(); Error _betsy_compress_bptc(Image *r_img, Image::UsedChannels p_channels); diff --git a/thirdparty/README.md b/thirdparty/README.md index a6686c539a6..6f1014cf9ba 100644 --- a/thirdparty/README.md +++ b/thirdparty/README.md @@ -78,7 +78,7 @@ fix build with our own copy of zstd (patch in `patches`). Files extracted from upstream source: -- `bc6h.glsl`, `bc1.glsl`, `CrossPlatformSettings_piece_all.glsl` and `UavCrossPlatform_piece_all.glsl`. +- `bc6h.glsl`, `bc1.glsl`, `bc4.glsl`, `CrossPlatformSettings_piece_all.glsl` and `UavCrossPlatform_piece_all.glsl`. - `LICENSE.md`