From cd4c298504dfb4ea293409dc7761daa1c14b9c8f Mon Sep 17 00:00:00 2001
From: Jonas Gilg
Date: Sun, 2 Feb 2025 18:05:03 +0100
Subject: [PATCH] :wrench: Beautify shader code.

---
 src/cs-graphics/LuminanceMipMap.cpp | 88 ++++++++++++++---------------
 1 file changed, 43 insertions(+), 45 deletions(-)

diff --git a/src/cs-graphics/LuminanceMipMap.cpp b/src/cs-graphics/LuminanceMipMap.cpp
index 4f9ad5882..48847e880 100644
--- a/src/cs-graphics/LuminanceMipMap.cpp
+++ b/src/cs-graphics/LuminanceMipMap.cpp
@@ -31,8 +31,8 @@ static const char* sComputeAverage = R"(
 
   layout (rg32f, binding = 1) uniform image1D uOutLuminance;
 
-  // Returns the luminance for the pixel in a 2D vector.
-  vec2 sampleHDRBuffer(ivec2 pos) {
+  // Returns the luminance for the pixel.
+  float sampleHDRBuffer(ivec2 pos) {
 #if NUM_MULTISAMPLES > 0
     vec3 color = vec3(0.0);
     for (int i = 0; i < NUM_MULTISAMPLES; ++i) {
@@ -42,8 +42,7 @@ static const char* sComputeAverage = R"(
 #else
     vec3 color = imageLoad(uInHDRBuffer, pos).rgb;
 #endif
-    float val = max(max(color.r, color.g), color.b);
-    return vec2(val);
+    return max(max(color.r, color.g), color.b);
   }
 
   ivec2 indexToPos(uint index, int width) {
@@ -56,7 +55,7 @@ static const char* sComputeAverage = R"(
   layout (local_size_x = WORKGROUP_SIZE) in;
 
   // Each subgroup gets a space in shared memory.
-  shared float sTotal[SUBGROUP_SIZE];
+  shared float sSum[SUBGROUP_SIZE];
   shared float sMax[SUBGROUP_SIZE];
 
   // This shader makes great use of subgroup optimization. We will have subgroupSize squared
@@ -67,27 +66,27 @@ static const char* sComputeAverage = R"(
   // single value. This value will be written to the output buffer.
   void main() {
     ivec2 bufferSize = imageSize(uInHDRBuffer);
-    int maxSize = bufferSize.x * bufferSize.y;
+    int   maxSize    = bufferSize.x * bufferSize.y;
 
     // 1. Step
     // Each thread grabs two values from the HDR buffer. We need to be careful with the indexing
    // here, so that neighboring threads access neighboring indices in both calls.
-    uint i = gl_WorkGroupID.x * gl_WorkGroupSize.x * 2 + gl_LocalInvocationID.x;
-    vec2 left = i < maxSize ? sampleHDRBuffer(indexToPos(i, bufferSize.x)) : vec2(0);
+    uint  i    = gl_WorkGroupID.x * gl_WorkGroupSize.x * 2 + gl_LocalInvocationID.x;
+    float left = i < maxSize ? sampleHDRBuffer(indexToPos(i, bufferSize.x)) : 0;
 
-    uint j = i + gl_WorkGroupSize.x;
-    vec2 right = j < maxSize ? sampleHDRBuffer(indexToPos(j, bufferSize.x)) : vec2(0);
+    uint  j     = i + gl_WorkGroupSize.x;
+    float right = j < maxSize ? sampleHDRBuffer(indexToPos(j, bufferSize.x)) : 0;
 
     // 2. Step
     // All threads in a subgroup will sum their values up and calculate their max.
-    float initialSum = subgroupAdd(left.x + right.x);
-    float initialMax = subgroupMax(max(left.y, right.y));
+    float initialSum = subgroupAdd(left + right);
+    float initialMax = subgroupMax(max(left, right));
     // The subgroup max and sum are combined and written to this subgroup's shared memory
     // address.
     if (subgroupElect()) {
-      sTotal[gl_SubgroupID] = initialSum;
-      sMax[gl_SubgroupID]   = initialMax;
+      sSum[gl_SubgroupID] = initialSum;
+      sMax[gl_SubgroupID] = initialMax;
     }
     // Wait for all threads in the work group to finish.
     memoryBarrierShared();
     barrier();
@@ -98,10 +97,10 @@ static const char* sComputeAverage = R"(
     // The lowest-indexed subgroup grabs the remaining values from shared memory and reduces them.
     // The result is written to the output buffer at the location of this work group's ID.
     if (gl_SubgroupID == 0) {
-      float sum = subgroupAdd(sTotal[gl_SubgroupInvocationID]);
+      float sum = subgroupAdd(sSum[gl_SubgroupInvocationID]);
       float max = subgroupMax(sMax[gl_SubgroupInvocationID]);
       if (subgroupElect()) {
-        imageStore(uOutLuminance, int(gl_WorkGroupID.x), vec4(sum, sum, 0.0, 0.0));
+        imageStore(uOutLuminance, int(gl_WorkGroupID.x), vec4(sum, max, 0.0, 0.0));
       }
     }
   }
@@ -109,30 +108,29 @@ static const char* sComputeAverage = R"(
 #else
   // We don't have support for subgroups, so we do a standard parallel reduction!
   layout (local_size_x = 1024) in;
-  shared float sTotal[1024];
+  shared float sSum[1024];
   shared float sMax[1024];
 
   // This shader does a standard parallel reduction based on a CUDA webinar:
   // https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
   void main() {
-    uint tid = gl_LocalInvocationID.x;
-    uint gid = gl_GlobalInvocationID.x;
+    uint  tid        = gl_LocalInvocationID.x;
     ivec2 bufferSize = imageSize(uInHDRBuffer);
-    int maxSize = bufferSize.x * bufferSize.y;
+    int   maxSize    = bufferSize.x * bufferSize.y;
 
     // 1. Step
     // We have half as many threads as there are pixels on screen.
     // Each thread grabs two values from the HDR buffer.
-    uint i = gl_WorkGroupID.x * gl_WorkGroupSize.x * 2 + tid;
-    vec2 left = i < maxSize ? sampleHDRBuffer(indexToPos(i, bufferSize.x)) : vec2(0);
+    uint  i    = gl_WorkGroupID.x * gl_WorkGroupSize.x * 2 + tid;
+    float left = i < maxSize ? sampleHDRBuffer(indexToPos(i, bufferSize.x)) : 0;
 
-    uint j = i + gl_WorkGroupSize.x;
-    vec2 right = j < maxSize ? sampleHDRBuffer(indexToPos(j, bufferSize.x)) : vec2(0);
+    uint  j     = i + gl_WorkGroupSize.x;
+    float right = j < maxSize ? sampleHDRBuffer(indexToPos(j, bufferSize.x)) : 0;
 
     // The two values are combined and written to this thread's shared memory address.
-    sTotal[tid] = left.x + right.x;
-    sMax[tid]   = max(left.y, right.y);
+    sSum[tid] = left + right;
+    sMax[tid] = max(left, right);
 
     // Wait for all threads in the work group to finish.
     memoryBarrierShared();
     barrier();
@@ -144,24 +142,24 @@ static const char* sComputeAverage = R"(
     // We repeat this step until one warp is left.
     // We could do this in a loop, but manual unrolling is faster (I profiled this!).
     if (tid < 256) {
-      sTotal[tid] += sTotal[tid + 256];
-      sMax[tid]    = max(sMax[tid], sMax[tid + 256]);
+      sSum[tid] += sSum[tid + 256];
+      sMax[tid]  = max(sMax[tid], sMax[tid + 256]);
     }
 
     memoryBarrierShared();
     barrier();
 
     if (tid < 128) {
-      sTotal[tid] += sTotal[tid + 128];
-      sMax[tid]    = max(sMax[tid], sMax[tid + 128]);
+      sSum[tid] += sSum[tid + 128];
+      sMax[tid]  = max(sMax[tid], sMax[tid + 128]);
     }
 
     memoryBarrierShared();
     barrier();
 
     if (tid < 64) {
-      sTotal[tid] += sTotal[tid + 64];
-      sMax[tid]    = max(sMax[tid], sMax[tid + 64]);
+      sSum[tid] += sSum[tid + 64];
+      sMax[tid]  = max(sMax[tid], sMax[tid + 64]);
     }
 
     memoryBarrierShared();
@@ -171,37 +169,37 @@ static const char* sComputeAverage = R"(
     // We don't need to check the thread ID for the last warp, since it is more efficient to
     // let all of its threads do the same work.
     if (tid < 32) {
-      sTotal[tid] += sTotal[tid + 32];
-      sMax[tid]    = max(sMax[tid], sMax[tid + 32]);
+      sSum[tid] += sSum[tid + 32];
+      sMax[tid]  = max(sMax[tid], sMax[tid + 32]);
       memoryBarrierShared();
       barrier();
 
-      sTotal[tid] += sTotal[tid + 16];
-      sMax[tid]    = max(sMax[tid], sMax[tid + 16]);
+      sSum[tid] += sSum[tid + 16];
+      sMax[tid]  = max(sMax[tid], sMax[tid + 16]);
       memoryBarrierShared();
       barrier();
 
-      sTotal[tid] += sTotal[tid + 8];
-      sMax[tid]    = max(sMax[tid], sMax[tid + 8]);
+      sSum[tid] += sSum[tid + 8];
+      sMax[tid]  = max(sMax[tid], sMax[tid + 8]);
       memoryBarrierShared();
       barrier();
 
-      sTotal[tid] += sTotal[tid + 4];
-      sMax[tid]    = max(sMax[tid], sMax[tid + 4]);
+      sSum[tid] += sSum[tid + 4];
+      sMax[tid]  = max(sMax[tid], sMax[tid + 4]);
       memoryBarrierShared();
       barrier();
 
-      sTotal[tid] += sTotal[tid + 2];
-      sMax[tid]    = max(sMax[tid], sMax[tid + 2]);
+      sSum[tid] += sSum[tid + 2];
+      sMax[tid]  = max(sMax[tid], sMax[tid + 2]);
       memoryBarrierShared();
       barrier();
 
-      float total = sTotal[tid] + sTotal[tid + 1];
-      float max   = max(sMax[tid], sMax[tid + 1]);
+      float sum = sSum[tid] + sSum[tid + 1];
+      float max = max(sMax[tid], sMax[tid + 1]);
 
       // The first thread in each work group writes the final value to the output.
       if (tid == 0) {
-        imageStore(uOutLuminance, int(gl_WorkGroupID.x), vec4(total, max, 0.0, 0.0));
+        imageStore(uOutLuminance, int(gl_WorkGroupID.x), vec4(sum, max, 0.0, 0.0));
       }
     }
   }
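
For readers who have not used the GL_KHR_shader_subgroup intrinsics that the fast path
above relies on, here is a minimal standalone sketch of the same two-step reduction:
subgroupAdd() and subgroupMax() collapse each subgroup in registers, subgroupElect()
picks one invocation to publish the partial result, and the first subgroup then folds
the per-subgroup partials. The buffer names, bindings, and the hard-coded
WORKGROUP_SIZE/SUBGROUP_SIZE values are illustrative assumptions only; the patched
shader has these injected by the host application instead.

  #version 460
  #extension GL_KHR_shader_subgroup_basic      : require
  #extension GL_KHR_shader_subgroup_arithmetic : require

  // Illustrative sizes; the shader above assumes WORKGROUP_SIZE == SUBGROUP_SIZE^2.
  #define WORKGROUP_SIZE 1024
  #define SUBGROUP_SIZE  32

  layout (local_size_x = WORKGROUP_SIZE) in;

  // Hypothetical bindings for this sketch only.
  layout (std430, binding = 0) readonly  buffer InValues   { float uValues[];  };
  layout (std430, binding = 1) writeonly buffer OutResults { vec2  uResults[]; };

  // One slot per subgroup, as in the patch.
  shared float sSum[SUBGROUP_SIZE];
  shared float sMax[SUBGROUP_SIZE];

  void main() {
    // Assumes the dispatch exactly covers uValues; a real shader would bounds-check.
    float value = uValues[gl_GlobalInvocationID.x];

    // Step 1: reduce within each subgroup entirely in registers.
    float partialSum = subgroupAdd(value);
    float partialMax = subgroupMax(value);

    // One invocation per subgroup publishes its partial result.
    if (subgroupElect()) {
      sSum[gl_SubgroupID] = partialSum;
      sMax[gl_SubgroupID] = partialMax;
    }

    memoryBarrierShared();
    barrier();

    // Step 2: the first subgroup folds the per-subgroup partials into one value.
    // The bounds check covers the case gl_NumSubgroups < SUBGROUP_SIZE; 0.0 is a
    // safe identity here because luminance is never negative.
    if (gl_SubgroupID == 0) {
      bool  valid = gl_SubgroupInvocationID < gl_NumSubgroups;
      float sum   = subgroupAdd(valid ? sSum[gl_SubgroupInvocationID] : 0.0);
      float mx    = subgroupMax(valid ? sMax[gl_SubgroupInvocationID] : 0.0);

      if (subgroupElect()) {
        uResults[gl_WorkGroupID.x] = vec2(sum, mx);
      }
    }
  }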
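
One detail in the subgroup hunk deserves a callout: replacing vec4(sum, sum, 0.0, 0.0)
with vec4(sum, max, 0.0, 0.0) is a genuine fix rather than beautification, since the
old code wrote the sum into both channels of the rg32f output and silently dropped the
maximum. A hypothetical follow-up pass makes the channel layout concrete; uInResults,
uOutResult, and their bindings are assumptions, and the only fact taken from the patch
is that .r carries a work group's summed luminance and .g its maximum.

  #version 430

  layout (local_size_x = 1) in;

  layout (rg32f, binding = 0) uniform image1D uInResults;
  layout (rg32f, binding = 1) uniform image1D uOutResult;

  void main() {
    int   count  = imageSize(uInResults);
    float sum    = 0.0;
    float maxVal = 0.0;

    // A single invocation walks the few hundred per-work-group entries.
    for (int i = 0; i < count; ++i) {
      vec2 value = imageLoad(uInResults, i).rg;
      sum    += value.r;              // .r: summed luminance of the work group
      maxVal  = max(maxVal, value.g); // .g: maximum luminance of the work group
    }

    // The average luminance is then sum divided by the total pixel count.
    imageStore(uOutResult, 0, vec4(sum, maxVal, 0.0, 0.0));
  }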