From 5d6322a3abfd16b25e9c400a2585879dcb65a222 Mon Sep 17 00:00:00 2001 From: CrabeExtra Date: Thu, 2 Oct 2025 12:00:31 +0330 Subject: [PATCH 01/30] created a structured like fft --- .../nbl/builtin/hlsl/bitonic_sort/common.hlsl | 19 ++++++++++++++ .../builtin/hlsl/subgroup/bitonic_sort.hlsl | 26 +++++++++++++++++++ .../builtin/hlsl/workgroup/bitonic_sort.hlsl | 23 ++++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 include/nbl/builtin/hlsl/bitonic_sort/common.hlsl create mode 100644 include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl create mode 100644 include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl diff --git a/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl b/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl new file mode 100644 index 0000000000..5f0c916816 --- /dev/null +++ b/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl @@ -0,0 +1,19 @@ +#ifndef _NBL_BUILTIN_HLSL_BITONIC_SORT_COMMON_INCLUDED_ +#define _NBL_BUILTIN_HLSL_BITONIC_SORT_COMMON_INCLUDED_ + +#include +#include +#include + +namespace nbl +{ + namespace hlsl + { + namespace bitonic_sort + { + + } + } +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl b/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl new file mode 100644 index 0000000000..5d61a27fd0 --- /dev/null +++ b/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl @@ -0,0 +1,26 @@ +#ifndef _NBL_BUILTIN_HLSL_SUBGROUP_BITONIC_SORT_INCLUDED_ +#define _NBL_BUILTIN_HLSL_SUBGROUP_BITONIC_SORT_INCLUDED_ + +#include "nbl/builtin/hlsl/bitonic_sort/common.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_shuffle.hlsl" + +namespace nbl +{ + namespace hlsl + { + namespace bitonic_sort + { + + // ----------------------------------------------------------------------------------------------------------------------------------------------------------------- + template + struct bitonic_sort + { + static void __call(NBL_REF_ARG(complex_t) lo, NBL_REF_ARG(complex_t) hi); + }; + + } + } +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl b/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl new file mode 100644 index 0000000000..17f1fa7a55 --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl @@ -0,0 +1,23 @@ +#include +#include +#include + +#ifndef _NBL_BUILTIN_HLSL_WORKGROUP_BITONIC_SORT_INCLUDED_ +#define _NBL_BUILTIN_HLSL_WORKGROUP_BITONIC_SORT_INCLUDED_ + +// ------------------------------- COMMON ----------------------------------------- + +namespace nbl +{ + namespace hlsl + { + namespace workgroup + { + namespace bitonic_sort + { + } + } + } +} + +#endif \ No newline at end of file From 264650c49604e47e8144e7b090ea13ba44ab6743 Mon Sep 17 00:00:00 2001 From: CrabeExtra Date: Thu, 2 Oct 2025 15:19:20 +0330 Subject: [PATCH 02/30] config question --- .../nbl/builtin/hlsl/bitonic_sort/common.hlsl | 46 +++++++++++++++++++ .../builtin/hlsl/subgroup/bitonic_sort.hlsl | 7 --- .../builtin/hlsl/workgroup/bitonic_sort.hlsl | 1 - 3 files changed, 46 insertions(+), 8 deletions(-) diff --git a/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl b/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl index 5f0c916816..516af22a14 100644 --- a/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl +++ b/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl @@ -11,6 +11,52 @@ namespace nbl { namespace bitonic_sort { + template 0 && _Log2ThreadsPerSubgroup >= 4) + struct ConstevalParameters + { + using scalar_t = _Scalar; + using key_t = _KeyType; + using value_t = _ValueType; + + struct ThreadConfig + { + NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerThread = uint16_t(1) << _Log2ElementsPerThread; + }; + + struct SubgroupConfig + { + using thread_config_t = ThreadConfig; + thread_config_t thread; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ThreadsPerSubgroup = uint16_t(1) << _Log2ThreadsPerSubgroup; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerSubgroup = thread.ElementsPerThread * ThreadsPerSubgroup; + NBL_CONSTEXPR_STATIC_INLINE uint16_t Log2ElementsPerSubgroup = thread.Log2ElementsPerThread + Log2ThreadsPerSubgroup; + }; + + struct WorkgroupConfig + { + using subgroup_config_t = SubgroupConfig; + subgroup_config_t subgroup; + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerWorkgroup = uint16_t(1) << _Log2SubgroupsPerWorkgroup; + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = subgroup.ThreadsPerSubgroup * SubgroupsPerWorkgroup; // threads per workgroup + NBL_CONSTEXPR_STATIC_INLINE uint32_t ElementsPerWorkgroup = subgroup.ElementsPerSubgroup * SubgroupsPerWorkgroup; + NBL_CONSTEXPR_STATIC_INLINE uint16_t Log2ElementsPerWorkgroup = subgroup.Log2ElementsPerSubgroup + Log2SubgroupsPerWorkgroup; + }; + + using thread_config_t = ThreadConfig; + using subgroup_config_t = SubgroupConfig; + using workgroup_config_t = WorkgroupConfig; + + workgroup_config_t workgroup; + + NBL_CONSTEXPR_STATIC_INLINE uint32_t TotalSize = uint32_t(1) << workgroup.Log2ElementsPerWorkgroup; + + NBL_CONSTEXPR_STATIC_INLINE uint32_t KeySize = sizeof(key_t); + NBL_CONSTEXPR_STATIC_INLINE uint32_t ValueSize = sizeof(value_t); + NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedMemoryBytes = TotalSize * (KeySize + ValueSize); + NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedMemoryDWORDs = SharedMemoryBytes / sizeof(uint32_t); + + }; } } diff --git a/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl b/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl index 5d61a27fd0..4535d0c9f1 100644 --- a/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl +++ b/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl @@ -12,13 +12,6 @@ namespace nbl namespace bitonic_sort { - // ----------------------------------------------------------------------------------------------------------------------------------------------------------------- - template - struct bitonic_sort - { - static void __call(NBL_REF_ARG(complex_t) lo, NBL_REF_ARG(complex_t) hi); - }; - } } } diff --git a/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl b/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl index 17f1fa7a55..6d76d5428a 100644 --- a/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl @@ -5,7 +5,6 @@ #ifndef _NBL_BUILTIN_HLSL_WORKGROUP_BITONIC_SORT_INCLUDED_ #define _NBL_BUILTIN_HLSL_WORKGROUP_BITONIC_SORT_INCLUDED_ -// ------------------------------- COMMON ----------------------------------------- namespace nbl { From 268949e40da8e41cc9fe7bcdea5f1b8bcd978271 Mon Sep 17 00:00:00 2001 From: CrabeExtra Date: Sun, 5 Oct 2025 14:55:42 +0330 Subject: [PATCH 03/30] subgroupsort --- .../builtin/hlsl/subgroup/bitonic_sort.hlsl | 120 ++++++++++++++++-- 1 file changed, 112 insertions(+), 8 deletions(-) diff --git a/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl b/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl index 4535d0c9f1..9d4e672255 100644 --- a/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl +++ b/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl @@ -1,19 +1,123 @@ -#ifndef _NBL_BUILTIN_HLSL_SUBGROUP_BITONIC_SORT_INCLUDED_ -#define _NBL_BUILTIN_HLSL_SUBGROUP_BITONIC_SORT_INCLUDED_ +#ifndef NBL_BUILTIN_HLSL_SUBGROUP_BITONIC_SORT_INCLUDED +#define NBL_BUILTIN_HLSL_SUBGROUP_BITONIC_SORT_INCLUDED #include "nbl/builtin/hlsl/bitonic_sort/common.hlsl" #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" #include "nbl/builtin/hlsl/glsl_compat/subgroup_shuffle.hlsl" +#include "nbl/builtin/hlsl/functional.hlsl" namespace nbl { - namespace hlsl - { - namespace bitonic_sort - { +namespace hlsl +{ +namespace subgroup +{ + +template> +struct bitonic_sort_config +{ + using key_t = KeyType; + using value_t = ValueType; + using comparator_t = Comparator; +}; + +template +struct bitonic_sort; + +template +struct bitonic_sort, device_capabilities> +{ + using config_t = bitonic_sort_config; + using key_t = typename config_t::key_t; + using value_t = typename config_t::value_t; + using comparator_t = typename config_t::comparator_t; + + struct KeyValuePair + { + key_t key; + value_t val; + }; + + static void compareAndSwap(bool ascending, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, + NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) + { + comparator_t comp; + const bool shouldSwap = ascending ? comp(hiKey, loKey) : comp(loKey, hiKey); + + if (shouldSwap) + { + // Swap keys + key_t tempKey = loKey; + loKey = hiKey; + hiKey = tempKey; + + // Swap values + value_t tempVal = loVal; + loVal = hiVal; + hiVal = tempVal; + } + } + + static void bitonicMergeStep(uint32_t stride, bool ascending, + NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, + NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) + { + const uint32_t invocationID = glsl::gl_SubgroupInvocationID(); + + const bool topHalf = bool(invocationID & stride); - } - } + KeyValuePair toTrade; + toTrade.key = topHalf ? loKey : hiKey; + toTrade.val = topHalf ? loVal : hiVal; + + KeyValuePair exchanged = glsl::subgroupShuffleXor(toTrade, stride); + + if (topHalf) + { + loKey = exchanged.key; + loVal = exchanged.val; + } + else + { + hiKey = exchanged.key; + hiVal = exchanged.val; + } + + compareAndSwap(ascending, loKey, hiKey, loVal, hiVal); + } + + static void __call(NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, + NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) + { + const uint32_t subgroupSize = glsl::gl_SubgroupSize(); + const uint32_t invocationID = glsl::gl_SubgroupInvocationID(); + + compareAndSwap(Ascending, loKey, hiKey, loVal, hiVal); + + [unroll] + for (uint32_t k = 2; k <= subgroupSize; k <<= 1) + { + const bool sequenceAscending = ((invocationID & (k >> 1)) == 0); + + const bool dir = Ascending ? sequenceAscending : !sequenceAscending; + + [unroll] + for (uint32_t stride = k >> 1; stride > 0; stride >>= 1) + { + bitonicMergeStep(stride, dir, loKey, hiKey, loVal, hiVal); + } + } + + [unroll] + for (uint32_t stride = subgroupSize; stride > 0; stride >>= 1) + { + bitonicMergeStep(stride, Ascending, loKey, hiKey, loVal, hiVal); + } + } +}; + +} +} } #endif \ No newline at end of file From 73aa820afcd5ee170174d979144fe38d2b7c58a6 Mon Sep 17 00:00:00 2001 From: CrabeExtra Date: Sun, 5 Oct 2025 14:59:21 +0330 Subject: [PATCH 04/30] added bitonic_sort name space --- include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl b/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl index 9d4e672255..246c59efd7 100644 --- a/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl +++ b/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl @@ -12,7 +12,8 @@ namespace hlsl { namespace subgroup { - +namespace bitonic_sort +{ template> struct bitonic_sort_config { From b84a4bd7f42c9ce0d427104005b1efbf266aa5e4 Mon Sep 17 00:00:00 2001 From: CrabeExtra Date: Tue, 7 Oct 2025 21:57:17 +0330 Subject: [PATCH 05/30] subgroup changes --- .../builtin/hlsl/subgroup/bitonic_sort.hlsl | 190 ++++++++---------- 1 file changed, 81 insertions(+), 109 deletions(-) diff --git a/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl b/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl index 246c59efd7..6613d4ee8a 100644 --- a/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl +++ b/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl @@ -1,124 +1,96 @@ #ifndef NBL_BUILTIN_HLSL_SUBGROUP_BITONIC_SORT_INCLUDED #define NBL_BUILTIN_HLSL_SUBGROUP_BITONIC_SORT_INCLUDED - #include "nbl/builtin/hlsl/bitonic_sort/common.hlsl" #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" #include "nbl/builtin/hlsl/glsl_compat/subgroup_shuffle.hlsl" #include "nbl/builtin/hlsl/functional.hlsl" - namespace nbl { -namespace hlsl -{ -namespace subgroup -{ -namespace bitonic_sort -{ -template> -struct bitonic_sort_config -{ - using key_t = KeyType; - using value_t = ValueType; - using comparator_t = Comparator; -}; - -template -struct bitonic_sort; - -template -struct bitonic_sort, device_capabilities> -{ - using config_t = bitonic_sort_config; - using key_t = typename config_t::key_t; - using value_t = typename config_t::value_t; - using comparator_t = typename config_t::comparator_t; - - struct KeyValuePair - { - key_t key; - value_t val; - }; - - static void compareAndSwap(bool ascending, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, - NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) + namespace hlsl { - comparator_t comp; - const bool shouldSwap = ascending ? comp(hiKey, loKey) : comp(loKey, hiKey); - - if (shouldSwap) + namespace subgroup { - // Swap keys - key_t tempKey = loKey; - loKey = hiKey; - hiKey = tempKey; - - // Swap values - value_t tempVal = loVal; - loVal = hiVal; - hiVal = tempVal; - } - } - - static void bitonicMergeStep(uint32_t stride, bool ascending, - NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, - NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) - { - const uint32_t invocationID = glsl::gl_SubgroupInvocationID(); - - const bool topHalf = bool(invocationID & stride); - - KeyValuePair toTrade; - toTrade.key = topHalf ? loKey : hiKey; - toTrade.val = topHalf ? loVal : hiVal; - - KeyValuePair exchanged = glsl::subgroupShuffleXor(toTrade, stride); - - if (topHalf) - { - loKey = exchanged.key; - loVal = exchanged.val; - } - else - { - hiKey = exchanged.key; - hiVal = exchanged.val; - } - - compareAndSwap(ascending, loKey, hiKey, loVal, hiVal); - } - - static void __call(NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, - NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) - { - const uint32_t subgroupSize = glsl::gl_SubgroupSize(); - const uint32_t invocationID = glsl::gl_SubgroupInvocationID(); - - compareAndSwap(Ascending, loKey, hiKey, loVal, hiVal); - - [unroll] - for (uint32_t k = 2; k <= subgroupSize; k <<= 1) - { - const bool sequenceAscending = ((invocationID & (k >> 1)) == 0); - - const bool dir = Ascending ? sequenceAscending : !sequenceAscending; - - [unroll] - for (uint32_t stride = k >> 1; stride > 0; stride >>= 1) + template > + struct bitonic_sort_config { - bitonicMergeStep(stride, dir, loKey, hiKey, loVal, hiVal); - } - } - - [unroll] - for (uint32_t stride = subgroupSize; stride > 0; stride >>= 1) - { - bitonicMergeStep(stride, Ascending, loKey, hiKey, loVal, hiVal); + using key_t = KeyType; + using value_t = ValueType; + using comparator_t = Comparator; + }; + template + struct bitonic_sort; + template + struct bitonic_sort, device_capabilities> + { + using config_t = bitonic_sort_config; + using key_t = typename config_t::key_t; + using value_t = typename config_t::value_t; + using comparator_t = typename config_t::comparator_t; + // Thread-level compare and swap (operates on lo/hi in registers) + static void compareAndSwap(bool ascending, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, + NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) + { + comparator_t comp; + const bool shouldSwap = ascending ? comp(hiKey, loKey) : comp(loKey, hiKey); + if (shouldSwap) + { + // Swap keys + key_t tempKey = loKey; + loKey = hiKey; + hiKey = tempKey; + // Swap values + value_t tempVal = loVal; + loVal = hiVal; + hiVal = tempVal; + } + } + static void __call(NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, + NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) + { + const uint32_t invocationID = glsl::gl_SubgroupInvocationID(); + const uint32_t subgroupSizeLog2 = glsl::gl_SubgroupSizeLog2(); + [unroll] + for (uint32_t stage = 0; stage <= subgroupSizeLog2; stage++) + { + const bool bitonicAscending = (stage == subgroupSizeLog2) ? Ascending : !bool(invocationID & (1u << stage)); + // Passes within this stage + [unroll] + for (uint32_t pass = 0; pass <= stage; pass++) + { + const uint32_t stride = 1u << (stage - pass); // Element stride + const uint32_t threadStride = stride >> 1; + if (threadStride == 0) + { + // Local compare and swap for stage 0 + compareAndSwap(bitonicAscending, loKey, hiKey, loVal, hiVal); + } + else + { + // Shuffle from partner using XOR + const key_t pLoKey = glsl::subgroupShuffleXor(loKey, threadStride); + const key_t pHiKey = glsl::subgroupShuffleXor(hiKey, threadStride); + const value_t pLoVal = glsl::subgroupShuffleXor(loVal, threadStride); + const value_t pHiVal = glsl::subgroupShuffleXor(hiVal, threadStride); + // Determine if we're upper or lower half + const bool upperHalf = bool(invocationID & threadStride); + const bool takeLarger = upperHalf == bitonicAscending; + comparator_t comp; + if (takeLarger) + { + if (comp(loKey, pLoKey)) { loKey = pLoKey; loVal = pLoVal; } + if (comp(hiKey, pHiKey)) { hiKey = pHiKey; hiVal = pHiVal; } + } + else + { + if (comp(pLoKey, loKey)) { loKey = pLoKey; loVal = pLoVal; } + if (comp(pHiKey, hiKey)) { hiKey = pHiKey; hiVal = pHiVal; } + } + } + } + } + } + }; } } -}; - -} -} } - #endif \ No newline at end of file From 779815e3174565c774f66daee2e2bad2c06d98d4 Mon Sep 17 00:00:00 2001 From: CrabeExtra Date: Tue, 7 Oct 2025 22:00:41 +0330 Subject: [PATCH 06/30] removed unused --- .../nbl/builtin/hlsl/bitonic_sort/common.hlsl | 58 ++----------------- .../builtin/hlsl/workgroup/bitonic_sort.hlsl | 16 ----- 2 files changed, 6 insertions(+), 68 deletions(-) diff --git a/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl b/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl index 516af22a14..84615be645 100644 --- a/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl +++ b/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl @@ -7,59 +7,13 @@ namespace nbl { - namespace hlsl - { - namespace bitonic_sort - { - template 0 && _Log2ThreadsPerSubgroup >= 4) - struct ConstevalParameters - { - using scalar_t = _Scalar; - using key_t = _KeyType; - using value_t = _ValueType; - - struct ThreadConfig - { - NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerThread = uint16_t(1) << _Log2ElementsPerThread; - }; - - struct SubgroupConfig - { - using thread_config_t = ThreadConfig; - thread_config_t thread; - NBL_CONSTEXPR_STATIC_INLINE uint16_t ThreadsPerSubgroup = uint16_t(1) << _Log2ThreadsPerSubgroup; - NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerSubgroup = thread.ElementsPerThread * ThreadsPerSubgroup; - NBL_CONSTEXPR_STATIC_INLINE uint16_t Log2ElementsPerSubgroup = thread.Log2ElementsPerThread + Log2ThreadsPerSubgroup; - }; - - struct WorkgroupConfig - { - using subgroup_config_t = SubgroupConfig; - subgroup_config_t subgroup; - NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerWorkgroup = uint16_t(1) << _Log2SubgroupsPerWorkgroup; - NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = subgroup.ThreadsPerSubgroup * SubgroupsPerWorkgroup; // threads per workgroup - NBL_CONSTEXPR_STATIC_INLINE uint32_t ElementsPerWorkgroup = subgroup.ElementsPerSubgroup * SubgroupsPerWorkgroup; - NBL_CONSTEXPR_STATIC_INLINE uint16_t Log2ElementsPerWorkgroup = subgroup.Log2ElementsPerSubgroup + Log2SubgroupsPerWorkgroup; - }; - - using thread_config_t = ThreadConfig; - using subgroup_config_t = SubgroupConfig; - using workgroup_config_t = WorkgroupConfig; - - workgroup_config_t workgroup; - - NBL_CONSTEXPR_STATIC_INLINE uint32_t TotalSize = uint32_t(1) << workgroup.Log2ElementsPerWorkgroup; - - NBL_CONSTEXPR_STATIC_INLINE uint32_t KeySize = sizeof(key_t); - NBL_CONSTEXPR_STATIC_INLINE uint32_t ValueSize = sizeof(value_t); - NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedMemoryBytes = TotalSize * (KeySize + ValueSize); - NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedMemoryDWORDs = SharedMemoryBytes / sizeof(uint32_t); - - }; +namespace hlsl +{ +namespace bitonic_sort +{ - } - } +} +} } #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl b/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl index 6d76d5428a..7dd27bbf32 100644 --- a/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl @@ -4,19 +4,3 @@ #ifndef _NBL_BUILTIN_HLSL_WORKGROUP_BITONIC_SORT_INCLUDED_ #define _NBL_BUILTIN_HLSL_WORKGROUP_BITONIC_SORT_INCLUDED_ - - -namespace nbl -{ - namespace hlsl - { - namespace workgroup - { - namespace bitonic_sort - { - } - } - } -} - -#endif \ No newline at end of file From ad7a4c55b204c5647e25f873678a4e4d0d3f0259 Mon Sep 17 00:00:00 2001 From: CrabeExtra Date: Sun, 19 Oct 2025 14:30:59 +0330 Subject: [PATCH 07/30] added last merge step as a function --- .../builtin/hlsl/subgroup/bitonic_sort.hlsl | 110 ++++++++++++------ 1 file changed, 75 insertions(+), 35 deletions(-) diff --git a/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl b/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl index 6613d4ee8a..b2850e2ca6 100644 --- a/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl +++ b/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl @@ -10,6 +10,7 @@ namespace nbl { namespace subgroup { + template > struct bitonic_sort_config { @@ -44,52 +45,91 @@ namespace nbl hiVal = tempVal; } } + + + static void lastMergeStage(uint32_t stage, uint32_t invocationID, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, + NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) + { + [unroll] + for (uint32_t pass = 0; pass <= stage; pass++) + { + const uint32_t stride = 1u << (stage - pass); // Element stride + const uint32_t threadStride = stride >> 1; + if (threadStride == 0) + { + // Local compare and swap for stage 0 + compareAndSwap(Ascending, loKey, hiKey, loVal, hiVal); + } + else + { + // Shuffle from partner using XOR + const key_t pLoKey = glsl::subgroupShuffleXor(loKey, threadStride); + const key_t pHiKey = glsl::subgroupShuffleXor(hiKey, threadStride); + const value_t pLoVal = glsl::subgroupShuffleXor(loVal, threadStride); + const value_t pHiVal = glsl::subgroupShuffleXor(hiVal, threadStride); + comparator_t comp; + if (comp(loKey, pLoKey)) { loKey = pLoKey; loVal = pLoVal; } + if (comp(hiKey, pHiKey)) { hiKey = pHiKey; hiVal = pHiVal; } + + } + + } + } + + static void mergeStage(uint32_t stage, bool bitonicAscending, uint32_t invocationID, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, + NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) + { + [unroll] + for (uint32_t pass = 0; pass <= stage; pass++) + { + const uint32_t stride = 1u << (stage - pass); // Element stride + const uint32_t threadStride = stride >> 1; + if (threadStride == 0) + { + // Local compare and swap for stage 0 + compareAndSwap(bitonicAscending, loKey, hiKey, loVal, hiVal); + } + else + { + // Shuffle from partner using XOR + const key_t pLoKey = glsl::subgroupShuffleXor(loKey, threadStride); + const key_t pHiKey = glsl::subgroupShuffleXor(hiKey, threadStride); + const value_t pLoVal = glsl::subgroupShuffleXor(loVal, threadStride); + const value_t pHiVal = glsl::subgroupShuffleXor(hiVal, threadStride); + // Determine if we're upper or lower half + const bool upperHalf = bool(invocationID & threadStride); + const bool takeLarger = upperHalf == bitonicAscending; + comparator_t comp; + if (takeLarger) + { + if (comp(loKey, pLoKey)) { loKey = pLoKey; loVal = pLoVal; } + if (comp(hiKey, pHiKey)) { hiKey = pHiKey; hiVal = pHiVal; } + } + else + { + if (comp(pLoKey, loKey)) { loKey = pLoKey; loVal = pLoVal; } + if (comp(pHiKey, hiKey)) { hiKey = pHiKey; hiVal = pHiVal; } + } + } + } + } + static void __call(NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) { const uint32_t invocationID = glsl::gl_SubgroupInvocationID(); const uint32_t subgroupSizeLog2 = glsl::gl_SubgroupSizeLog2(); [unroll] - for (uint32_t stage = 0; stage <= subgroupSizeLog2; stage++) + for (uint32_t stage = 0; stage < subgroupSizeLog2; stage++) { const bool bitonicAscending = (stage == subgroupSizeLog2) ? Ascending : !bool(invocationID & (1u << stage)); - // Passes within this stage - [unroll] - for (uint32_t pass = 0; pass <= stage; pass++) - { - const uint32_t stride = 1u << (stage - pass); // Element stride - const uint32_t threadStride = stride >> 1; - if (threadStride == 0) - { - // Local compare and swap for stage 0 - compareAndSwap(bitonicAscending, loKey, hiKey, loVal, hiVal); - } - else - { - // Shuffle from partner using XOR - const key_t pLoKey = glsl::subgroupShuffleXor(loKey, threadStride); - const key_t pHiKey = glsl::subgroupShuffleXor(hiKey, threadStride); - const value_t pLoVal = glsl::subgroupShuffleXor(loVal, threadStride); - const value_t pHiVal = glsl::subgroupShuffleXor(hiVal, threadStride); - // Determine if we're upper or lower half - const bool upperHalf = bool(invocationID & threadStride); - const bool takeLarger = upperHalf == bitonicAscending; - comparator_t comp; - if (takeLarger) - { - if (comp(loKey, pLoKey)) { loKey = pLoKey; loVal = pLoVal; } - if (comp(hiKey, pHiKey)) { hiKey = pHiKey; hiVal = pHiVal; } - } - else - { - if (comp(pLoKey, loKey)) { loKey = pLoKey; loVal = pLoVal; } - if (comp(pHiKey, hiKey)) { hiKey = pHiKey; hiVal = pHiVal; } - } - } - } + mergeStage(stage, bitonicAscending, invocationID, loKey, hiKey, loVal, hiVal); } + lastMergeStage(subgroupSizeLog2, invocationID, loKey, hiKey, loVal, hiVal); + } }; + } } } From b80283a72bed090362a7535df005be1cb7cb13b8 Mon Sep 17 00:00:00 2001 From: CrabeExtra Date: Sun, 19 Oct 2025 15:43:53 +0330 Subject: [PATCH 08/30] uncomplete workgroup fn --- .../builtin/hlsl/workgroup/bitonic_sort.hlsl | 64 +++++++++++++++++-- 1 file changed, 59 insertions(+), 5 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl b/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl index 7dd27bbf32..efd269b604 100644 --- a/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl @@ -1,6 +1,60 @@ -#include -#include -#include +#ifndef NBL_BUILTIN_HLSL_SUBGROUP_BITONIC_SORT_INCLUDED +#define NBL_BUILTIN_HLSL_SUBGROUP_BITONIC_SORT_INCLUDED +#include "nbl/builtin/hlsl/bitonic_sort/common.hlsl" +#include "nbl/builtin/hlsl/functional.hlsl" +namespace nbl +{ + namespace hlsl + { + namespace workgroup + { + namespace bitonic_sort + { + template > + struct bitonic_sort_config + { + using key_t = KeyType; + using value_t = ValueType; + using comparator_t = Comparator; + }; -#ifndef _NBL_BUILTIN_HLSL_WORKGROUP_BITONIC_SORT_INCLUDED_ -#define _NBL_BUILTIN_HLSL_WORKGROUP_BITONIC_SORT_INCLUDED_ + template + struct bitonic_sort; + template + struct bitonic_sort, device_capabilities> + { + using config_t = bitonic_sort_config; + using key_t = typename config_t::key_t; + using value_t = typename config_t::value_t; + using comparator_t = typename config_t::comparator_t; + + using SortConfig = subgroup::bitonic_sort_config >; + + static void __call(NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, + NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) + { + const uint32_t invocationID = glsl::gl_SubgroupInvocationID(); + const uint32_t subgroupSizeLog2 = glsl::gl_SubgroupSizeLog2(); + + //first sort all subgroup inside wg + subgroup::bitonic_sort::__call(loKey, hiKey, loVal, hiVal); + //then we go over first work group shuffle + //we have n = log2(x), where n is how many wgshuffle we have to do on x(subgroup num) + + [unroll] + for (uint32_t stride = glsl::gl_SubgroupSize() << 1u; stride <= (WorkgroupSize >> 1u); stride <<= 1u) + { + //WorkGroup Shuffle with shuffleXor + subgroup::bitonic_sort::lastMergeStage(subgroupSizeLog2, invocationIDloKey, hiKey, loVal, hiVal); + workgroupExecutionAndMemoryBarrier(); + } + + + } + }; + + } + } + } +} +#endif \ No newline at end of file From 7c9174411cfafc06268dd7e480b286664635a612 Mon Sep 17 00:00:00 2001 From: CrabeExtra Date: Sun, 19 Oct 2025 16:15:26 +0330 Subject: [PATCH 09/30] complete the logic for some pr questions --- .../builtin/hlsl/workgroup/bitonic_sort.hlsl | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl b/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl index efd269b604..dc3cafcb43 100644 --- a/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl @@ -30,6 +30,20 @@ namespace nbl using SortConfig = subgroup::bitonic_sort_config >; + + static void mergeWGStage(uint32_t stage, bool bitonicAscending, uint32_t invocationID, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, + NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) + { + [unroll] + for (uint32_t pass = 0; pass <= stage; pass++) + { + const uint32_t stride = 1u << ((stage - pass) + subgroupSizeLog2); // Element stride shifts to inter-subgroup scale + // Shuffle from partner using WG XOR need to implument + + } + } + + static void __call(NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) { @@ -42,10 +56,10 @@ namespace nbl //we have n = log2(x), where n is how many wgshuffle we have to do on x(subgroup num) [unroll] - for (uint32_t stride = glsl::gl_SubgroupSize() << 1u; stride <= (WorkgroupSize >> 1u); stride <<= 1u) + for (uint32_t stage = 1; stage <= n; ++stage) { - //WorkGroup Shuffle with shuffleXor - subgroup::bitonic_sort::lastMergeStage(subgroupSizeLog2, invocationIDloKey, hiKey, loVal, hiVal); + mergeWGStage(stage, Ascending, invocationID, hiKey, loKey, loVal, hiVal); + subgroup::bitonic_sort::lastMergeStage(subgroupSizeLog2, invocationIDloKey, hiKey, loKey,loVal, hiVal); workgroupExecutionAndMemoryBarrier(); } From 4d253f38797a2ff0718082a899001804dfdc9913 Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Wed, 22 Oct 2025 17:12:14 +0330 Subject: [PATCH 10/30] Refactor bitonic sort for workgroup + Accessor support --- .../builtin/hlsl/workgroup/bitonic_sort.hlsl | 211 ++++++++++++------ 1 file changed, 146 insertions(+), 65 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl b/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl index dc3cafcb43..a7562baddd 100644 --- a/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl @@ -1,74 +1,155 @@ -#ifndef NBL_BUILTIN_HLSL_SUBGROUP_BITONIC_SORT_INCLUDED -#define NBL_BUILTIN_HLSL_SUBGROUP_BITONIC_SORT_INCLUDED +#ifndef NBL_BUILTIN_HLSL_WORKGROUP_BITONIC_SORT_INCLUDED +#define NBL_BUILTIN_HLSL_WORKGROUP_BITONIC_SORT_INCLUDED #include "nbl/builtin/hlsl/bitonic_sort/common.hlsl" +#include "nbl/builtin/hlsl/memory_accessor.hlsl" #include "nbl/builtin/hlsl/functional.hlsl" +#include "nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl" +#include "nbl/builtin/hlsl/bit.hlsl" +#include "nbl/builtin/hlsl/workgroup/shuffle.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" + namespace nbl { - namespace hlsl +namespace hlsl +{ +namespace workgroup +{ +namespace bitonic_sort +{ +// Reorder: non-type parameters FIRST, then typename parameters with defaults +// This matches FFT's pattern and avoids DXC bugs +template > +struct bitonic_sort_config +{ + using key_t = KeyType; + using value_t = ValueType; + using comparator_t = Comparator; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocationLog2 = _ElementsPerInvocationLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2; + + NBL_CONSTEXPR_STATIC_INLINE uint32_t ElementsPerInvocation = 1u << ElementsPerInvocationLog2; + NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = 1u << WorkgroupSizeLog2; +}; +} + +template +struct BitonicSort; + + +template +struct BitonicSort, device_capabilities> +{ + using config_t = bitonic_sort::bitonic_sort_config; + using key_t = KeyType; + using value_t = ValueType; + using comparator_t = Comparator; + + using SortConfig = subgroup::bitonic_sort_config; + + template + static void mergeStage(NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor, uint32_t stage, bool bitonicAscending, uint32_t invocationID, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, + NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) { - namespace workgroup + NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = config_t::WorkgroupSize; + using adaptor_t = accessor_adaptors::StructureOfArrays; + adaptor_t sharedmemAdaptor; + sharedmemAdaptor.accessor = sharedmemAccessor; + + const uint32_t subgroupSizeLog2 = glsl::gl_SubgroupSizeLog2(); + + [unroll] + for (uint32_t pass = 0; pass <= stage; pass++) { - namespace bitonic_sort - { - template > - struct bitonic_sort_config - { - using key_t = KeyType; - using value_t = ValueType; - using comparator_t = Comparator; - }; - - template - struct bitonic_sort; - template - struct bitonic_sort, device_capabilities> - { - using config_t = bitonic_sort_config; - using key_t = typename config_t::key_t; - using value_t = typename config_t::value_t; - using comparator_t = typename config_t::comparator_t; - - using SortConfig = subgroup::bitonic_sort_config >; - - - static void mergeWGStage(uint32_t stage, bool bitonicAscending, uint32_t invocationID, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, - NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) - { - [unroll] - for (uint32_t pass = 0; pass <= stage; pass++) - { - const uint32_t stride = 1u << ((stage - pass) + subgroupSizeLog2); // Element stride shifts to inter-subgroup scale - // Shuffle from partner using WG XOR need to implument - - } - } - - - static void __call(NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, - NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) - { - const uint32_t invocationID = glsl::gl_SubgroupInvocationID(); - const uint32_t subgroupSizeLog2 = glsl::gl_SubgroupSizeLog2(); - - //first sort all subgroup inside wg - subgroup::bitonic_sort::__call(loKey, hiKey, loVal, hiVal); - //then we go over first work group shuffle - //we have n = log2(x), where n is how many wgshuffle we have to do on x(subgroup num) - - [unroll] - for (uint32_t stage = 1; stage <= n; ++stage) - { - mergeWGStage(stage, Ascending, invocationID, hiKey, loKey, loVal, hiVal); - subgroup::bitonic_sort::lastMergeStage(subgroupSizeLog2, invocationIDloKey, hiKey, loKey,loVal, hiVal); - workgroupExecutionAndMemoryBarrier(); - } - - - } - }; - - } + // Stride calculation: stage S merges 2^(S+1) subgroups + const uint32_t stridePower = (stage - pass + 1) + subgroupSizeLog2; + const uint32_t stride = 1u << stridePower; + const uint32_t threadStride = stride >> 1; + + // Separate shuffles for lo/hi streams (two-round shuffle as per PR review) + // TODO: Consider single-round shuffle of key-value pairs for better performance + key_t pLoKey = loKey; + shuffleXor(pLoKey, threadStride, sharedmemAdaptor); + value_t pLoVal = loVal; + shuffleXor(pLoVal, threadStride, sharedmemAdaptor); + + key_t pHiKey = hiKey; + shuffleXor(pHiKey, threadStride, sharedmemAdaptor); + value_t pHiVal = hiVal; + shuffleXor(pHiVal, threadStride, sharedmemAdaptor); + + const bool isUpper = (invocationID & threadStride) != 0; + const bool takeLarger = isUpper == bitonicAscending; + + comparator_t comp; + + // lo update + const bool loSelfSmaller = comp(loKey, pLoKey); + const bool takePartnerLo = takeLarger ? loSelfSmaller : !loSelfSmaller; + loKey = takePartnerLo ? pLoKey : loKey; + loVal = takePartnerLo ? pLoVal : loVal; + + // hi update + const bool hiSelfSmaller = comp(hiKey, pHiKey); + const bool takePartnerHi = takeLarger ? hiSelfSmaller : !hiSelfSmaller; + hiKey = takePartnerHi ? pHiKey : hiKey; + hiVal = takePartnerHi ? pHiVal : hiVal; + + sharedmemAdaptor.workgroupExecutionAndMemoryBarrier(); + } + } + + template + static void __call( + NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor, + NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, + NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) + { + NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = config_t::WorkgroupSize; + + const uint32_t invocationID = glsl::gl_LocalInvocationID().x; + const uint32_t subgroupSizeLog2 = glsl::gl_SubgroupSizeLog2(); + const uint32_t subgroupSize = 1u << subgroupSizeLog2; + const uint32_t subgroupID = glsl::gl_SubgroupID(); + const uint32_t numSubgroups = WorkgroupSize / subgroupSize; + const uint32_t numSubgroupsLog2 = findMSB(numSubgroups); + + + const bool subgroupAscending = (subgroupID & 1) == 0; + subgroup::bitonic_sort::__call(subgroupAscending, loKey, hiKey, loVal, hiVal); + + + [unroll] + for (uint32_t stage = 0; stage < numSubgroupsLog2; ++stage) + { + const bool isLastStage = (stage == numSubgroupsLog2 - 1); + const bool bitonicAscending = isLastStage ? true : !bool(invocationID & (subgroupSize << (stage + 1))); + + mergeStage(sharedmemAccessor, stage, bitonicAscending, invocationID, loKey, hiKey, loVal, hiVal); + + const uint32_t subgroupInvocationID = glsl::gl_SubgroupInvocationID(); + subgroup::bitonic_sort::mergeStage(subgroupSizeLog2, bitonicAscending, subgroupInvocationID, loKey, hiKey, loVal, hiVal); + } + + + // Final: ensure lo <= hi within each thread (for ascending sort) + comparator_t comp; + if (comp(hiKey, loKey)) + { + // Swap keys + key_t tempKey = loKey; + loKey = hiKey; + hiKey = tempKey; + // Swap values + value_t tempVal = loVal; + loVal = hiVal; + hiVal = tempVal; } } +}; + +} } -#endif \ No newline at end of file +} + +#endif From f03b8b2a4aea6344685d65477888d232c04058c9 Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Wed, 22 Oct 2025 17:19:03 +0330 Subject: [PATCH 11/30] Update bitonic_sort.hlsl --- .../builtin/hlsl/subgroup/bitonic_sort.hlsl | 84 +++++++------------ 1 file changed, 29 insertions(+), 55 deletions(-) diff --git a/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl b/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl index b2850e2ca6..c7bed6f823 100644 --- a/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl +++ b/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl @@ -18,15 +18,18 @@ namespace nbl using value_t = ValueType; using comparator_t = Comparator; }; - template + + template struct bitonic_sort; - template - struct bitonic_sort, device_capabilities> + + template + struct bitonic_sort, device_capabilities> { using config_t = bitonic_sort_config; using key_t = typename config_t::key_t; using value_t = typename config_t::value_t; using comparator_t = typename config_t::comparator_t; + // Thread-level compare and swap (operates on lo/hi in registers) static void compareAndSwap(bool ascending, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) @@ -46,36 +49,6 @@ namespace nbl } } - - static void lastMergeStage(uint32_t stage, uint32_t invocationID, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, - NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) - { - [unroll] - for (uint32_t pass = 0; pass <= stage; pass++) - { - const uint32_t stride = 1u << (stage - pass); // Element stride - const uint32_t threadStride = stride >> 1; - if (threadStride == 0) - { - // Local compare and swap for stage 0 - compareAndSwap(Ascending, loKey, hiKey, loVal, hiVal); - } - else - { - // Shuffle from partner using XOR - const key_t pLoKey = glsl::subgroupShuffleXor(loKey, threadStride); - const key_t pHiKey = glsl::subgroupShuffleXor(hiKey, threadStride); - const value_t pLoVal = glsl::subgroupShuffleXor(loVal, threadStride); - const value_t pHiVal = glsl::subgroupShuffleXor(hiVal, threadStride); - comparator_t comp; - if (comp(loKey, pLoKey)) { loKey = pLoKey; loVal = pLoVal; } - if (comp(hiKey, pHiKey)) { hiKey = pHiKey; hiVal = pHiVal; } - - } - - } - } - static void mergeStage(uint32_t stage, bool bitonicAscending, uint32_t invocationID, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) { @@ -96,41 +69,42 @@ namespace nbl const key_t pHiKey = glsl::subgroupShuffleXor(hiKey, threadStride); const value_t pLoVal = glsl::subgroupShuffleXor(loVal, threadStride); const value_t pHiVal = glsl::subgroupShuffleXor(hiVal, threadStride); - // Determine if we're upper or lower half - const bool upperHalf = bool(invocationID & threadStride); - const bool takeLarger = upperHalf == bitonicAscending; + + // Branchless compare-and-swap + const bool isUpper = bool(invocationID & threadStride); + const bool takeLarger = isUpper == bitonicAscending; comparator_t comp; - if (takeLarger) - { - if (comp(loKey, pLoKey)) { loKey = pLoKey; loVal = pLoVal; } - if (comp(hiKey, pHiKey)) { hiKey = pHiKey; hiVal = pHiVal; } - } - else - { - if (comp(pLoKey, loKey)) { loKey = pLoKey; loVal = pLoVal; } - if (comp(pHiKey, hiKey)) { hiKey = pHiKey; hiVal = pHiVal; } - } + + // lo update + const bool loSelfSmaller = comp(loKey, pLoKey); + const bool takePartnerLo = takeLarger ? loSelfSmaller : !loSelfSmaller; + loKey = takePartnerLo ? pLoKey : loKey; + loVal = takePartnerLo ? pLoVal : loVal; + + // hi update + const bool hiSelfSmaller = comp(hiKey, pHiKey); + const bool takePartnerHi = takeLarger ? hiSelfSmaller : !hiSelfSmaller; + hiKey = takePartnerHi ? pHiKey : hiKey; + hiVal = takePartnerHi ? pHiVal : hiVal; } } } - static void __call(NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, + static void __call(bool ascending, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) { const uint32_t invocationID = glsl::gl_SubgroupInvocationID(); const uint32_t subgroupSizeLog2 = glsl::gl_SubgroupSizeLog2(); [unroll] - for (uint32_t stage = 0; stage < subgroupSizeLog2; stage++) - { - const bool bitonicAscending = (stage == subgroupSizeLog2) ? Ascending : !bool(invocationID & (1u << stage)); - mergeStage(stage, bitonicAscending, invocationID, loKey, hiKey, loVal, hiVal); - } - lastMergeStage(subgroupSizeLog2, invocationID, loKey, hiKey, loVal, hiVal); - + for (uint32_t stage = 0; stage <= subgroupSizeLog2; stage++) + { + const bool bitonicAscending = (stage == subgroupSizeLog2) ? ascending : !bool(invocationID & (1u << stage)); + mergeStage(stage, bitonicAscending, invocationID, loKey, hiKey, loVal, hiVal); + } } }; } } } -#endif \ No newline at end of file +#endif From 555dcbe868bab71c8ba909443b57ccdef1fc6ef2 Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Mon, 27 Oct 2025 21:42:25 +0330 Subject: [PATCH 12/30] VT implumentation --- .../builtin/hlsl/workgroup/bitonic_sort.hlsl | 474 ++++++++++++++---- 1 file changed, 379 insertions(+), 95 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl b/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl index a7562baddd..e4d507598e 100644 --- a/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl @@ -21,132 +21,416 @@ namespace bitonic_sort template > struct bitonic_sort_config { - using key_t = KeyType; - using value_t = ValueType; - using comparator_t = Comparator; + using key_t = KeyType; + using value_t = ValueType; + using comparator_t = Comparator; - NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocationLog2 = _ElementsPerInvocationLog2; - NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocationLog2 = _ElementsPerInvocationLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2; - NBL_CONSTEXPR_STATIC_INLINE uint32_t ElementsPerInvocation = 1u << ElementsPerInvocationLog2; - NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = 1u << WorkgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint32_t ElementsPerInvocation = 1u << ElementsPerInvocationLog2; + NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = 1u << WorkgroupSizeLog2; }; } template struct BitonicSort; +// ==================== ElementsPerThreadLog2 = 1 Specialization (No Virtual Threading) ==================== +// This handles arrays of size WorkgroupSize * 2 using subgroup + workgroup operations +template +struct BitonicSort, device_capabilities> +{ + using config_t = bitonic_sort::bitonic_sort_config<1, WorkgroupSizeLog2, KeyType, ValueType, Comparator>; + using key_t = KeyType; + using value_t = ValueType; + using comparator_t = Comparator; + + using SortConfig = subgroup::bitonic_sort_config; + + template + static void mergeStage(NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor, uint32_t stage, bool bitonicAscending, uint32_t invocationID, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, + NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) + { + NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = config_t::WorkgroupSize; + using key_adaptor = accessor_adaptors::StructureOfArrays; + using value_adaptor = accessor_adaptors::StructureOfArrays; + + key_adaptor sharedmemAdaptorKey; + sharedmemAdaptorKey.accessor = sharedmemAccessor; + + value_adaptor sharedmemAdaptorValue; + sharedmemAdaptorValue.accessor = sharedmemAccessor; + + const uint32_t subgroupSizeLog2 = glsl::gl_SubgroupSizeLog2(); + + [unroll] + for (uint32_t pass = 0; pass <= stage; pass++) + { + // Stride calculation: stage S merges 2^(S+1) subgroups + const uint32_t stridePower = (stage - pass + 1) + subgroupSizeLog2; + const uint32_t stride = 1u << stridePower; + const uint32_t threadStride = stride >> 1; + + // Separate shuffles for lo/hi streams (two-round shuffle as per PR review) + // TODO: Consider single-round shuffle of key-value pairs for better performance + key_t pLoKey = loKey; + shuffleXor(pLoKey, threadStride, sharedmemAdaptorKey); + value_t pLoVal = loVal; + shuffleXor(pLoVal, threadStride, sharedmemAdaptorValue); + + sharedmemAdaptorKey.workgroupExecutionAndMemoryBarrier(); + sharedmemAdaptorValue.workgroupExecutionAndMemoryBarrier(); + + key_t pHiKey = hiKey; + shuffleXor(pHiKey, threadStride, sharedmemAdaptorKey); + value_t pHiVal = hiVal; + shuffleXor(pHiVal, threadStride, sharedmemAdaptorValue); + + const bool isUpper = (invocationID & threadStride) != 0; + const bool takeLarger = isUpper == bitonicAscending; + + comparator_t comp; + + // lo update + const bool loSelfSmaller = comp(loKey, pLoKey); + const bool takePartnerLo = takeLarger ? loSelfSmaller : !loSelfSmaller; + loKey = takePartnerLo ? pLoKey : loKey; + loVal = takePartnerLo ? pLoVal : loVal; + + // hi update + const bool hiSelfSmaller = comp(hiKey, pHiKey); + const bool takePartnerHi = takeLarger ? hiSelfSmaller : !hiSelfSmaller; + hiKey = takePartnerHi ? pHiKey : hiKey; + hiVal = takePartnerHi ? pHiVal : hiVal; + + sharedmemAdaptorKey.workgroupExecutionAndMemoryBarrier(); + sharedmemAdaptorValue.workgroupExecutionAndMemoryBarrier(); + } + } + + template + static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor) + { + NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = config_t::WorkgroupSize; + + const uint32_t invocationID = glsl::gl_LocalInvocationID().x; + const uint32_t subgroupSizeLog2 = glsl::gl_SubgroupSizeLog2(); + const uint32_t subgroupSize = 1u << subgroupSizeLog2; + const uint32_t subgroupID = glsl::gl_SubgroupID(); + const uint32_t numSubgroups = WorkgroupSize / subgroupSize; + const uint32_t numSubgroupsLog2 = findMSB(numSubgroups); -template -struct BitonicSort, device_capabilities> + // Load this thread's 2 elements from accessor + const uint32_t loIdx = invocationID * 2; + const uint32_t hiIdx = loIdx + 1; + key_t loKey, hiKey; + value_t loVal, hiVal; + accessor.template get(loIdx, loKey); + accessor.template get(hiIdx, hiKey); + accessor.template get(loIdx, loVal); + accessor.template get(hiIdx, hiVal); + + const bool subgroupAscending = (subgroupID & 1) == 0; + subgroup::bitonic_sort::__call(subgroupAscending, loKey, hiKey, loVal, hiVal); + + const uint32_t subgroupInvocationID = glsl::gl_SubgroupInvocationID(); + + [unroll] + for (uint32_t stage = 0; stage < numSubgroupsLog2; ++stage) + { + const bool bitonicAscending = !bool(invocationID & (subgroupSize << (stage + 1))); + + mergeStage(sharedmemAccessor, stage, bitonicAscending, invocationID, loKey, hiKey, loVal, hiVal); + + subgroup::bitonic_sort::mergeStage(subgroupSizeLog2, bitonicAscending, subgroupInvocationID, loKey, hiKey, loVal, hiVal); + } + + // Final: ensure lo <= hi within each thread (for ascending sort) + comparator_t comp; + if (comp(hiKey, loKey)) + { + // Swap keys + key_t tempKey = loKey; + loKey = hiKey; + hiKey = tempKey; + // Swap values + value_t tempVal = loVal; + loVal = hiVal; + hiVal = tempVal; + } + + // Store results back + accessor.template set(loIdx, loKey); + accessor.template set(hiIdx, hiKey); + accessor.template set(loIdx, loVal); + accessor.template set(hiIdx, hiVal); + } +}; +// ==================== ElementsPerThreadLog2 = 2 Specialization (Virtual Threading) ==================== +template +struct BitonicSort, device_capabilities> { - using config_t = bitonic_sort::bitonic_sort_config; + using config_t = bitonic_sort::bitonic_sort_config<2, WorkgroupSizeLog2, KeyType, ValueType, Comparator>; + using simple_config_t = bitonic_sort::bitonic_sort_config<1, WorkgroupSizeLog2, KeyType, ValueType, Comparator>; + using key_t = KeyType; - using value_t = ValueType; + using value_t = ValueType; using comparator_t = Comparator; - using SortConfig = subgroup::bitonic_sort_config; - - template - static void mergeStage(NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor, uint32_t stage, bool bitonicAscending, uint32_t invocationID, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, - NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) + template + static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor) { - NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = config_t::WorkgroupSize; - using adaptor_t = accessor_adaptors::StructureOfArrays; - adaptor_t sharedmemAdaptor; - sharedmemAdaptor.accessor = sharedmemAccessor; + NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = config_t::WorkgroupSize; + NBL_CONSTEXPR_STATIC_INLINE uint32_t ElementsPerThread = config_t::ElementsPerInvocation; + NBL_CONSTEXPR_STATIC_INLINE uint32_t TotalElements = WorkgroupSize * ElementsPerThread; + NBL_CONSTEXPR_STATIC_INLINE uint32_t ElementsPerSimpleSort = WorkgroupSize * 2; // E=1 handles WG*2 elements + + const uint32_t threadID = glsl::gl_LocalInvocationID().x; + comparator_t comp; - const uint32_t subgroupSizeLog2 = glsl::gl_SubgroupSizeLog2(); + accessor_adaptors::Offset offsetAccessor; + offsetAccessor.accessor = accessor; [unroll] - for (uint32_t pass = 0; pass <= stage; pass++) + for (uint32_t k = 0; k < ElementsPerThread; k += 2) { - // Stride calculation: stage S merges 2^(S+1) subgroups - const uint32_t stridePower = (stage - pass + 1) + subgroupSizeLog2; - const uint32_t stride = 1u << stridePower; - const uint32_t threadStride = stride >> 1; - - // Separate shuffles for lo/hi streams (two-round shuffle as per PR review) - // TODO: Consider single-round shuffle of key-value pairs for better performance - key_t pLoKey = loKey; - shuffleXor(pLoKey, threadStride, sharedmemAdaptor); - value_t pLoVal = loVal; - shuffleXor(pLoVal, threadStride, sharedmemAdaptor); - - key_t pHiKey = hiKey; - shuffleXor(pHiKey, threadStride, sharedmemAdaptor); - value_t pHiVal = hiVal; - shuffleXor(pHiVal, threadStride, sharedmemAdaptor); - - const bool isUpper = (invocationID & threadStride) != 0; - const bool takeLarger = isUpper == bitonicAscending; - - comparator_t comp; - - // lo update - const bool loSelfSmaller = comp(loKey, pLoKey); - const bool takePartnerLo = takeLarger ? loSelfSmaller : !loSelfSmaller; - loKey = takePartnerLo ? pLoKey : loKey; - loVal = takePartnerLo ? pLoVal : loVal; - - // hi update - const bool hiSelfSmaller = comp(hiKey, pHiKey); - const bool takePartnerHi = takeLarger ? hiSelfSmaller : !hiSelfSmaller; - hiKey = takePartnerHi ? pHiKey : hiKey; - hiVal = takePartnerHi ? pHiVal : hiVal; - - sharedmemAdaptor.workgroupExecutionAndMemoryBarrier(); - } - } + if (k) + sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); - template - static void __call( - NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor, - NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, - NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) - { - NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = config_t::WorkgroupSize; + offsetAccessor.offset = ElementsPerSimpleSort * (k / 2); - const uint32_t invocationID = glsl::gl_LocalInvocationID().x; - const uint32_t subgroupSizeLog2 = glsl::gl_SubgroupSizeLog2(); - const uint32_t subgroupSize = 1u << subgroupSizeLog2; - const uint32_t subgroupID = glsl::gl_SubgroupID(); - const uint32_t numSubgroups = WorkgroupSize / subgroupSize; - const uint32_t numSubgroupsLog2 = findMSB(numSubgroups); + BitonicSort::template __call(offsetAccessor, sharedmemAccessor); + } + accessor = offsetAccessor.accessor; - const bool subgroupAscending = (subgroupID & 1) == 0; - subgroup::bitonic_sort::__call(subgroupAscending, loKey, hiKey, loVal, hiVal); + const uint32_t simpleLog = hlsl::findMSB(ElementsPerSimpleSort - 1) + 1u; + const uint32_t totalLog = hlsl::findMSB(TotalElements - 1) + 1u; - [unroll] - for (uint32_t stage = 0; stage < numSubgroupsLog2; ++stage) + for (uint32_t blockLog = simpleLog + 1u; blockLog <= totalLog; blockLog++) { - const bool isLastStage = (stage == numSubgroupsLog2 - 1); - const bool bitonicAscending = isLastStage ? true : !bool(invocationID & (subgroupSize << (stage + 1))); + // Reverse odd halves for bitonic property + const uint32_t halfLog = blockLog - 1u; + const uint32_t halfSize = 1u << halfLog; + const uint32_t numHalves = TotalElements >> halfLog; - mergeStage(sharedmemAccessor, stage, bitonicAscending, invocationID, loKey, hiKey, loVal, hiVal); + // Process only odd-indexed halves (no thread divergence) + [unroll] + for (uint32_t halfIdx = 1u; halfIdx < numHalves; halfIdx += 2u) + { + const uint32_t halfBaseIdx = halfIdx << halfLog; - const uint32_t subgroupInvocationID = glsl::gl_SubgroupInvocationID(); - subgroup::bitonic_sort::mergeStage(subgroupSizeLog2, bitonicAscending, subgroupInvocationID, loKey, hiKey, loVal, hiVal); - } - + [unroll] + for (uint32_t strideLog = halfLog - 1u; strideLog + 1u > 0u; strideLog--) + { + const uint32_t stride = 1u << strideLog; + const uint32_t virtualThreadsInHalf = halfSize >> 1u; - // Final: ensure lo <= hi within each thread (for ascending sort) - comparator_t comp; - if (comp(hiKey, loKey)) - { - // Swap keys - key_t tempKey = loKey; - loKey = hiKey; - hiKey = tempKey; - // Swap values - value_t tempVal = loVal; - loVal = hiVal; - hiVal = tempVal; + [unroll] + for (uint32_t virtualThreadID = threadID; virtualThreadID < virtualThreadsInHalf; virtualThreadID += WorkgroupSize) + { + const uint32_t localLoIx = ((virtualThreadID & (~(stride - 1u))) << 1u) | (virtualThreadID & (stride - 1u)); + const uint32_t loIx = halfBaseIdx + localLoIx; + const uint32_t hiIx = loIx | stride; + + key_t loKeyGlobal, hiKeyGlobal; + value_t loValGlobal, hiValGlobal; + accessor.template get(loIx, loKeyGlobal); + accessor.template get(hiIx, hiKeyGlobal); + accessor.template get(loIx, loValGlobal); + accessor.template get(hiIx, hiValGlobal); + + key_t tempKey = loKeyGlobal; + loKeyGlobal = hiKeyGlobal; + hiKeyGlobal = tempKey; + + value_t tempVal = loValGlobal; + loValGlobal = hiValGlobal; + hiValGlobal = tempVal; + + accessor.template set(loIx, loKeyGlobal); + accessor.template set(hiIx, hiKeyGlobal); + accessor.template set(loIx, loValGlobal); + accessor.template set(hiIx, hiValGlobal); + } + sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); + } + } + + const uint32_t k = 1u << blockLog; + [unroll] + for (uint32_t strideLog = blockLog - 1u; strideLog + 1u > 0u; strideLog--) + { + const uint32_t stride = 1u << strideLog; + + [unroll] + for (uint32_t virtualThreadID = threadID; virtualThreadID < TotalElements / 2; virtualThreadID += WorkgroupSize) + { + const uint32_t loIx = ((virtualThreadID & (~(stride - 1u))) << 1u) | (virtualThreadID & (stride - 1u)); + const uint32_t hiIx = loIx | stride; + + const bool bitonicAscending = ((loIx & k) == 0u); + + key_t loKeyGlobal, hiKeyGlobal; + value_t loValGlobal, hiValGlobal; + accessor.template get(loIx, loKeyGlobal); + accessor.template get(hiIx, hiKeyGlobal); + accessor.template get(loIx, loValGlobal); + accessor.template get(hiIx, hiValGlobal); + + const bool shouldSwap = comp(hiKeyGlobal, loKeyGlobal); + if (shouldSwap == bitonicAscending) + { + key_t tempKey = loKeyGlobal; + loKeyGlobal = hiKeyGlobal; + hiKeyGlobal = tempKey; + + value_t tempVal = loValGlobal; + loValGlobal = hiValGlobal; + hiValGlobal = tempVal; + } + + accessor.template set(loIx, loKeyGlobal); + accessor.template set(hiIx, hiKeyGlobal); + accessor.template set(loIx, loValGlobal); + accessor.template set(hiIx, hiValGlobal); + } + sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); + } } } }; +// ==================== ElementsPerThreadLog2 > 2 Specialization (Virtual Threading) ==================== +// This handles larger arrays by combining global memory stages with recursive E=1 workgroup sorts +template +struct BitonicSort, device_capabilities> +{ + using config_t = bitonic_sort::bitonic_sort_config; + using simple_config_t = bitonic_sort::bitonic_sort_config<1, WorkgroupSizeLog2, KeyType, ValueType, Comparator>; + + using key_t = KeyType; + using value_t = ValueType; + using comparator_t = Comparator; + + template + static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor) + { + NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = config_t::WorkgroupSize; + NBL_CONSTEXPR_STATIC_INLINE uint32_t ElementsPerThread = config_t::ElementsPerInvocation; + NBL_CONSTEXPR_STATIC_INLINE uint32_t TotalElements = WorkgroupSize * ElementsPerThread; + NBL_CONSTEXPR_STATIC_INLINE uint32_t ElementsPerSimpleSort = WorkgroupSize * 2; + + const uint32_t threadID = glsl::gl_LocalInvocationID().x; + comparator_t comp; + + // PHASE 1: Sub-sorts in chunks of size WorkgroupSize*2 + accessor_adaptors::Offset offsetAccessor; + offsetAccessor.accessor = accessor; + + const uint32_t numSub = TotalElements / ElementsPerSimpleSort; + + [unroll] + for (uint32_t sub = 0; sub < numSub; sub++) + { + if (sub) + sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); + + offsetAccessor.offset = sub * ElementsPerSimpleSort; + + // Call E=1 workgroup sort + BitonicSort::template __call(offsetAccessor, sharedmemAccessor); + } + + // PHASE 2: Reverse odd-indexed chunks to form bitonic sequences + const uint32_t simpleLog = hlsl::findMSB(ElementsPerSimpleSort - 1) + 1u; + [unroll] + for (uint32_t sub = 1; sub < numSub; sub += 2) // Only odd-indexed chunks + { + offsetAccessor.offset = sub * ElementsPerSimpleSort; + [unroll] + for (uint32_t strideLog = simpleLog - 1u; strideLog + 1u > 0u; strideLog--) + { + const uint32_t stride = 1u << strideLog; + [unroll] + for (uint32_t virtualThreadID = threadID; virtualThreadID < ElementsPerSimpleSort / 2; virtualThreadID += WorkgroupSize) + { + const uint32_t loIx = (((virtualThreadID & (~(stride - 1u))) << 1u) | (virtualThreadID & (stride - 1u))) + offsetAccessor.offset; + const uint32_t hiIx = loIx | stride; + + key_t loKeyGlobal, hiKeyGlobal; + value_t loValGlobal, hiValGlobal; + accessor.template get(loIx, loKeyGlobal); + accessor.template get(hiIx, hiKeyGlobal); + accessor.template get(loIx, loValGlobal); + accessor.template get(hiIx, hiValGlobal); + + // Always swap to reverse + key_t tempKey = loKeyGlobal; + loKeyGlobal = hiKeyGlobal; + hiKeyGlobal = tempKey; + + value_t tempVal = loValGlobal; + loValGlobal = hiValGlobal; + hiValGlobal = tempVal; + + accessor.template set(loIx, loKeyGlobal); + accessor.template set(hiIx, hiKeyGlobal); + accessor.template set(loIx, loValGlobal); + accessor.template set(hiIx, hiValGlobal); + } + sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); + } + } + + // PHASE 3: Global memory bitonic merge + const uint32_t totalLog = hlsl::findMSB(TotalElements - 1) + 1u; + [unroll] + for (uint32_t blockLog = simpleLog + 1u; blockLog <= totalLog; blockLog++) + { + const uint32_t k = 1u << blockLog; + [unroll] + for (uint32_t strideLog = blockLog - 1u; strideLog + 1u > 0u; strideLog--) + { + const uint32_t stride = 1u << strideLog; + [unroll] + for (uint32_t virtualThreadID = threadID; virtualThreadID < TotalElements / 2; virtualThreadID += WorkgroupSize) + { + const uint32_t loIx = ((virtualThreadID & (~(stride - 1u))) << 1u) | (virtualThreadID & (stride - 1u)); + const uint32_t hiIx = loIx | stride; + + const bool bitonicAscending = ((loIx & k) == 0u); + + key_t loKeyGlobal, hiKeyGlobal; + value_t loValGlobal, hiValGlobal; + accessor.template get(loIx, loKeyGlobal); + accessor.template get(hiIx, hiKeyGlobal); + accessor.template get(loIx, loValGlobal); + accessor.template get(hiIx, hiValGlobal); + + const bool shouldSwap = comp(hiKeyGlobal, loKeyGlobal); + if (shouldSwap == bitonicAscending) + { + key_t tempKey = loKeyGlobal; + loKeyGlobal = hiKeyGlobal; + hiKeyGlobal = tempKey; + + value_t tempVal = loValGlobal; + loValGlobal = hiValGlobal; + hiValGlobal = tempVal; + } + + accessor.template set(loIx, loKeyGlobal); + accessor.template set(hiIx, hiKeyGlobal); + accessor.template set(loIx, loValGlobal); + accessor.template set(hiIx, hiValGlobal); + } + sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); + } + } + } +}; } } From 0eb301ee4ded73e2d81998fb70fad7d84f2dfd6f Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Wed, 29 Oct 2025 19:54:43 +0330 Subject: [PATCH 13/30] Update bitonic_sort.hlsl --- .../builtin/hlsl/workgroup/bitonic_sort.hlsl | 114 +++++------------- 1 file changed, 28 insertions(+), 86 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl b/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl index e4d507598e..68d21308b6 100644 --- a/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl @@ -16,8 +16,7 @@ namespace workgroup { namespace bitonic_sort { -// Reorder: non-type parameters FIRST, then typename parameters with defaults -// This matches FFT's pattern and avoids DXC bugs + template > struct bitonic_sort_config { @@ -52,7 +51,7 @@ struct BitonicSort; using value_adaptor = accessor_adaptors::StructureOfArrays; @@ -63,56 +62,45 @@ struct BitonicSort> 1; - // Separate shuffles for lo/hi streams (two-round shuffle as per PR review) - // TODO: Consider single-round shuffle of key-value pairs for better performance key_t pLoKey = loKey; shuffleXor(pLoKey, threadStride, sharedmemAdaptorKey); + sharedmemAdaptorKey.workgroupExecutionAndMemoryBarrier(); + value_t pLoVal = loVal; shuffleXor(pLoVal, threadStride, sharedmemAdaptorValue); - - sharedmemAdaptorKey.workgroupExecutionAndMemoryBarrier(); sharedmemAdaptorValue.workgroupExecutionAndMemoryBarrier(); key_t pHiKey = hiKey; shuffleXor(pHiKey, threadStride, sharedmemAdaptorKey); + sharedmemAdaptorKey.workgroupExecutionAndMemoryBarrier(); + value_t pHiVal = hiVal; shuffleXor(pHiVal, threadStride, sharedmemAdaptorValue); const bool isUpper = (invocationID & threadStride) != 0; const bool takeLarger = isUpper == bitonicAscending; - comparator_t comp; - - // lo update - const bool loSelfSmaller = comp(loKey, pLoKey); - const bool takePartnerLo = takeLarger ? loSelfSmaller : !loSelfSmaller; - loKey = takePartnerLo ? pLoKey : loKey; - loVal = takePartnerLo ? pLoVal : loVal; - - // hi update - const bool hiSelfSmaller = comp(hiKey, pHiKey); - const bool takePartnerHi = takeLarger ? hiSelfSmaller : !hiSelfSmaller; - hiKey = takePartnerHi ? pHiKey : hiKey; - hiVal = takePartnerHi ? pHiVal : hiVal; + nbl::hlsl::bitonic_sort::compareExchangeWithPartner(takeLarger, loKey, pLoKey, hiKey, pHiKey, loVal, pLoVal, hiVal, pHiVal, comp); - sharedmemAdaptorKey.workgroupExecutionAndMemoryBarrier(); - sharedmemAdaptorValue.workgroupExecutionAndMemoryBarrier(); } } template static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor) { - NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = config_t::WorkgroupSize; + const uint32_t WorkgroupSize = config_t::WorkgroupSize; const uint32_t invocationID = glsl::gl_LocalInvocationID().x; const uint32_t subgroupSizeLog2 = glsl::gl_SubgroupSizeLog2(); @@ -121,9 +109,8 @@ struct BitonicSort(loIdx, loKey); @@ -146,21 +133,7 @@ struct BitonicSort::mergeStage(subgroupSizeLog2, bitonicAscending, subgroupInvocationID, loKey, hiKey, loVal, hiVal); } - // Final: ensure lo <= hi within each thread (for ascending sort) - comparator_t comp; - if (comp(hiKey, loKey)) - { - // Swap keys - key_t tempKey = loKey; - loKey = hiKey; - hiKey = tempKey; - // Swap values - value_t tempVal = loVal; - loVal = hiVal; - hiVal = tempVal; - } - // Store results back accessor.template set(loIdx, loKey); accessor.template set(hiIdx, hiKey); accessor.template set(loIdx, loVal); @@ -181,10 +154,10 @@ struct BitonicSort static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor) { - NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = config_t::WorkgroupSize; - NBL_CONSTEXPR_STATIC_INLINE uint32_t ElementsPerThread = config_t::ElementsPerInvocation; - NBL_CONSTEXPR_STATIC_INLINE uint32_t TotalElements = WorkgroupSize * ElementsPerThread; - NBL_CONSTEXPR_STATIC_INLINE uint32_t ElementsPerSimpleSort = WorkgroupSize * 2; // E=1 handles WG*2 elements + const uint32_t WorkgroupSize = config_t::WorkgroupSize; + const uint32_t ElementsPerThread = config_t::ElementsPerInvocation; + const uint32_t TotalElements = WorkgroupSize * ElementsPerThread; + const uint32_t ElementsPerSimpleSort = WorkgroupSize * 2; // E=1 handles WG*2 elements const uint32_t threadID = glsl::gl_LocalInvocationID().x; comparator_t comp; @@ -202,6 +175,7 @@ struct BitonicSort::template __call(offsetAccessor, sharedmemAccessor); } + sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); accessor = offsetAccessor.accessor; @@ -242,13 +216,7 @@ struct BitonicSort(loIx, loValGlobal); accessor.template get(hiIx, hiValGlobal); - key_t tempKey = loKeyGlobal; - loKeyGlobal = hiKeyGlobal; - hiKeyGlobal = tempKey; - - value_t tempVal = loValGlobal; - loValGlobal = hiValGlobal; - hiValGlobal = tempVal; + nbl::hlsl::bitonic_sort::swap(loKeyGlobal, hiKeyGlobal, loValGlobal, hiValGlobal); accessor.template set(loIx, loKeyGlobal); accessor.template set(hiIx, hiKeyGlobal); @@ -280,17 +248,7 @@ struct BitonicSort(loIx, loValGlobal); accessor.template get(hiIx, hiValGlobal); - const bool shouldSwap = comp(hiKeyGlobal, loKeyGlobal); - if (shouldSwap == bitonicAscending) - { - key_t tempKey = loKeyGlobal; - loKeyGlobal = hiKeyGlobal; - hiKeyGlobal = tempKey; - - value_t tempVal = loValGlobal; - loValGlobal = hiValGlobal; - hiValGlobal = tempVal; - } + nbl::hlsl::bitonic_sort::compareSwap(bitonicAscending, loKeyGlobal, hiKeyGlobal, loValGlobal, hiValGlobal, comp); accessor.template set(loIx, loKeyGlobal); accessor.template set(hiIx, hiKeyGlobal); @@ -317,10 +275,10 @@ struct BitonicSort static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor) { - NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = config_t::WorkgroupSize; - NBL_CONSTEXPR_STATIC_INLINE uint32_t ElementsPerThread = config_t::ElementsPerInvocation; - NBL_CONSTEXPR_STATIC_INLINE uint32_t TotalElements = WorkgroupSize * ElementsPerThread; - NBL_CONSTEXPR_STATIC_INLINE uint32_t ElementsPerSimpleSort = WorkgroupSize * 2; + const uint32_t WorkgroupSize = config_t::WorkgroupSize; + const uint32_t ElementsPerThread = config_t::ElementsPerInvocation; + const uint32_t TotalElements = WorkgroupSize * ElementsPerThread; + const uint32_t ElementsPerSimpleSort = WorkgroupSize * 2; const uint32_t threadID = glsl::gl_LocalInvocationID().x; comparator_t comp; @@ -342,11 +300,12 @@ struct BitonicSort::template __call(offsetAccessor, sharedmemAccessor); } + sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); // PHASE 2: Reverse odd-indexed chunks to form bitonic sequences const uint32_t simpleLog = hlsl::findMSB(ElementsPerSimpleSort - 1) + 1u; [unroll] - for (uint32_t sub = 1; sub < numSub; sub += 2) // Only odd-indexed chunks + for (uint32_t sub = 1; sub < numSub; sub += 2) { offsetAccessor.offset = sub * ElementsPerSimpleSort; [unroll] @@ -366,14 +325,7 @@ struct BitonicSort(loIx, loValGlobal); accessor.template get(hiIx, hiValGlobal); - // Always swap to reverse - key_t tempKey = loKeyGlobal; - loKeyGlobal = hiKeyGlobal; - hiKeyGlobal = tempKey; - - value_t tempVal = loValGlobal; - loValGlobal = hiValGlobal; - hiValGlobal = tempVal; + nbl::hlsl::bitonic_sort::swap(loKeyGlobal, hiKeyGlobal, loValGlobal, hiValGlobal); accessor.template set(loIx, loKeyGlobal); accessor.template set(hiIx, hiKeyGlobal); @@ -409,17 +361,7 @@ struct BitonicSort(loIx, loValGlobal); accessor.template get(hiIx, hiValGlobal); - const bool shouldSwap = comp(hiKeyGlobal, loKeyGlobal); - if (shouldSwap == bitonicAscending) - { - key_t tempKey = loKeyGlobal; - loKeyGlobal = hiKeyGlobal; - hiKeyGlobal = tempKey; - - value_t tempVal = loValGlobal; - loValGlobal = hiValGlobal; - hiValGlobal = tempVal; - } + nbl::hlsl::bitonic_sort::compareSwap(bitonicAscending, loKeyGlobal, hiKeyGlobal, loValGlobal, hiValGlobal, comp); accessor.template set(loIx, loKeyGlobal); accessor.template set(hiIx, hiKeyGlobal); From 52a7cb8ccc26f46f32380a8c718527fb4dbe6c68 Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Wed, 29 Oct 2025 19:55:34 +0330 Subject: [PATCH 14/30] Update common.hlsl --- .../nbl/builtin/hlsl/bitonic_sort/common.hlsl | 67 ++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl b/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl index 84615be645..d2c99606a9 100644 --- a/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl +++ b/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl @@ -12,8 +12,73 @@ namespace hlsl namespace bitonic_sort { +template +void compareExchangeWithPartner( + bool takeLarger, + NBL_REF_ARG(KeyType) loKey, + NBL_CONST_REF_ARG(KeyType) partnerLoKey, + NBL_REF_ARG(KeyType) hiKey, + NBL_CONST_REF_ARG(KeyType) partnerHiKey, + NBL_REF_ARG(ValueType) loVal, + NBL_CONST_REF_ARG(ValueType) partnerLoVal, + NBL_REF_ARG(ValueType) hiVal, + NBL_CONST_REF_ARG(ValueType) partnerHiVal, + NBL_CONST_REF_ARG(Comparator) comp) +{ + // Process lo pair + const bool loSelfSmaller = comp(loKey, partnerLoKey); + const bool takePartnerLo = takeLarger ? loSelfSmaller : !loSelfSmaller; + loKey = takePartnerLo ? partnerLoKey : loKey; + loVal = takePartnerLo ? partnerLoVal : loVal; + + // Process hi pair + const bool hiSelfSmaller = comp(hiKey, partnerHiKey); + const bool takePartnerHi = takeLarger ? hiSelfSmaller : !hiSelfSmaller; + hiKey = takePartnerHi ? partnerHiKey : hiKey; + hiVal = takePartnerHi ? partnerHiVal : hiVal; +} + + +template +void compareSwap( + bool ascending, + NBL_REF_ARG(KeyType) loKey, + NBL_REF_ARG(KeyType) hiKey, + NBL_REF_ARG(ValueType) loVal, + NBL_REF_ARG(ValueType) hiVal, + NBL_CONST_REF_ARG(Comparator) comp) +{ + const bool shouldSwap = comp(hiKey, loKey); + + const bool doSwap = (shouldSwap == ascending); + + KeyType tempKey = loKey; + loKey = doSwap ? hiKey : loKey; + hiKey = doSwap ? tempKey : hiKey; + + ValueType tempVal = loVal; + loVal = doSwap ? hiVal : loVal; + hiVal = doSwap ? tempVal : hiVal; +} + +template +void swap( + NBL_REF_ARG(KeyType) loKey, + NBL_REF_ARG(KeyType) hiKey, + NBL_REF_ARG(ValueType) loVal, + NBL_REF_ARG(ValueType) hiVal) +{ + KeyType tempKey = loKey; + loKey = hiKey; + hiKey = tempKey; + + ValueType tempVal = loVal; + loVal = hiVal; + hiVal = tempVal; +} + } } } -#endif \ No newline at end of file +#endif From 0b60d0c9c71aafa27dd04a36dbbe88607fcfb9a2 Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Wed, 29 Oct 2025 19:56:21 +0330 Subject: [PATCH 15/30] Update bitonic_sort.hlsl --- .../builtin/hlsl/subgroup/bitonic_sort.hlsl | 151 +++++++----------- 1 file changed, 61 insertions(+), 90 deletions(-) diff --git a/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl b/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl index c7bed6f823..9a905d36b7 100644 --- a/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl +++ b/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl @@ -6,105 +6,76 @@ #include "nbl/builtin/hlsl/functional.hlsl" namespace nbl { - namespace hlsl - { - namespace subgroup - { - - template > - struct bitonic_sort_config - { - using key_t = KeyType; - using value_t = ValueType; - using comparator_t = Comparator; - }; - - template - struct bitonic_sort; +namespace hlsl +{ +namespace subgroup +{ - template - struct bitonic_sort, device_capabilities> - { - using config_t = bitonic_sort_config; - using key_t = typename config_t::key_t; - using value_t = typename config_t::value_t; - using comparator_t = typename config_t::comparator_t; +template > +struct bitonic_sort_config +{ + using key_t = KeyType; + using value_t = ValueType; + using comparator_t = Comparator; +}; - // Thread-level compare and swap (operates on lo/hi in registers) - static void compareAndSwap(bool ascending, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, - NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) - { - comparator_t comp; - const bool shouldSwap = ascending ? comp(hiKey, loKey) : comp(loKey, hiKey); - if (shouldSwap) - { - // Swap keys - key_t tempKey = loKey; - loKey = hiKey; - hiKey = tempKey; - // Swap values - value_t tempVal = loVal; - loVal = hiVal; - hiVal = tempVal; - } - } +template +struct bitonic_sort; - static void mergeStage(uint32_t stage, bool bitonicAscending, uint32_t invocationID, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, - NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) - { - [unroll] - for (uint32_t pass = 0; pass <= stage; pass++) - { - const uint32_t stride = 1u << (stage - pass); // Element stride - const uint32_t threadStride = stride >> 1; - if (threadStride == 0) - { - // Local compare and swap for stage 0 - compareAndSwap(bitonicAscending, loKey, hiKey, loVal, hiVal); - } - else - { - // Shuffle from partner using XOR - const key_t pLoKey = glsl::subgroupShuffleXor(loKey, threadStride); - const key_t pHiKey = glsl::subgroupShuffleXor(hiKey, threadStride); - const value_t pLoVal = glsl::subgroupShuffleXor(loVal, threadStride); - const value_t pHiVal = glsl::subgroupShuffleXor(hiVal, threadStride); +template +struct bitonic_sort, device_capabilities> +{ + using config_t = bitonic_sort_config; + using key_t = typename config_t::key_t; + using value_t = typename config_t::value_t; + using comparator_t = typename config_t::comparator_t; - // Branchless compare-and-swap - const bool isUpper = bool(invocationID & threadStride); - const bool takeLarger = isUpper == bitonicAscending; - comparator_t comp; + static void mergeStage(uint32_t stage, bool bitonicAscending, uint32_t invocationID, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, + NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) + { + comparator_t comp; - // lo update - const bool loSelfSmaller = comp(loKey, pLoKey); - const bool takePartnerLo = takeLarger ? loSelfSmaller : !loSelfSmaller; - loKey = takePartnerLo ? pLoKey : loKey; - loVal = takePartnerLo ? pLoVal : loVal; + [unroll] + for (uint32_t pass = 0; pass <= stage; pass++) + { + const uint32_t stride = 1u << (stage - pass); // Element stride + const uint32_t threadStride = stride >> 1; + if (threadStride == 0) + { + // Local compare and swap for stage 0 + nbl::hlsl::bitonic_sort::compareSwap(bitonicAscending, loKey, hiKey, loVal, hiVal, comp); + } + else + { + // Shuffle from partner using XOR + const key_t pLoKey = glsl::subgroupShuffleXor(loKey, threadStride); + const key_t pHiKey = glsl::subgroupShuffleXor(hiKey, threadStride); + const value_t pLoVal = glsl::subgroupShuffleXor(loVal, threadStride); + const value_t pHiVal = glsl::subgroupShuffleXor(hiVal, threadStride); - // hi update - const bool hiSelfSmaller = comp(hiKey, pHiKey); - const bool takePartnerHi = takeLarger ? hiSelfSmaller : !hiSelfSmaller; - hiKey = takePartnerHi ? pHiKey : hiKey; - hiVal = takePartnerHi ? pHiVal : hiVal; - } - } - } + const bool isUpper = bool(invocationID & threadStride); + const bool takeLarger = isUpper == bitonicAscending; - static void __call(bool ascending, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, - NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) - { - const uint32_t invocationID = glsl::gl_SubgroupInvocationID(); - const uint32_t subgroupSizeLog2 = glsl::gl_SubgroupSizeLog2(); - [unroll] - for (uint32_t stage = 0; stage <= subgroupSizeLog2; stage++) - { - const bool bitonicAscending = (stage == subgroupSizeLog2) ? ascending : !bool(invocationID & (1u << stage)); - mergeStage(stage, bitonicAscending, invocationID, loKey, hiKey, loVal, hiVal); - } - } - }; + nbl::hlsl::bitonic_sort::compareExchangeWithPartner(takeLarger, loKey, pLoKey, hiKey, pHiKey, loVal, pLoVal, hiVal, pHiVal, comp); + } + } + } + static void __call(bool ascending, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, + NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) + { + const uint32_t invocationID = glsl::gl_SubgroupInvocationID(); + const uint32_t subgroupSizeLog2 = glsl::gl_SubgroupSizeLog2(); + [unroll] + for (uint32_t stage = 0; stage <= subgroupSizeLog2; stage++) + { + const bool bitonicAscending = (stage == subgroupSizeLog2) ? ascending : !bool(invocationID & (1u << stage)); + mergeStage(stage, bitonicAscending, invocationID, loKey, hiKey, loVal, hiVal); } } +}; + +} +} } #endif From cbfa188510b210bc166dd6bee7cf0673edc3333a Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Sun, 2 Nov 2025 19:01:47 +0330 Subject: [PATCH 16/30] Update common.hlsl --- .../nbl/builtin/hlsl/bitonic_sort/common.hlsl | 92 ++++++++++++++----- 1 file changed, 70 insertions(+), 22 deletions(-) diff --git a/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl b/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl index d2c99606a9..30b94b442f 100644 --- a/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl +++ b/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl @@ -4,6 +4,7 @@ #include #include #include +#include namespace nbl { @@ -14,24 +15,22 @@ namespace bitonic_sort template void compareExchangeWithPartner( - bool takeLarger, - NBL_REF_ARG(KeyType) loKey, - NBL_CONST_REF_ARG(KeyType) partnerLoKey, - NBL_REF_ARG(KeyType) hiKey, - NBL_CONST_REF_ARG(KeyType) partnerHiKey, - NBL_REF_ARG(ValueType) loVal, - NBL_CONST_REF_ARG(ValueType) partnerLoVal, - NBL_REF_ARG(ValueType) hiVal, - NBL_CONST_REF_ARG(ValueType) partnerHiVal, - NBL_CONST_REF_ARG(Comparator) comp) +bool takeLarger, +NBL_REF_ARG(KeyType) loKey, +NBL_CONST_REF_ARG(KeyType) partnerLoKey, +NBL_REF_ARG(KeyType) hiKey, +NBL_CONST_REF_ARG(KeyType) partnerHiKey, +NBL_REF_ARG(ValueType) loVal, +NBL_CONST_REF_ARG(ValueType) partnerLoVal, +NBL_REF_ARG(ValueType) hiVal, +NBL_CONST_REF_ARG(ValueType) partnerHiVal, +NBL_CONST_REF_ARG(Comparator) comp) { - // Process lo pair const bool loSelfSmaller = comp(loKey, partnerLoKey); const bool takePartnerLo = takeLarger ? loSelfSmaller : !loSelfSmaller; loKey = takePartnerLo ? partnerLoKey : loKey; loVal = takePartnerLo ? partnerLoVal : loVal; - // Process hi pair const bool hiSelfSmaller = comp(hiKey, partnerHiKey); const bool takePartnerHi = takeLarger ? hiSelfSmaller : !hiSelfSmaller; hiKey = takePartnerHi ? partnerHiKey : hiKey; @@ -41,12 +40,12 @@ void compareExchangeWithPartner( template void compareSwap( - bool ascending, - NBL_REF_ARG(KeyType) loKey, - NBL_REF_ARG(KeyType) hiKey, - NBL_REF_ARG(ValueType) loVal, - NBL_REF_ARG(ValueType) hiVal, - NBL_CONST_REF_ARG(Comparator) comp) +bool ascending, +NBL_REF_ARG(KeyType) loKey, +NBL_REF_ARG(KeyType) hiKey, +NBL_REF_ARG(ValueType) loVal, +NBL_REF_ARG(ValueType) hiVal, +NBL_CONST_REF_ARG(Comparator) comp) { const bool shouldSwap = comp(hiKey, loKey); @@ -63,10 +62,10 @@ void compareSwap( template void swap( - NBL_REF_ARG(KeyType) loKey, - NBL_REF_ARG(KeyType) hiKey, - NBL_REF_ARG(ValueType) loVal, - NBL_REF_ARG(ValueType) hiVal) +NBL_REF_ARG(KeyType) loKey, +NBL_REF_ARG(KeyType) hiKey, +NBL_REF_ARG(ValueType) loVal, +NBL_REF_ARG(ValueType) hiVal) { KeyType tempKey = loKey; loKey = hiKey; @@ -77,6 +76,55 @@ void swap( hiVal = tempVal; } + + +template +void compareExchangeWithPartner( +bool takeLarger, +NBL_REF_ARG(pair) loPair, +NBL_CONST_REF_ARG(pair) partnerLoPair, +NBL_REF_ARG(pair) hiPair, +NBL_CONST_REF_ARG(pair) partnerHiPair, +NBL_CONST_REF_ARG(Comparator) comp) +{ + const bool loSelfSmaller = comp(loPair.first, partnerLoPair.first); + const bool takePartnerLo = takeLarger ? loSelfSmaller : !loSelfSmaller; + loPair.first = takePartnerLo ? partnerLoPair.first : loPair.first; + loPair.second = takePartnerLo ? partnerLoPair.second : loPair.second; + + const bool hiSelfSmaller = comp(hiPair.first, partnerHiPair.first); + const bool takePartnerHi = takeLarger ? hiSelfSmaller : !hiSelfSmaller; + hiPair.first = takePartnerHi ? partnerHiPair.first : hiPair.first; + hiPair.second = takePartnerHi ? partnerHiPair.second : hiPair.second; +} + +template +void compareSwap( +bool ascending, +NBL_REF_ARG(pair) loPair, +NBL_REF_ARG(pair) hiPair, +NBL_CONST_REF_ARG(Comparator) comp) +{ + const bool shouldSwap = comp(hiPair.first, loPair.first); + const bool doSwap = (shouldSwap == ascending); + + KeyType tempKey = loPair.first; + ValueType tempVal = loPair.second; + loPair.first = doSwap ? hiPair.first : loPair.first; + loPair.second = doSwap ? hiPair.second : loPair.second; + hiPair.first = doSwap ? tempKey : hiPair.first; + hiPair.second = doSwap ? tempVal : hiPair.second; +} + +template +void swap( +NBL_REF_ARG(pair) loPair, +NBL_REF_ARG(pair) hiPair) +{ + pair temp = loPair; + loPair = hiPair; + hiPair = temp; +} } } } From 0de0167397f92218fc97827149c71f8a4581f2df Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Sun, 2 Nov 2025 19:02:29 +0330 Subject: [PATCH 17/30] Update bitonic_sort.hlsl --- .../builtin/hlsl/workgroup/bitonic_sort.hlsl | 258 ++++-------------- 1 file changed, 54 insertions(+), 204 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl b/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl index 68d21308b6..8edccee7a0 100644 --- a/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl @@ -7,6 +7,7 @@ #include "nbl/builtin/hlsl/bit.hlsl" #include "nbl/builtin/hlsl/workgroup/shuffle.hlsl" #include "nbl/builtin/hlsl/workgroup/basic.hlsl" +#include "nbl/builtin/hlsl/concepts/accessors/bitonic_sort.hlsl" namespace nbl { @@ -17,18 +18,19 @@ namespace workgroup namespace bitonic_sort { -template > +template NBL_PRIMARY_REQUIRES(_ElementsPerInvocationLog2 >= 1 && _WorkgroupSizeLog2 >= 5) struct bitonic_sort_config { using key_t = KeyType; using value_t = ValueType; using comparator_t = Comparator; - NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocationLog2 = _ElementsPerInvocationLog2; NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2; NBL_CONSTEXPR_STATIC_INLINE uint32_t ElementsPerInvocation = 1u << ElementsPerInvocationLog2; NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = 1u << WorkgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedmemDWORDs = sizeof(pair) / sizeof(uint32_t) * WorkgroupSize; + }; } @@ -48,19 +50,10 @@ struct BitonicSort; template - static void mergeStage(NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor, uint32_t stage, bool bitonicAscending, uint32_t invocationID, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, - NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) + static void mergeStage(NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor, uint32_t stage, bool bitonicAscending, uint32_t invocationID, + NBL_REF_ARG(nbl::hlsl::pair) loPair, NBL_REF_ARG(nbl::hlsl::pair) hiPair) { const uint32_t WorkgroupSize = config_t::WorkgroupSize; - using key_adaptor = accessor_adaptors::StructureOfArrays; - using value_adaptor = accessor_adaptors::StructureOfArrays; - - key_adaptor sharedmemAdaptorKey; - sharedmemAdaptorKey.accessor = sharedmemAccessor; - - value_adaptor sharedmemAdaptorValue; - sharedmemAdaptorValue.accessor = sharedmemAccessor; - const uint32_t subgroupSizeLog2 = glsl::gl_SubgroupSizeLog2(); comparator_t comp; @@ -68,36 +61,27 @@ struct BitonicSort> 1; - key_t pLoKey = loKey; - shuffleXor(pLoKey, threadStride, sharedmemAdaptorKey); - sharedmemAdaptorKey.workgroupExecutionAndMemoryBarrier(); - - value_t pLoVal = loVal; - shuffleXor(pLoVal, threadStride, sharedmemAdaptorValue); - sharedmemAdaptorValue.workgroupExecutionAndMemoryBarrier(); + nbl::hlsl::pair pLoPair = loPair; + shuffleXor(pLoPair, threadStride, sharedmemAccessor); + sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); - key_t pHiKey = hiKey; - shuffleXor(pHiKey, threadStride, sharedmemAdaptorKey); - sharedmemAdaptorKey.workgroupExecutionAndMemoryBarrier(); - - value_t pHiVal = hiVal; - shuffleXor(pHiVal, threadStride, sharedmemAdaptorValue); + nbl::hlsl::pair pHiPair = hiPair; + shuffleXor(pHiPair, threadStride, sharedmemAccessor); const bool isUpper = (invocationID & threadStride) != 0; const bool takeLarger = isUpper == bitonicAscending; - nbl::hlsl::bitonic_sort::compareExchangeWithPartner(takeLarger, loKey, pLoKey, hiKey, pHiKey, loVal, pLoVal, hiVal, pHiVal, comp); - + nbl::hlsl::bitonic_sort::compareExchangeWithPartner(takeLarger, loPair, pLoPair, hiPair, pHiPair, comp); } } - template + template&& bitonic_sort::BitonicSortSharedMemoryAccessor) static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor) { const uint32_t WorkgroupSize = config_t::WorkgroupSize; @@ -111,15 +95,13 @@ struct BitonicSort(loIdx, loKey); - accessor.template get(hiIdx, hiKey); - accessor.template get(loIdx, loVal); - accessor.template get(hiIdx, hiVal); + + nbl::hlsl::pair loPair, hiPair; + accessor.template get >(loIdx, loPair); + accessor.template get >(hiIdx, hiPair); const bool subgroupAscending = (subgroupID & 1) == 0; - subgroup::bitonic_sort::__call(subgroupAscending, loKey, hiKey, loVal, hiVal); + subgroup::bitonic_sort::__call(subgroupAscending, loPair.first, hiPair.first, loPair.second, hiPair.second); const uint32_t subgroupInvocationID = glsl::gl_SubgroupInvocationID(); @@ -128,139 +110,17 @@ struct BitonicSort::mergeStage(subgroupSizeLog2, bitonicAscending, subgroupInvocationID, loKey, hiKey, loVal, hiVal); + subgroup::bitonic_sort::mergeStage(subgroupSizeLog2, bitonicAscending, subgroupInvocationID, loPair.first, hiPair.first, loPair.second, hiPair.second); } - - accessor.template set(loIdx, loKey); - accessor.template set(hiIdx, hiKey); - accessor.template set(loIdx, loVal); - accessor.template set(hiIdx, hiVal); + accessor.template set >(loIdx, loPair); + accessor.template set >(hiIdx, hiPair); } }; -// ==================== ElementsPerThreadLog2 = 2 Specialization (Virtual Threading) ==================== -template -struct BitonicSort, device_capabilities> -{ - using config_t = bitonic_sort::bitonic_sort_config<2, WorkgroupSizeLog2, KeyType, ValueType, Comparator>; - using simple_config_t = bitonic_sort::bitonic_sort_config<1, WorkgroupSizeLog2, KeyType, ValueType, Comparator>; - using key_t = KeyType; - using value_t = ValueType; - using comparator_t = Comparator; - - template - static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor) - { - const uint32_t WorkgroupSize = config_t::WorkgroupSize; - const uint32_t ElementsPerThread = config_t::ElementsPerInvocation; - const uint32_t TotalElements = WorkgroupSize * ElementsPerThread; - const uint32_t ElementsPerSimpleSort = WorkgroupSize * 2; // E=1 handles WG*2 elements - - const uint32_t threadID = glsl::gl_LocalInvocationID().x; - comparator_t comp; - - accessor_adaptors::Offset offsetAccessor; - offsetAccessor.accessor = accessor; - - [unroll] - for (uint32_t k = 0; k < ElementsPerThread; k += 2) - { - if (k) - sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); - - offsetAccessor.offset = ElementsPerSimpleSort * (k / 2); - - BitonicSort::template __call(offsetAccessor, sharedmemAccessor); - } - sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); - - accessor = offsetAccessor.accessor; - - const uint32_t simpleLog = hlsl::findMSB(ElementsPerSimpleSort - 1) + 1u; - const uint32_t totalLog = hlsl::findMSB(TotalElements - 1) + 1u; - - [unroll] - for (uint32_t blockLog = simpleLog + 1u; blockLog <= totalLog; blockLog++) - { - // Reverse odd halves for bitonic property - const uint32_t halfLog = blockLog - 1u; - const uint32_t halfSize = 1u << halfLog; - const uint32_t numHalves = TotalElements >> halfLog; - - // Process only odd-indexed halves (no thread divergence) - [unroll] - for (uint32_t halfIdx = 1u; halfIdx < numHalves; halfIdx += 2u) - { - const uint32_t halfBaseIdx = halfIdx << halfLog; - - [unroll] - for (uint32_t strideLog = halfLog - 1u; strideLog + 1u > 0u; strideLog--) - { - const uint32_t stride = 1u << strideLog; - const uint32_t virtualThreadsInHalf = halfSize >> 1u; - - [unroll] - for (uint32_t virtualThreadID = threadID; virtualThreadID < virtualThreadsInHalf; virtualThreadID += WorkgroupSize) - { - const uint32_t localLoIx = ((virtualThreadID & (~(stride - 1u))) << 1u) | (virtualThreadID & (stride - 1u)); - const uint32_t loIx = halfBaseIdx + localLoIx; - const uint32_t hiIx = loIx | stride; - - key_t loKeyGlobal, hiKeyGlobal; - value_t loValGlobal, hiValGlobal; - accessor.template get(loIx, loKeyGlobal); - accessor.template get(hiIx, hiKeyGlobal); - accessor.template get(loIx, loValGlobal); - accessor.template get(hiIx, hiValGlobal); - - nbl::hlsl::bitonic_sort::swap(loKeyGlobal, hiKeyGlobal, loValGlobal, hiValGlobal); - - accessor.template set(loIx, loKeyGlobal); - accessor.template set(hiIx, hiKeyGlobal); - accessor.template set(loIx, loValGlobal); - accessor.template set(hiIx, hiValGlobal); - } - sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); - } - } - - const uint32_t k = 1u << blockLog; - [unroll] - for (uint32_t strideLog = blockLog - 1u; strideLog + 1u > 0u; strideLog--) - { - const uint32_t stride = 1u << strideLog; - - [unroll] - for (uint32_t virtualThreadID = threadID; virtualThreadID < TotalElements / 2; virtualThreadID += WorkgroupSize) - { - const uint32_t loIx = ((virtualThreadID & (~(stride - 1u))) << 1u) | (virtualThreadID & (stride - 1u)); - const uint32_t hiIx = loIx | stride; - - const bool bitonicAscending = ((loIx & k) == 0u); - - key_t loKeyGlobal, hiKeyGlobal; - value_t loValGlobal, hiValGlobal; - accessor.template get(loIx, loKeyGlobal); - accessor.template get(hiIx, hiKeyGlobal); - accessor.template get(loIx, loValGlobal); - accessor.template get(hiIx, hiValGlobal); - - nbl::hlsl::bitonic_sort::compareSwap(bitonicAscending, loKeyGlobal, hiKeyGlobal, loValGlobal, hiValGlobal, comp); - - accessor.template set(loIx, loKeyGlobal); - accessor.template set(hiIx, hiKeyGlobal); - accessor.template set(loIx, loValGlobal); - accessor.template set(hiIx, hiValGlobal); - } - sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); - } - } - } -}; -// ==================== ElementsPerThreadLog2 > 2 Specialization (Virtual Threading) ==================== +// ==================== ElementsPerThreadLog2 > 1 Specialization (Virtual Threading) ==================== // This handles larger arrays by combining global memory stages with recursive E=1 workgroup sorts template struct BitonicSort, device_capabilities> @@ -295,10 +155,10 @@ struct BitonicSort::template __call(offsetAccessor, sharedmemAccessor); + // Call E=1 workgroup sort + BitonicSort::template __call(offsetAccessor, sharedmemAccessor); } sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); @@ -318,59 +178,49 @@ struct BitonicSort(loIx, loKeyGlobal); - accessor.template get(hiIx, hiKeyGlobal); - accessor.template get(loIx, loValGlobal); - accessor.template get(hiIx, hiValGlobal); + nbl::hlsl::pair loPair, hiPair; + accessor.template get >(loIx, loPair); + accessor.template get >(hiIx, hiPair); - nbl::hlsl::bitonic_sort::swap(loKeyGlobal, hiKeyGlobal, loValGlobal, hiValGlobal); + nbl::hlsl::bitonic_sort::swap(loPair.first, hiPair.first, loPair.second, hiPair.second); - accessor.template set(loIx, loKeyGlobal); - accessor.template set(hiIx, hiKeyGlobal); - accessor.template set(loIx, loValGlobal); - accessor.template set(hiIx, hiValGlobal); + accessor.template set >(loIx, loPair); + accessor.template set >(hiIx, hiPair); } sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); + } } - } - // PHASE 3: Global memory bitonic merge - const uint32_t totalLog = hlsl::findMSB(TotalElements - 1) + 1u; - [unroll] - for (uint32_t blockLog = simpleLog + 1u; blockLog <= totalLog; blockLog++) - { - const uint32_t k = 1u << blockLog; + // PHASE 3: Global memory bitonic merge + const uint32_t totalLog = hlsl::findMSB(TotalElements - 1) + 1u; [unroll] - for (uint32_t strideLog = blockLog - 1u; strideLog + 1u > 0u; strideLog--) + for (uint32_t blockLog = simpleLog + 1u; blockLog <= totalLog; blockLog++) { - const uint32_t stride = 1u << strideLog; + const uint32_t k = 1u << blockLog; [unroll] - for (uint32_t virtualThreadID = threadID; virtualThreadID < TotalElements / 2; virtualThreadID += WorkgroupSize) + for (uint32_t strideLog = blockLog - 1u; strideLog + 1u > 0u; strideLog--) { - const uint32_t loIx = ((virtualThreadID & (~(stride - 1u))) << 1u) | (virtualThreadID & (stride - 1u)); - const uint32_t hiIx = loIx | stride; + const uint32_t stride = 1u << strideLog; + [unroll] + for (uint32_t virtualThreadID = threadID; virtualThreadID < TotalElements / 2; virtualThreadID += WorkgroupSize) + { + const uint32_t loIx = ((virtualThreadID & (~(stride - 1u))) << 1u) | (virtualThreadID & (stride - 1u)); + const uint32_t hiIx = loIx | stride; - const bool bitonicAscending = ((loIx & k) == 0u); + const bool bitonicAscending = ((loIx & k) == 0u); - key_t loKeyGlobal, hiKeyGlobal; - value_t loValGlobal, hiValGlobal; - accessor.template get(loIx, loKeyGlobal); - accessor.template get(hiIx, hiKeyGlobal); - accessor.template get(loIx, loValGlobal); - accessor.template get(hiIx, hiValGlobal); + nbl::hlsl::pair loPair, hiPair; + accessor.template get >(loIx, loPair); + accessor.template get >(hiIx, hiPair); - nbl::hlsl::bitonic_sort::compareSwap(bitonicAscending, loKeyGlobal, hiKeyGlobal, loValGlobal, hiValGlobal, comp); + nbl::hlsl::bitonic_sort::compareSwap(bitonicAscending, loPair.first, hiPair.first, loPair.second, hiPair.second, comp); - accessor.template set(loIx, loKeyGlobal); - accessor.template set(hiIx, hiKeyGlobal); - accessor.template set(loIx, loValGlobal); - accessor.template set(hiIx, hiValGlobal); + accessor.template set >(loIx, loPair); + accessor.template set >(hiIx, hiPair); + } + sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); } - sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); } - } } }; From 1b1ba15ba8170dd788179a086e4aea66cb177004 Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Sun, 2 Nov 2025 19:09:00 +0330 Subject: [PATCH 18/30] Update CMakeLists.txt --- src/nbl/builtin/CMakeLists.txt | 120 +++++++++++++++++++++++---------- 1 file changed, 83 insertions(+), 37 deletions(-) diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index cc81b093a2..b8bb8f039d 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -15,7 +15,15 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "shader/loader/gltf/fragment_impl.g LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "shader/loader/gltf/uv.frag") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "shader/loader/gltf/color.frag") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "shader/loader/gltf/no_uv_color.frag") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "material/lambertian/singletexture/specialized_shader.vert") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "material/lambertian/singletexture/specialized_shader.frag") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "material/debug/vertex_color/specialized_shader.vert") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "material/debug/vertex_normal/specialized_shader.vert") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "material/debug/vertex_normal/specialized_shader.frag") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "material/debug/vertex_uv/specialized_shader.frag") # generic GLSL headers after this line +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/macros.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/algorithm.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ieee754.glsl") # barycentric LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/barycentric/extensions.glsl") @@ -31,6 +39,30 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/legacy_bda_accessor.hlsl" # bump mapping LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bump_mapping/fragment.glsl") # TODO: rename to `frag.glsl` LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bump_mapping/utils.glsl") +# bxdf +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bxdf/common.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bxdf/common_samples.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bxdf/fresnel.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bxdf/ndf/common.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bxdf/ndf/blinn_phong.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bxdf/ndf/beckmann.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bxdf/ndf/ggx.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bxdf/geom/smith/common.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bxdf/geom/smith/beckmann.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bxdf/geom/smith/ggx.glsl") +# brdf +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bxdf/brdf/diffuse/lambert.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bxdf/brdf/diffuse/oren_nayar.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bxdf/brdf/specular/blinn_phong.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bxdf/brdf/specular/beckmann.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bxdf/brdf/specular/ggx.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bxdf/brdf/diffuse/fresnel_correction.glsl") +# bsdf +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bxdf/bsdf/diffuse/lambert.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bxdf/bsdf/specular/common.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bxdf/bsdf/specular/dielectric.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bxdf/bsdf/specular/beckmann.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bxdf/bsdf/specular/ggx.glsl") # colorspace LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/colorspace/EOTF.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/colorspace/OETF.glsl") @@ -55,15 +87,19 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/format/constants.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/loader/mtl/common.glsl") # LoD Library LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/lod_library/structs.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/lod_library/descriptor_set.glsl") # math and limits LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/math/constants.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/math/complex.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/math/functions.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/math/quaternions.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/math/typeless_arithmetic.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/limits/numeric.glsl") # material_compiler LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/material_compiler/common.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/material_compiler/common_declarations.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/material_compiler/common_invariant_declarations.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/material_compiler/rasterization/impl.glsl") # property pool LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/property_pool/transfer.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/property_pool/copy.comp") @@ -73,7 +109,6 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/random/xoroshiro.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/random/pcg.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/random/lcg.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/random/tea.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/random/dim_adaptor_recursive.hlsl") # sampling LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/sampling/bilinear.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/sampling/box_muller_transform.glsl") @@ -84,6 +119,14 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/sampling/projected_spherical_ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/sampling/spherical_rectangle.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/sampling/spherical_triangle.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/sampling/quantized_sequence.glsl") +# global exclusive scan +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/scan/direct.comp") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/scan/declarations.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/scan/descriptors.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/scan/default_scheduler.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/scan/indirect.comp") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/scan/parameters_struct.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/scan/virtual_workgroup.glsl") # faster and easier scan LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/scanning_append/scanning_append.glsl") # scene @@ -102,11 +145,30 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/skinning/debug.vert") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/skinning/linear.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/skinning/render_descriptor_set.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/skinning/update_descriptor_set.glsl") +# subgroup emulation +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/subgroup/arithmetic_portability.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/subgroup/arithmetic_portability_impl.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/subgroup/basic_portability.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/subgroup/fft.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/subgroup/shared_arithmetic_portability.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/subgroup/shared_shuffle_portability.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/subgroup/shuffle_portability.glsl") # utilities LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/utils/common.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/utils/culling.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/utils/compressed_normal_matrix_t.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/utils/normal_decode.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/utils/normal_encode.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/utils/transform.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/utils/morton.glsl") +# workgroup "intrinsics" +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/workgroup/arithmetic.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/workgroup/basic.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/workgroup/ballot.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/workgroup/fft.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/workgroup/shared_arithmetic.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/workgroup/shared_ballot.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/workgroup/shared_fft.glsl") #transform_tree LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/transform_tree/global_transform_update.comp") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/transform_tree/global_transform_and_normal_matrix_update.comp") @@ -120,7 +182,14 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/transform_tree/relative_trans LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/transform_tree/relative_transform_update_common.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/transform_tree/relative_transform_update_descriptor_set.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/transform_tree/debug.vert") -# +# ext shouldn't be built into the engine, but there's no harm including some non-dynamic GLSL source to make life easier +#LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/.glsl") +# radix sort +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/FFT/default_compute_fft.comp") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/FFT/fft.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/FFT/parameters_struct.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/FFT/parameters.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/FFT/types.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/LumaMeter/common.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/LumaMeter/impl.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/ToneMapper/operators.glsl") @@ -133,6 +202,10 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/MitsubaLoader/material_co LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/OIT/oit.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/OIT/insert_node.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/OIT/resolve.frag") +# virtual geometry +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/virtual_geometry/descriptors.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/virtual_geometry/virtual_attribute.glsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/virtual_geometry/virtual_attribute_fetch.glsl") # depth pyramid generator LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/DepthPyramidGenerator/common.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/DepthPyramidGenerator/push_constants_struct_common.h") @@ -141,7 +214,6 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/DepthPyramidGenerator/vir # HLSL LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/macros.h") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/indirect_commands.hlsl") # emulated LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated/float64_t.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/emulated/float64_t_impl.hlsl") @@ -195,6 +267,8 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/limits.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/type_traits.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tuple.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/utility.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/pair.hlsl") + #metaprogramming LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/mpl.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/member_test_macros.hlsl") @@ -218,12 +292,10 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/format/shared_exp.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/format.hlsl") #linear algebra LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/linalg/fast_affine.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/linalg/transform.hlsl") # TODO: rename `equations` to `polynomials` probably LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/functions.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/geometry.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/intutil.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/angle_adding.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/equations/quadratic.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/equations/cubic.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/equations/quartic.hlsl") @@ -246,52 +318,24 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/circle.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/ellipse.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/line.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/beziers.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/aabb.hlsl") -#sampling -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/concentric_mapping.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/cos_weighted_spheres.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/quotient_and_pdf.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/uniform_spheres.hlsl") # LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ndarray_addressing.hlsl") # LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/util.hlsl") #FFT LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/fft/common.hlsl") +#Bitonic_sort +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bitonic_sort/common.hlsl") #sort LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sort/common.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sort/counting.hlsl") -#bxdf -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/common.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/fresnel.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/ndf.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/config.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/cook_torrance_base.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/reflection.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/transmission.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/bxdf_traits.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/ndf/beckmann.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/ndf/ggx.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/ndf/microfacet_to_light_transform.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/base/lambertian.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/base/oren_nayar.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/reflection/beckmann.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/reflection/ggx.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/reflection/lambertian.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/reflection/oren_nayar.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/reflection/delta_distribution.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/transmission/beckmann.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/transmission/ggx.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/transmission/lambertian.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/transmission/oren_nayar.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/transmission/smooth_dielectric.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bxdf/transmission/delta_distribution.hlsl") #subgroup LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/ballot.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/basic.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/arithmetic_portability.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/arithmetic_portability_impl.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/fft.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/bitonic_sort.hlsl") #subgroup2 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup2/ballot.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup2/arithmetic_params.hlsl") @@ -305,6 +349,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/basic.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/ballot.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/broadcast.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/fft.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/bitonic_sort.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scratch_size.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shared_scan.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shuffle.hlsl") @@ -338,6 +383,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/mip_mapped LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/storable_image.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/generic_shared_data.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/fft.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/bitonic_sort.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/workgroup_arithmetic.hlsl") #tgmath LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath.hlsl") @@ -347,4 +393,4 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath/output_structs.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/prefix_sum_blur/blur.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/prefix_sum_blur/box_sampler.hlsl") -ADD_CUSTOM_BUILTIN_RESOURCES(nblBuiltinResourceData NBL_RESOURCES_TO_EMBED "${NBL_ROOT_PATH}/include" "nbl/builtin" "nbl::builtin" "${NBL_ROOT_PATH_BINARY}/include" "${NBL_ROOT_PATH_BINARY}/src" "STATIC" "INTERNAL") \ No newline at end of file +ADD_CUSTOM_BUILTIN_RESOURCES(nblBuiltinResourceData NBL_RESOURCES_TO_EMBED "${NBL_ROOT_PATH}/include" "nbl/builtin" "nbl::builtin" "${NBL_ROOT_PATH_BINARY}/include" "${NBL_ROOT_PATH_BINARY}/src" "STATIC" "INTERNAL") From a20ba6ef3a0f36860c40255810fa20bbc2b05484 Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Sun, 2 Nov 2025 19:11:20 +0330 Subject: [PATCH 19/30] pair added --- include/nbl/builtin/hlsl/pair.hlsl | 38 ++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 include/nbl/builtin/hlsl/pair.hlsl diff --git a/include/nbl/builtin/hlsl/pair.hlsl b/include/nbl/builtin/hlsl/pair.hlsl new file mode 100644 index 0000000000..af278c2c97 --- /dev/null +++ b/include/nbl/builtin/hlsl/pair.hlsl @@ -0,0 +1,38 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_PAIR_INCLUDED_ +#define _NBL_BUILTIN_HLSL_PAIR_INCLUDED_ + +#include "nbl/builtin/hlsl/type_traits.hlsl" + +namespace nbl +{ +namespace hlsl +{ + +template +struct pair +{ + using first_type = T1; + using second_type = T2; + + first_type first; + second_type second; +}; + + +// Helper to make a pair (similar to std::make_pair) +template +pair make_pair(T1 f, T2 s) +{ + pair p; + p.first = f; + p.second = s; + return p; +} + +} +} + +#endif From 8c7b7e5bedebb17c286a392ed0cbabf703483b0f Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Sun, 2 Nov 2025 19:15:17 +0330 Subject: [PATCH 20/30] comment outdated pair impl --- include/nbl/builtin/hlsl/memory_accessor.hlsl | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/nbl/builtin/hlsl/memory_accessor.hlsl b/include/nbl/builtin/hlsl/memory_accessor.hlsl index 2194b1e917..1179d87033 100644 --- a/include/nbl/builtin/hlsl/memory_accessor.hlsl +++ b/include/nbl/builtin/hlsl/memory_accessor.hlsl @@ -24,15 +24,15 @@ namespace hlsl { // TODO: flesh out and move to `nbl/builtin/hlsl/utility.hlsl` -template -struct pair -{ - using first_type = T1; - using second_type = T2; - - first_type first; - second_type second; -}; +//template +//struct pair +//{ +// using first_type = T1; +// using second_type = T2; +// +// first_type first; +// second_type second; +//}; namespace accessor_adaptors { @@ -227,4 +227,4 @@ struct Offset : impl::OffsetBase } } } -#endif \ No newline at end of file +#endif From 06af50bd85ffe3c3ce536ad3162e6663b7de36bd Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Sun, 2 Nov 2025 19:21:44 +0330 Subject: [PATCH 21/30] bitonic sort acessor added --- .../hlsl/concepts/accessors/bitonic_sort.hlsl | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 include/nbl/builtin/hlsl/concepts/accessors/bitonic_sort.hlsl diff --git a/include/nbl/builtin/hlsl/concepts/accessors/bitonic_sort.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/bitonic_sort.hlsl new file mode 100644 index 0000000000..a39e91ffb3 --- /dev/null +++ b/include/nbl/builtin/hlsl/concepts/accessors/bitonic_sort.hlsl @@ -0,0 +1,31 @@ +#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_BITONIC_SORT_INCLUDED_ +#define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_BITONIC_SORT_INCLUDED_ + +#include "nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace workgroup +{ +namespace bitonic_sort +{ +// The SharedMemoryAccessor MUST provide the following methods: +// * void get(uint32_t index, NBL_REF_ARG(uint32_t) value); +// * void set(uint32_t index, in uint32_t value); +// * void workgroupExecutionAndMemoryBarrier(); +template +NBL_BOOL_CONCEPT BitonicSortSharedMemoryAccessor = concepts::accessors::GenericSharedMemoryAccessor; + +// The Accessor MUST provide the following methods: +// * void get(uint32_t index, NBL_REF_ARG(pair) value); +// * void set(uint32_t index, in pair value); +template +NBL_BOOL_CONCEPT BitonicSortAccessor = concepts::accessors::GenericDataAccessor, I>; + +} +} +} +} +#endif From ecb71826415aa40204fbfc58d8e33cde2526c471 Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:08:31 +0330 Subject: [PATCH 22/30] Update common.hlsl --- .../nbl/builtin/hlsl/bitonic_sort/common.hlsl | 125 ++++-------------- 1 file changed, 23 insertions(+), 102 deletions(-) diff --git a/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl b/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl index 30b94b442f..0b5bfb69ac 100644 --- a/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl +++ b/include/nbl/builtin/hlsl/bitonic_sort/common.hlsl @@ -4,7 +4,7 @@ #include #include #include -#include +#include namespace nbl { @@ -15,115 +15,36 @@ namespace bitonic_sort template void compareExchangeWithPartner( -bool takeLarger, -NBL_REF_ARG(KeyType) loKey, -NBL_CONST_REF_ARG(KeyType) partnerLoKey, -NBL_REF_ARG(KeyType) hiKey, -NBL_CONST_REF_ARG(KeyType) partnerHiKey, -NBL_REF_ARG(ValueType) loVal, -NBL_CONST_REF_ARG(ValueType) partnerLoVal, -NBL_REF_ARG(ValueType) hiVal, -NBL_CONST_REF_ARG(ValueType) partnerHiVal, -NBL_CONST_REF_ARG(Comparator) comp) + bool takeLarger, + NBL_REF_ARG(pair) loPair, + NBL_CONST_REF_ARG(pair) partnerLoPair, + NBL_REF_ARG(pair) hiPair, + NBL_CONST_REF_ARG(pair) partnerHiPair, + NBL_CONST_REF_ARG(Comparator) comp) { - const bool loSelfSmaller = comp(loKey, partnerLoKey); - const bool takePartnerLo = takeLarger ? loSelfSmaller : !loSelfSmaller; - loKey = takePartnerLo ? partnerLoKey : loKey; - loVal = takePartnerLo ? partnerLoVal : loVal; + const bool loSelfSmaller = comp(loPair.first, partnerLoPair.first); + const bool takePartnerLo = takeLarger ? loSelfSmaller : !loSelfSmaller; + if (takePartnerLo) + loPair = partnerLoPair; - const bool hiSelfSmaller = comp(hiKey, partnerHiKey); - const bool takePartnerHi = takeLarger ? hiSelfSmaller : !hiSelfSmaller; - hiKey = takePartnerHi ? partnerHiKey : hiKey; - hiVal = takePartnerHi ? partnerHiVal : hiVal; -} - - -template -void compareSwap( -bool ascending, -NBL_REF_ARG(KeyType) loKey, -NBL_REF_ARG(KeyType) hiKey, -NBL_REF_ARG(ValueType) loVal, -NBL_REF_ARG(ValueType) hiVal, -NBL_CONST_REF_ARG(Comparator) comp) -{ - const bool shouldSwap = comp(hiKey, loKey); - - const bool doSwap = (shouldSwap == ascending); - - KeyType tempKey = loKey; - loKey = doSwap ? hiKey : loKey; - hiKey = doSwap ? tempKey : hiKey; - - ValueType tempVal = loVal; - loVal = doSwap ? hiVal : loVal; - hiVal = doSwap ? tempVal : hiVal; -} - -template -void swap( -NBL_REF_ARG(KeyType) loKey, -NBL_REF_ARG(KeyType) hiKey, -NBL_REF_ARG(ValueType) loVal, -NBL_REF_ARG(ValueType) hiVal) -{ - KeyType tempKey = loKey; - loKey = hiKey; - hiKey = tempKey; - - ValueType tempVal = loVal; - loVal = hiVal; - hiVal = tempVal; -} - - - -template -void compareExchangeWithPartner( -bool takeLarger, -NBL_REF_ARG(pair) loPair, -NBL_CONST_REF_ARG(pair) partnerLoPair, -NBL_REF_ARG(pair) hiPair, -NBL_CONST_REF_ARG(pair) partnerHiPair, -NBL_CONST_REF_ARG(Comparator) comp) -{ - const bool loSelfSmaller = comp(loPair.first, partnerLoPair.first); - const bool takePartnerLo = takeLarger ? loSelfSmaller : !loSelfSmaller; - loPair.first = takePartnerLo ? partnerLoPair.first : loPair.first; - loPair.second = takePartnerLo ? partnerLoPair.second : loPair.second; - - const bool hiSelfSmaller = comp(hiPair.first, partnerHiPair.first); - const bool takePartnerHi = takeLarger ? hiSelfSmaller : !hiSelfSmaller; - hiPair.first = takePartnerHi ? partnerHiPair.first : hiPair.first; - hiPair.second = takePartnerHi ? partnerHiPair.second : hiPair.second; + const bool hiSelfSmaller = comp(hiPair.first, partnerHiPair.first); + const bool takePartnerHi = takeLarger ? hiSelfSmaller : !hiSelfSmaller; + if (takePartnerHi) + hiPair = partnerHiPair; } template void compareSwap( -bool ascending, -NBL_REF_ARG(pair) loPair, -NBL_REF_ARG(pair) hiPair, -NBL_CONST_REF_ARG(Comparator) comp) + bool ascending, + NBL_REF_ARG(pair) loPair, + NBL_REF_ARG(pair) hiPair, + NBL_CONST_REF_ARG(Comparator) comp) { - const bool shouldSwap = comp(hiPair.first, loPair.first); - const bool doSwap = (shouldSwap == ascending); + const bool shouldSwap = comp(hiPair.first, loPair.first); + const bool doSwap = (shouldSwap == ascending); - KeyType tempKey = loPair.first; - ValueType tempVal = loPair.second; - loPair.first = doSwap ? hiPair.first : loPair.first; - loPair.second = doSwap ? hiPair.second : loPair.second; - hiPair.first = doSwap ? tempKey : hiPair.first; - hiPair.second = doSwap ? tempVal : hiPair.second; -} - -template -void swap( -NBL_REF_ARG(pair) loPair, -NBL_REF_ARG(pair) hiPair) -{ - pair temp = loPair; - loPair = hiPair; - hiPair = temp; + if (doSwap) + swap(loPair, hiPair); } } } From 08867d6e2c2a03586fb702ff45e242a51e56e881 Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:09:13 +0330 Subject: [PATCH 23/30] Update bitonic_sort.hlsl --- .../builtin/hlsl/subgroup/bitonic_sort.hlsl | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl b/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl index 9a905d36b7..64ca5e1ced 100644 --- a/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl +++ b/include/nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl @@ -30,8 +30,8 @@ struct bitonic_sort, device_ using value_t = typename config_t::value_t; using comparator_t = typename config_t::comparator_t; - static void mergeStage(uint32_t stage, bool bitonicAscending, uint32_t invocationID, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, - NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) + static void mergeStage(uint32_t stage, bool bitonicAscending, uint32_t invocationID, + NBL_REF_ARG(pair) loPair, NBL_REF_ARG(pair) hiPair) { comparator_t comp; @@ -43,26 +43,28 @@ struct bitonic_sort, device_ if (threadStride == 0) { // Local compare and swap for stage 0 - nbl::hlsl::bitonic_sort::compareSwap(bitonicAscending, loKey, hiKey, loVal, hiVal, comp); + nbl::hlsl::bitonic_sort::compareSwap(bitonicAscending, loPair, hiPair, comp); } else { // Shuffle from partner using XOR - const key_t pLoKey = glsl::subgroupShuffleXor(loKey, threadStride); - const key_t pHiKey = glsl::subgroupShuffleXor(hiKey, threadStride); - const value_t pLoVal = glsl::subgroupShuffleXor(loVal, threadStride); - const value_t pHiVal = glsl::subgroupShuffleXor(hiVal, threadStride); + const key_t pLoKey = glsl::subgroupShuffleXor(loPair.first, threadStride); + const value_t pLoVal = glsl::subgroupShuffleXor(loPair.second, threadStride); + const key_t pHiKey = glsl::subgroupShuffleXor(hiPair.first, threadStride); + const value_t pHiVal = glsl::subgroupShuffleXor(hiPair.second, threadStride); + + const pair partnerLoPair = make_pair(pLoKey, pLoVal); + const pair partnerHiPair = make_pair(pHiKey, pHiVal); const bool isUpper = bool(invocationID & threadStride); const bool takeLarger = isUpper == bitonicAscending; - nbl::hlsl::bitonic_sort::compareExchangeWithPartner(takeLarger, loKey, pLoKey, hiKey, pHiKey, loVal, pLoVal, hiVal, pHiVal, comp); + nbl::hlsl::bitonic_sort::compareExchangeWithPartner(takeLarger, loPair, partnerLoPair, hiPair, partnerHiPair, comp); } } } - static void __call(bool ascending, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, - NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) + static void __call(bool ascending, NBL_REF_ARG(pair) loPair, NBL_REF_ARG(pair) hiPair) { const uint32_t invocationID = glsl::gl_SubgroupInvocationID(); const uint32_t subgroupSizeLog2 = glsl::gl_SubgroupSizeLog2(); @@ -70,7 +72,7 @@ struct bitonic_sort, device_ for (uint32_t stage = 0; stage <= subgroupSizeLog2; stage++) { const bool bitonicAscending = (stage == subgroupSizeLog2) ? ascending : !bool(invocationID & (1u << stage)); - mergeStage(stage, bitonicAscending, invocationID, loKey, hiKey, loVal, hiVal); + mergeStage(stage, bitonicAscending, invocationID, loPair, hiPair); } } }; From e2937cec17db751df953873b5ff9069d68d21933 Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:10:04 +0330 Subject: [PATCH 24/30] Update bitonic_sort.hlsl --- .../builtin/hlsl/workgroup/bitonic_sort.hlsl | 76 ++++++++++++------- 1 file changed, 49 insertions(+), 27 deletions(-) diff --git a/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl b/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl index 8edccee7a0..4ae393fbca 100644 --- a/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl @@ -49,14 +49,37 @@ struct BitonicSort; + template + static void shuffleXor(NBL_REF_ARG(pair) p, uint32_t ownedIdx, uint32_t mask, NBL_REF_ARG(key_adaptor_t) keyAdaptor, NBL_REF_ARG(value_adaptor_t) valueAdaptor) + { + keyAdaptor.template set(ownedIdx, p.first); + valueAdaptor.template set(ownedIdx, p.second); + + // Wait until all writes are done before reading - only barrier on one adaptor here + keyAdaptor.workgroupExecutionAndMemoryBarrier(); + + keyAdaptor.template get(ownedIdx ^ mask, p.first); + valueAdaptor.template get(ownedIdx ^ mask, p.second); + } + + template static void mergeStage(NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor, uint32_t stage, bool bitonicAscending, uint32_t invocationID, - NBL_REF_ARG(nbl::hlsl::pair) loPair, NBL_REF_ARG(nbl::hlsl::pair) hiPair) + NBL_REF_ARG(nbl::hlsl::pair) lopair, NBL_REF_ARG(nbl::hlsl::pair) hipair) { const uint32_t WorkgroupSize = config_t::WorkgroupSize; const uint32_t subgroupSizeLog2 = glsl::gl_SubgroupSizeLog2(); comparator_t comp; + + using key_adaptor_t = accessor_adaptors::StructureOfArrays; + using value_adaptor_t = accessor_adaptors::StructureOfArrays >; + + key_adaptor_t keyAdaptor; + keyAdaptor.accessor = sharedmemAccessor; + value_adaptor_t valueAdaptor; + valueAdaptor.accessor = sharedmemAccessor; + [unroll] for (uint32_t pass = 0; pass <= stage; pass++) { @@ -67,17 +90,16 @@ struct BitonicSort> 1; - nbl::hlsl::pair pLoPair = loPair; - shuffleXor(pLoPair, threadStride, sharedmemAccessor); - sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); + nbl::hlsl::pair plopair = lopair; + shuffleXor(plopair, invocationID, threadStride, keyAdaptor, valueAdaptor); - nbl::hlsl::pair pHiPair = hiPair; - shuffleXor(pHiPair, threadStride, sharedmemAccessor); + nbl::hlsl::pair phipair = hipair; + shuffleXor(phipair, invocationID ^ threadStride, threadStride, keyAdaptor, valueAdaptor); const bool isUpper = (invocationID & threadStride) != 0; const bool takeLarger = isUpper == bitonicAscending; - nbl::hlsl::bitonic_sort::compareExchangeWithPartner(takeLarger, loPair, pLoPair, hiPair, pHiPair, comp); + nbl::hlsl::bitonic_sort::compareExchangeWithPartner(takeLarger, lopair, plopair, hipair, phipair, comp); } } @@ -96,12 +118,12 @@ struct BitonicSort loPair, hiPair; - accessor.template get >(loIdx, loPair); - accessor.template get >(hiIdx, hiPair); + nbl::hlsl::pair lopair, hipair; + accessor.template get >(loIdx, lopair); + accessor.template get >(hiIdx, hipair); const bool subgroupAscending = (subgroupID & 1) == 0; - subgroup::bitonic_sort::__call(subgroupAscending, loPair.first, hiPair.first, loPair.second, hiPair.second); + subgroup::bitonic_sort::__call(subgroupAscending, lopair, hipair); const uint32_t subgroupInvocationID = glsl::gl_SubgroupInvocationID(); @@ -110,13 +132,13 @@ struct BitonicSort::mergeStage(subgroupSizeLog2, bitonicAscending, subgroupInvocationID, loPair.first, hiPair.first, loPair.second, hiPair.second); + subgroup::bitonic_sort::mergeStage(subgroupSizeLog2, bitonicAscending, subgroupInvocationID, lopair, hipair); } - accessor.template set >(loIdx, loPair); - accessor.template set >(hiIdx, hiPair); + accessor.template set >(loIdx, lopair); + accessor.template set >(hiIdx, hipair); } }; @@ -178,14 +200,14 @@ struct BitonicSort loPair, hiPair; - accessor.template get >(loIx, loPair); - accessor.template get >(hiIx, hiPair); + nbl::hlsl::pair lopair, hipair; + accessor.template get >(loIx, lopair); + accessor.template get >(hiIx, hipair); - nbl::hlsl::bitonic_sort::swap(loPair.first, hiPair.first, loPair.second, hiPair.second); + swap(lopair, hipair); - accessor.template set >(loIx, loPair); - accessor.template set >(hiIx, hiPair); + accessor.template set >(loIx, lopair); + accessor.template set >(hiIx, hipair); } sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); } @@ -209,14 +231,14 @@ struct BitonicSort loPair, hiPair; - accessor.template get >(loIx, loPair); - accessor.template get >(hiIx, hiPair); + nbl::hlsl::pair lopair, hipair; + accessor.template get >(loIx, lopair); + accessor.template get >(hiIx, hipair); - nbl::hlsl::bitonic_sort::compareSwap(bitonicAscending, loPair.first, hiPair.first, loPair.second, hiPair.second, comp); + nbl::hlsl::bitonic_sort::compareSwap(bitonicAscending, lopair, hipair, comp); - accessor.template set >(loIx, loPair); - accessor.template set >(hiIx, hiPair); + accessor.template set >(loIx, lopair); + accessor.template set >(hiIx, hipair); } sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); } From 686618caa507f702409275dd0bb4da58f5861517 Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:11:07 +0330 Subject: [PATCH 25/30] Update utility.hlsl --- include/nbl/builtin/hlsl/utility.hlsl | 32 ++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/utility.hlsl b/include/nbl/builtin/hlsl/utility.hlsl index 21f1eb1909..912bfd00de 100644 --- a/include/nbl/builtin/hlsl/utility.hlsl +++ b/include/nbl/builtin/hlsl/utility.hlsl @@ -8,11 +8,41 @@ #include -// for now we only implement declval namespace nbl { namespace hlsl { + +template +struct pair +{ + using first_type = T1; + using second_type = T2; + + first_type first; + second_type second; +}; + +template +pair make_pair(T1 f, T2 s) +{ + pair p; + p.first = f; + p.second = s; + return p; +} + +template +void swap(NBL_REF_ARG(pair) a, NBL_REF_ARG(pair) b) +{ + T1 temp_first = a.first; + T2 temp_second = a.second; + a.first = b.first; + a.second = b.second; + b.first = temp_first; + b.second = temp_second; +} + template const static bool always_true = true; #ifndef __HLSL_VERSION From c8e990d6ab643b79cfd1d6816fb261316a5c4b2e Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:12:10 +0330 Subject: [PATCH 26/30] Delete include/nbl/builtin/hlsl/utility.hlsl --- include/nbl/builtin/hlsl/utility.hlsl | 70 --------------------------- 1 file changed, 70 deletions(-) delete mode 100644 include/nbl/builtin/hlsl/utility.hlsl diff --git a/include/nbl/builtin/hlsl/utility.hlsl b/include/nbl/builtin/hlsl/utility.hlsl deleted file mode 100644 index 912bfd00de..0000000000 --- a/include/nbl/builtin/hlsl/utility.hlsl +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (C) 2024 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_BUILTIN_HLSL_UTILITY_INCLUDED_ -#define _NBL_BUILTIN_HLSL_UTILITY_INCLUDED_ - - -#include - - -namespace nbl -{ -namespace hlsl -{ - -template -struct pair -{ - using first_type = T1; - using second_type = T2; - - first_type first; - second_type second; -}; - -template -pair make_pair(T1 f, T2 s) -{ - pair p; - p.first = f; - p.second = s; - return p; -} - -template -void swap(NBL_REF_ARG(pair) a, NBL_REF_ARG(pair) b) -{ - T1 temp_first = a.first; - T2 temp_second = a.second; - a.first = b.first; - a.second = b.second; - b.first = temp_first; - b.second = temp_second; -} - -template -const static bool always_true = true; -#ifndef __HLSL_VERSION - -template -std::add_rvalue_reference_t declval() noexcept -{ - static_assert(false,"Actually calling declval is ill-formed."); -} - -#else - -namespace experimental -{ - -template -T declval() {} - -} - -#endif -} -} - -#endif From 55b7813a99378e8ba7c359c57eb146daefaa3580 Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:12:29 +0330 Subject: [PATCH 27/30] Delete include/nbl/builtin/hlsl/pair.hlsl --- include/nbl/builtin/hlsl/pair.hlsl | 38 ------------------------------ 1 file changed, 38 deletions(-) delete mode 100644 include/nbl/builtin/hlsl/pair.hlsl diff --git a/include/nbl/builtin/hlsl/pair.hlsl b/include/nbl/builtin/hlsl/pair.hlsl deleted file mode 100644 index af278c2c97..0000000000 --- a/include/nbl/builtin/hlsl/pair.hlsl +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_BUILTIN_HLSL_PAIR_INCLUDED_ -#define _NBL_BUILTIN_HLSL_PAIR_INCLUDED_ - -#include "nbl/builtin/hlsl/type_traits.hlsl" - -namespace nbl -{ -namespace hlsl -{ - -template -struct pair -{ - using first_type = T1; - using second_type = T2; - - first_type first; - second_type second; -}; - - -// Helper to make a pair (similar to std::make_pair) -template -pair make_pair(T1 f, T2 s) -{ - pair p; - p.first = f; - p.second = s; - return p; -} - -} -} - -#endif From d0cd7a30533c9c8519a38ef788bf5dfe53e0481b Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:13:08 +0330 Subject: [PATCH 28/30] Add files via upload --- include/nbl/builtin/hlsl/utility.hlsl | 70 +++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 include/nbl/builtin/hlsl/utility.hlsl diff --git a/include/nbl/builtin/hlsl/utility.hlsl b/include/nbl/builtin/hlsl/utility.hlsl new file mode 100644 index 0000000000..07c4b10624 --- /dev/null +++ b/include/nbl/builtin/hlsl/utility.hlsl @@ -0,0 +1,70 @@ +// Copyright (C) 2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_UTILITY_INCLUDED_ +#define _NBL_BUILTIN_HLSL_UTILITY_INCLUDED_ + + +#include + + +namespace nbl +{ +namespace hlsl +{ + +template +struct pair +{ + using first_type = T1; + using second_type = T2; + + first_type first; + second_type second; +}; + +template +pair make_pair(T1 f, T2 s) +{ + pair p; + p.first = f; + p.second = s; + return p; +} + +template +void swap(NBL_REF_ARG(pair) a, NBL_REF_ARG(pair) b) +{ + T1 temp_first = a.first; + T2 temp_second = a.second; + a.first = b.first; + a.second = b.second; + b.first = temp_first; + b.second = temp_second; +} + +template +const static bool always_true = true; +#ifndef __HLSL_VERSION + +template +std::add_rvalue_reference_t declval() noexcept +{ + static_assert(false,"Actually calling declval is ill-formed."); +} + +#else + +namespace experimental +{ + +template +T declval() {} + +} + +#endif +} +} + +#endif From e8f6134c6050c1643d4e4f69d54e5ecde2e60019 Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:16:13 +0330 Subject: [PATCH 29/30] Update CMakeLists.txt --- src/nbl/builtin/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index b8bb8f039d..c56a468112 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -267,7 +267,6 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/limits.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/type_traits.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tuple.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/utility.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/pair.hlsl") #metaprogramming LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/mpl.hlsl") From 034cd335440e151db0806e85763485459eeb5c70 Mon Sep 17 00:00:00 2001 From: Abbas Garousi <69919151+CrabExtra@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:23:16 +0330 Subject: [PATCH 30/30] Remove unused pair struct from memory_accessor.hlsl Removed commented-out template struct for pair. --- include/nbl/builtin/hlsl/memory_accessor.hlsl | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/include/nbl/builtin/hlsl/memory_accessor.hlsl b/include/nbl/builtin/hlsl/memory_accessor.hlsl index 1179d87033..cee8617e1a 100644 --- a/include/nbl/builtin/hlsl/memory_accessor.hlsl +++ b/include/nbl/builtin/hlsl/memory_accessor.hlsl @@ -22,18 +22,6 @@ namespace nbl { namespace hlsl { - -// TODO: flesh out and move to `nbl/builtin/hlsl/utility.hlsl` -//template -//struct pair -//{ -// using first_type = T1; -// using second_type = T2; -// -// first_type first; -// second_type second; -//}; - namespace accessor_adaptors { namespace impl