Skip to content

Commit a9b7331

Browse files
author
kevyuu
committed
First draft of spd workgroup implementation
1 parent d6ff5fc commit a9b7331

File tree

1 file changed

+178
-0
lines changed
  • include/nbl/builtin/hlsl/workgroup

1 file changed

+178
-0
lines changed
Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
#include <nbl/builtin/hlsl/cpp_compat.hlsl>
2+
#include <nbl/builtin/hlsl/concepts.hlsl>
3+
#include <nbl/builtin/hlsl/glsl_compat/subgroup_quad.hlsl>
4+
5+
#ifndef _NBL_BUILTIN_HLSL_WORKGROUP_SPD_INCLUDED_
6+
#define _NBL_BUILTIN_HLSL_WORKGROUP_SPD_INCLUDED_
7+
8+
// ------------------------------- COMMON -----------------------------------------
9+
10+
namespace nbl
11+
{
12+
namespace hlsl
13+
{
14+
namespace workgroup
15+
{
16+
namespace spd
17+
{
18+
namespace impl
19+
{
20+
// Reduces a 2x2 subgroup quad into a single value.
//
// Gathers this invocation's value together with its horizontal, vertical and
// diagonal quad neighbours (via the GLSL-compat quad-swap intrinsics) and
// combines all four with the user-supplied reducer.
//
// @param reducer  functor exposing `reduce(a, b, c, d)` over float32_t4
// @param v        this invocation's value
// @return the reduction of the four quad lanes' values
//
// BUGFIX: the original declared the return type `void` yet ended with
// `return reducer.reduce(...)`, and every caller assigns the result — the
// function must return float32_t4.
template<typename Reducer>
float32_t4 subgroupQuadReduce(NBL_CONST_REF_ARG(Reducer) reducer, float32_t4 v)
{
    const float32_t4 v0 = v;
    const float32_t4 v1 = glsl::subgroupQuadSwapHorizontal(v);
    const float32_t4 v2 = glsl::subgroupQuadSwapVertical(v);
    const float32_t4 v3 = glsl::subgroupQuadSwapDiagonal(v);
    return reducer.reduce(v0, v1, v2, v3);
}
29+
30+
// Produces mips 0 and 1 of the downsample chain for one 64x64 source tile.
//
// Each invocation reduces four 2x2 source quads (one per 32x32 quadrant of the
// tile) into mip 0, then quad-reduces those results into mip 1, caching the
// mip 1 values in shared memory for the later stages.
//
// @param coord                 per-invocation 2D coordinate within the tile
// @param workGroupID           this workgroup's 2D tile index
// @param localInvocationIndex  flat invocation index (quad lane 0 writes mip 1)
// @param mip                   presumably the total number of mips to generate
//                              (stage is skipped past mip 0 when <= 1) — TODO confirm
// @param slice                 array slice being downsampled
//
// BUGFIXES vs. first draft:
//  - `NBL_COSNT_REF_ARG` -> `NBL_CONST_REF_ARG` (typo, would not compile)
//  - quadrants 2 and 3 called `srcImage.set(pix, v[2], ...)` instead of
//    `srcImage.reduce(tex, slice)`, so v[2]/v[3] were never read from the
//    source (and v[3] was assigned from v[2])
//  - `workgroupID` (lower-case g) was an undeclared identifier; the parameter
//    is `workGroupID`
template <typename Reducer, typename SrcImageAccessor, typename DstImageAccessor, typename SharedMemoryAccessor>
void downsampleMips_0_1(uint32_t2 coord, uint32_t2 workGroupID, uint32_t localInvocationIndex, uint32_t mip, uint32_t slice, NBL_CONST_REF_ARG(Reducer) reducer, NBL_CONST_REF_ARG(SrcImageAccessor) srcImage, NBL_REF_ARG(DstImageAccessor) dstImage, NBL_REF_ARG(SharedMemoryAccessor) sharedMem)
{
    float32_t4 v[4];

    const uint32_t x = coord.x;
    const uint32_t y = coord.y;

    // Mip 0: one 2x2 source quad per 32x32 quadrant of the 64x64 tile.
    int32_t2 tex = int32_t2(workGroupID.xy * 64) + int32_t2(x * 2, y * 2);
    int32_t2 pix = int32_t2(workGroupID.xy * 32) + int32_t2(x, y);
    v[0] = srcImage.reduce(tex, slice);
    dstImage.set(pix, v[0], 0, slice);

    tex = int32_t2(workGroupID.xy * 64) + int32_t2(x * 2 + 32, y * 2);
    pix = int32_t2(workGroupID.xy * 32) + int32_t2(x + 16, y);
    v[1] = srcImage.reduce(tex, slice);
    dstImage.set(pix, v[1], 0, slice);

    tex = int32_t2(workGroupID.xy * 64) + int32_t2(x * 2, y * 2 + 32);
    pix = int32_t2(workGroupID.xy * 32) + int32_t2(x, y + 16);
    v[2] = srcImage.reduce(tex, slice);
    dstImage.set(pix, v[2], 0, slice);

    tex = int32_t2(workGroupID.xy * 64) + int32_t2(x * 2 + 32, y * 2 + 32);
    pix = int32_t2(workGroupID.xy * 32) + int32_t2(x + 16, y + 16);
    v[3] = srcImage.reduce(tex, slice);
    dstImage.set(pix, v[3], 0, slice);

    // Nothing beyond mip 0 requested.
    if (mip <= 1)
        return;

    // Mip 1: fold each value with its quad neighbours.
    v[0] = subgroupQuadReduce(reducer, v[0]);
    v[1] = subgroupQuadReduce(reducer, v[1]);
    v[2] = subgroupQuadReduce(reducer, v[2]);
    v[3] = subgroupQuadReduce(reducer, v[3]);

    // Quad lane 0 writes the reduced value to mip 1 and caches it in LDS.
    if ((localInvocationIndex % 4) == 0)
    {
        dstImage.set(int32_t2(workGroupID.xy * 16) + int32_t2(x / 2, y / 2), v[0], 1, slice);
        sharedMem.set(int32_t2(x / 2, y / 2), v[0]);

        dstImage.set(int32_t2(workGroupID.xy * 16) + int32_t2(x / 2 + 8, y / 2), v[1], 1, slice);
        sharedMem.set(int32_t2(x / 2 + 8, y / 2), v[1]);

        dstImage.set(int32_t2(workGroupID.xy * 16) + int32_t2(x / 2, y / 2 + 8), v[2], 1, slice);
        sharedMem.set(int32_t2(x / 2, y / 2 + 8), v[2]);

        dstImage.set(int32_t2(workGroupID.xy * 16) + int32_t2(x / 2 + 8, y / 2 + 8), v[3], 1, slice);
        sharedMem.set(int32_t2(x / 2 + 8, y / 2 + 8), v[3]);
    }
}
81+
82+
// Produces one mip level from the values the previous stage cached in shared
// memory: quad-reduce, have quad lane 0 write the destination mip, then store
// the result back to LDS in a swizzled layout for the next stage.
//
// @param mip  destination mip index written to dstImage — note the previous
//             stage appears to treat `mip` as a count; NOTE(review): confirm
//             the intended meaning is consistent across stages
//
// BUGFIX vs. first draft: `NBL_COSNT_REF_ARG` -> `NBL_CONST_REF_ARG` (typo,
// would not compile).
template <typename Reducer, typename SrcImageAccessor, typename DstImageAccessor, typename SharedMemoryAccessor>
void downsampleMip_2(uint32_t2 coord, uint32_t2 workGroupID, uint32_t localInvocationIndex, uint32_t mip, uint32_t slice, NBL_CONST_REF_ARG(Reducer) reducer, NBL_CONST_REF_ARG(SrcImageAccessor) srcImage, NBL_REF_ARG(DstImageAccessor) dstImage, NBL_REF_ARG(SharedMemoryAccessor) sharedMem)
{
    float32_t4 v = sharedMem.get(coord);
    v = subgroupQuadReduce(reducer, v);
    // Quad lane 0 stores the result.
    if (localInvocationIndex % 4 == 0)
    {
        dstImage.set(int32_t2(workGroupID.xy * 8) + int32_t2(coord.x / 2, coord.y / 2), v, mip, slice);

        // store to LDS, try to reduce bank conflicts
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        // ...
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        sharedMem.set(int32_t2(coord.x + (coord.y / 2) % 2, coord.y), v);
    }
}
102+
103+
// Next reduction stage: only the first 64 invocations participate. Reads the
// swizzled values the previous stage left in LDS, quad-reduces them, writes
// the destination mip from quad lane 0 and re-stores to LDS in a sparser
// swizzle for the following stage.
//
// BUGFIXES vs. first draft:
//  - `NBL_COSNT_REF_ARG` -> `NBL_CONST_REF_ARG` (typo, would not compile)
//  - `x`/`y` were used without being declared; they are the components of
//    `coord`, mirroring downsampleMips_0_1
//  - `subgropuQuadReduce` -> `subgroupQuadReduce` (typo)
template <typename Reducer, typename SrcImageAccessor, typename DstImageAccessor, typename SharedMemoryAccessor>
void downsampleMip_3(uint32_t2 coord, uint32_t2 workGroupID, uint32_t localInvocationIndex, uint32_t mip, uint32_t slice, NBL_CONST_REF_ARG(Reducer) reducer, NBL_CONST_REF_ARG(SrcImageAccessor) srcImage, NBL_REF_ARG(DstImageAccessor) dstImage, NBL_REF_ARG(SharedMemoryAccessor) sharedMem)
{
    if (localInvocationIndex < 64)
    {
        const uint32_t x = coord.x;
        const uint32_t y = coord.y;

        // Load from the bank-conflict-avoiding layout written by the previous stage.
        float32_t4 v = sharedMem.get(int32_t2(x * 2 + y % 2, y * 2));
        v = subgroupQuadReduce(reducer, v);
        // Quad lane 0 stores the result.
        if (localInvocationIndex % 4 == 0)
        {
            dstImage.set(int32_t2(workGroupID.xy * 4) + int32_t2(x / 2, y / 2), v, mip, slice);
            // store to LDS
            // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0
            // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
            // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
            // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
            // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0
            // ...
            // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0
            // ...
            // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x
            // ...
            sharedMem.set(int32_t2(x * 2 + y / 2, y * 2), v);
        }
    }
}
128+
129+
// Next reduction stage: only the first 16 invocations participate. Reads the
// previous stage's LDS layout, quad-reduces, writes the destination mip from
// quad lane 0 and compacts the survivors into the first LDS row.
//
// BUGFIXES vs. first draft:
//  - `NBL_COSNT_REF_ARG` -> `NBL_CONST_REF_ARG` (typo, would not compile)
//  - `x`/`y` were used without being declared; taken from `coord` as in the
//    other stages
//  - the destination write passed two separate int32_t2 arguments
//    (`int32_t2(workGroupID.xy * 2), int32_t2(x / 2, y / 2)`); every other
//    stage sums tile origin + local offset, so the comma was a typo for `+`
template <typename Reducer, typename SrcImageAccessor, typename DstImageAccessor, typename SharedMemoryAccessor>
void downsampleMip_4(uint32_t2 coord, uint32_t2 workGroupID, uint32_t localInvocationIndex, uint32_t mip, uint32_t slice, NBL_CONST_REF_ARG(Reducer) reducer, NBL_CONST_REF_ARG(SrcImageAccessor) srcImage, NBL_REF_ARG(DstImageAccessor) dstImage, NBL_REF_ARG(SharedMemoryAccessor) sharedMem)
{
    if (localInvocationIndex < 16)
    {
        const uint32_t x = coord.x;
        const uint32_t y = coord.y;

        float32_t4 v = sharedMem.get(int32_t2(x * 4 + y, y * 4));
        v = subgroupQuadReduce(reducer, v);
        // Quad lane 0 stores the result.
        if (localInvocationIndex % 4 == 0)
        {
            dstImage.set(int32_t2(workGroupID.xy * 2) + int32_t2(x / 2, y / 2), v, mip, slice);
            // store to LDS
            // x x x x 0 ...
            // 0 ...
            sharedMem.set(int32_t2(x / 2 + y, 0), v);
        }
    }
}
147+
148+
// Final reduction stage: only the first 4 invocations (one quad) participate.
// Reads the compacted row from LDS, quad-reduces it and writes the single
// resulting texel for this workgroup's tile.
//
// BUGFIXES vs. first draft:
//  - `NBL_COSNT_REF_ARG` -> `NBL_CONST_REF_ARG` (typo, would not compile)
//  - `SpdStore(ASU2(workGroupID.xy), v, mip, slice)` was an un-ported leftover
//    from the AMD FidelityFX SPD reference (neither name exists here);
//    replaced with the DstImageAccessor call used by every other stage
template <typename Reducer, typename SrcImageAccessor, typename DstImageAccessor, typename SharedMemoryAccessor>
void downsampleMip_5(uint32_t2 coord, uint32_t2 workGroupID, uint32_t localInvocationIndex, uint32_t mip, uint32_t slice, NBL_CONST_REF_ARG(Reducer) reducer, NBL_CONST_REF_ARG(SrcImageAccessor) srcImage, NBL_REF_ARG(DstImageAccessor) dstImage, NBL_REF_ARG(SharedMemoryAccessor) sharedMem)
{
    if (localInvocationIndex < 4)
    {
        float32_t4 v = sharedMem.get(int32_t2(localInvocationIndex, 0));
        v = subgroupQuadReduce(reducer, v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            dstImage.set(int32_t2(workGroupID.xy), v, mip, slice);
        }
    }
}
162+
}
163+
164+
// Entry-point shim for the workgroup Single Pass Downsampler.
// Still a stub in this first draft — __call() is presumably meant to sequence
// the impl::downsampleMip* stages with the required barriers; TODO confirm
// once the driver code lands.
struct SPD
{
    static void __call()
    {
        // intentionally empty for now
    }
};
173+
174+
175+
}
176+
}
177+
}
178+
}

0 commit comments

Comments
 (0)