@@ -63,22 +63,6 @@ use glam::{UVec2, UVec3};
6363// different calling conventions don't exist in nvptx, so we just use C as a placeholder.
6464extern "C" {
6565 // defined in libintrinsics.ll
66- fn __nvvm_thread_idx_x ( ) -> u32 ;
67- fn __nvvm_thread_idx_y ( ) -> u32 ;
68- fn __nvvm_thread_idx_z ( ) -> u32 ;
69-
70- fn __nvvm_block_dim_x ( ) -> u32 ;
71- fn __nvvm_block_dim_y ( ) -> u32 ;
72- fn __nvvm_block_dim_z ( ) -> u32 ;
73-
74- fn __nvvm_block_idx_x ( ) -> u32 ;
75- fn __nvvm_block_idx_y ( ) -> u32 ;
76- fn __nvvm_block_idx_z ( ) -> u32 ;
77-
78- fn __nvvm_grid_dim_x ( ) -> u32 ;
79- fn __nvvm_grid_dim_y ( ) -> u32 ;
80- fn __nvvm_grid_dim_z ( ) -> u32 ;
81-
8266 fn __nvvm_warp_size ( ) -> u32 ;
8367
8468 fn __nvvm_block_barrier ( ) ;
@@ -92,8 +76,8 @@ extern "C" {
9276macro_rules! in_range {
9377 // The bounds were taken mostly from the cuda C++ programming guide. I also
9478 // double-checked with what cuda clang does by checking its emitted llvm ir's scalar metadata.
95- ( $func_name: ident , $range: expr) => { {
96- let val = unsafe { $func_name( ) } ;
79+ ( $func_name: path , $range: expr) => { {
80+ let val = unsafe { $func_name( ) as u32 } ;
9781 if !$range. contains( & val) {
9882 // SAFETY: this condition is declared unreachable by compute capability max bound.
9983 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
@@ -109,84 +93,84 @@ macro_rules! in_range {
10993#[ inline( always) ]
11094pub fn thread_idx_x ( ) -> u32 {
11195 // The upper bound follows from the `block_dim_x` range: a thread index is always less than the block dimension.
112- in_range ! ( __nvvm_thread_idx_x , 0 ..1024 )
96+ in_range ! ( core :: arch :: nvptx :: _thread_idx_x , 0 ..1024 )
11397}
11498
11599#[ gpu_only]
116100#[ inline( always) ]
117101pub fn thread_idx_y ( ) -> u32 {
118102 // The upper bound follows from the `block_dim_y` range: a thread index is always less than the block dimension.
119- in_range ! ( __nvvm_thread_idx_y , 0 ..1024 )
103+ in_range ! ( core :: arch :: nvptx :: _thread_idx_y , 0 ..1024 )
120104}
121105
122106#[ gpu_only]
123107#[ inline( always) ]
124108pub fn thread_idx_z ( ) -> u32 {
125109 // The upper bound follows from the `block_dim_z` range: a thread index is always less than the block dimension.
126- in_range ! ( __nvvm_thread_idx_z , 0 ..64 )
110+ in_range ! ( core :: arch :: nvptx :: _thread_idx_z , 0 ..64 )
127111}
128112
129113#[ gpu_only]
130114#[ inline( always) ]
131115pub fn block_idx_x ( ) -> u32 {
132116 // The upper bound follows from the `grid_dim_x` range: a block index is always less than the grid dimension.
133- in_range ! ( __nvvm_block_idx_x , 0 ..2147483647 )
117+ in_range ! ( core :: arch :: nvptx :: _block_idx_x , 0 ..2147483647 )
134118}
135119
136120#[ gpu_only]
137121#[ inline( always) ]
138122pub fn block_idx_y ( ) -> u32 {
139123 // The upper bound follows from the `grid_dim_y` range: a block index is always less than the grid dimension.
140- in_range ! ( __nvvm_block_idx_y , 0 ..65535 )
124+ in_range ! ( core :: arch :: nvptx :: _block_idx_y , 0 ..65535 )
141125}
142126
143127#[ gpu_only]
144128#[ inline( always) ]
145129pub fn block_idx_z ( ) -> u32 {
146130 // The upper bound follows from the `grid_dim_z` range: a block index is always less than the grid dimension.
147- in_range ! ( __nvvm_block_idx_z , 0 ..65535 )
131+ in_range ! ( core :: arch :: nvptx :: _block_idx_z , 0 ..65535 )
148132}
149133
150134#[ gpu_only]
151135#[ inline( always) ]
152136pub fn block_dim_x ( ) -> u32 {
153137 // CUDA Compute Capabilities: "Maximum x- or y-dimensionality of a block" is 1024.
154- in_range ! ( __nvvm_block_dim_x , 1 ..=1024 )
138+ in_range ! ( core :: arch :: nvptx :: _block_dim_x , 1 ..=1024 )
155139}
156140
157141#[ gpu_only]
158142#[ inline( always) ]
159143pub fn block_dim_y ( ) -> u32 {
160144 // CUDA Compute Capabilities: "Maximum x- or y-dimensionality of a block" is 1024.
161- in_range ! ( __nvvm_block_dim_y , 1 ..=1024 )
145+ in_range ! ( core :: arch :: nvptx :: _block_dim_y , 1 ..=1024 )
162146}
163147
164148#[ gpu_only]
165149#[ inline( always) ]
166150pub fn block_dim_z ( ) -> u32 {
167151 // CUDA Compute Capabilities: "Maximum z-dimension of a block" is 64.
168- in_range ! ( __nvvm_block_dim_z , 1 ..=64 )
152+ in_range ! ( core :: arch :: nvptx :: _block_dim_z , 1 ..=64 )
169153}
170154
171155#[ gpu_only]
172156#[ inline( always) ]
173157pub fn grid_dim_x ( ) -> u32 {
174158 // CUDA Compute Capabilities: "Maximum x-dimension of a grid of thread blocks" is 2^31 - 1.
175- in_range ! ( __nvvm_grid_dim_x , 1 ..=2147483647 )
159+ in_range ! ( core :: arch :: nvptx :: _grid_dim_x , 1 ..=2147483647 )
176160}
177161
178162#[ gpu_only]
179163#[ inline( always) ]
180164pub fn grid_dim_y ( ) -> u32 {
181165 // CUDA Compute Capabilities: "Maximum y- or z-dimension of a grid of thread blocks" is 65535.
182- in_range ! ( __nvvm_grid_dim_y , 1 ..=65535 )
166+ in_range ! ( core :: arch :: nvptx :: _grid_dim_y , 1 ..=65535 )
183167}
184168
185169#[ gpu_only]
186170#[ inline( always) ]
187171pub fn grid_dim_z ( ) -> u32 {
188172 // CUDA Compute Capabilities: "Maximum y- or z-dimension of a grid of thread blocks" is 65535.
189- in_range ! ( __nvvm_grid_dim_z , 1 ..=65535 )
173+ in_range ! ( core :: arch :: nvptx :: _grid_dim_z , 1 ..=65535 )
190174}
191175
192176/// Gets the 3d index of the thread currently executing the kernel.
0 commit comments