Remove __nvvm_{thread,block,grid}_{idx,dim}_[xyz] intrinsics.

nnethercote · nnethercote · commit 450af34b4620 · 2025-11-10T14:56:49.000+11:00
`core` has equivalents, might as well use them instead.
diff --git a/crates/cuda_std/src/lib.rs b/crates/cuda_std/src/lib.rs
@@ -24,7 +24,7 @@
 #![allow(internal_features)]
 #![cfg_attr(
     target_os = "cuda",
-    feature(alloc_error_handler, asm_experimental_arch, link_llvm_intrinsics)
+    feature(alloc_error_handler, asm_experimental_arch, link_llvm_intrinsics, stdarch_nvptx)
 )]
 
 extern crate alloc;
diff --git a/crates/cuda_std/src/thread.rs b/crates/cuda_std/src/thread.rs
@@ -63,22 +63,6 @@ use glam::{UVec2, UVec3};
 // different calling conventions don't exist in nvptx, so we just use C as a placeholder.
 extern "C" {
     // defined in libintrinsics.ll
-    fn __nvvm_thread_idx_x() -> u32;
-    fn __nvvm_thread_idx_y() -> u32;
-    fn __nvvm_thread_idx_z() -> u32;
-
-    fn __nvvm_block_dim_x() -> u32;
-    fn __nvvm_block_dim_y() -> u32;
-    fn __nvvm_block_dim_z() -> u32;
-
-    fn __nvvm_block_idx_x() -> u32;
-    fn __nvvm_block_idx_y() -> u32;
-    fn __nvvm_block_idx_z() -> u32;
-
-    fn __nvvm_grid_dim_x() -> u32;
-    fn __nvvm_grid_dim_y() -> u32;
-    fn __nvvm_grid_dim_z() -> u32;
-
     fn __nvvm_warp_size() -> u32;
 
     fn __nvvm_block_barrier();
@@ -92,8 +76,8 @@ extern "C" {
 macro_rules! in_range {
     // The bounds were taken mostly from the cuda C++ programming guide. I also
     // double-checked with what cuda clang does by checking its emitted llvm ir's scalar metadata.
-    ($func_name:ident, $range:expr) => {{
-        let val = unsafe { $func_name() };
+    ($func_name:path, $range:expr) => {{
+        let val = unsafe { $func_name() as u32 };
         if !$range.contains(&val) {
             // SAFETY: this condition is declared unreachable by compute capability max bound.
             // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
@@ -109,84 +93,84 @@ macro_rules! in_range {
 #[inline(always)]
 pub fn thread_idx_x() -> u32 {
     // The range is derived from the `block_dim_x` range.
-    in_range!(__nvvm_thread_idx_x, 0..1024)
+    in_range!(core::arch::nvptx::_thread_idx_x, 0..1024)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn thread_idx_y() -> u32 {
     // The range is derived from the `block_dim_y` range.
-    in_range!(__nvvm_thread_idx_y, 0..1024)
+    in_range!(core::arch::nvptx::_thread_idx_y, 0..1024)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn thread_idx_z() -> u32 {
     // The range is derived from the `block_dim_z` range.
-    in_range!(__nvvm_thread_idx_z, 0..64)
+    in_range!(core::arch::nvptx::_thread_idx_z, 0..64)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_idx_x() -> u32 {
     // The range is derived from the `grid_dim_x` range.
-    in_range!(__nvvm_block_idx_x, 0..2147483647)
+    in_range!(core::arch::nvptx::_block_idx_x, 0..2147483647)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_idx_y() -> u32 {
     // The range is derived from the `grid_dim_y` range.
-    in_range!(__nvvm_block_idx_y, 0..65535)
+    in_range!(core::arch::nvptx::_block_idx_y, 0..65535)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_idx_z() -> u32 {
     // The range is derived from the `grid_dim_z` range.
-    in_range!(__nvvm_block_idx_z, 0..65535)
+    in_range!(core::arch::nvptx::_block_idx_z, 0..65535)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_dim_x() -> u32 {
     // CUDA Compute Capabilities: "Maximum x- or y-dimensionality of a block" is 1024.
-    in_range!(__nvvm_block_dim_x, 1..=1024)
+    in_range!(core::arch::nvptx::_block_dim_x, 1..=1024)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_dim_y() -> u32 {
     // CUDA Compute Capabilities: "Maximum x- or y-dimensionality of a block" is 1024.
-    in_range!(__nvvm_block_dim_y, 1..=1024)
+    in_range!(core::arch::nvptx::_block_dim_y, 1..=1024)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn block_dim_z() -> u32 {
     // CUDA Compute Capabilities: "Maximum z-dimension of a block" is 64.
-    in_range!(__nvvm_block_dim_z, 1..=64)
+    in_range!(core::arch::nvptx::_block_dim_z, 1..=64)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn grid_dim_x() -> u32 {
     // CUDA Compute Capabilities: "Maximum x-dimension of a grid of thread blocks" is 2^31 - 1.
-    in_range!(__nvvm_grid_dim_x, 1..=2147483647)
+    in_range!(core::arch::nvptx::_grid_dim_x, 1..=2147483647)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn grid_dim_y() -> u32 {
     // CUDA Compute Capabilities: "Maximum y- or z-dimension of a grid of thread blocks" is 65535.
-    in_range!(__nvvm_grid_dim_y, 1..=65535)
+    in_range!(core::arch::nvptx::_grid_dim_y, 1..=65535)
 }
 
 #[gpu_only]
 #[inline(always)]
 pub fn grid_dim_z() -> u32 {
     // CUDA Compute Capabilities: "Maximum y- or z-dimension of a grid of thread blocks" is 65535.
-    in_range!(__nvvm_grid_dim_z, 1..=65535)
+    in_range!(core::arch::nvptx::_grid_dim_z, 1..=65535)
 }
 
 /// Gets the 3d index of the thread currently executing the kernel.
diff --git a/crates/rustc_codegen_nvvm/libintrinsics.bc b/crates/rustc_codegen_nvvm/libintrinsics.bc
diff --git a/crates/rustc_codegen_nvvm/libintrinsics.ll b/crates/rustc_codegen_nvvm/libintrinsics.ll
@@ -8,86 +8,6 @@ source_filename = "libintrinsics"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
-; thread ----
-
-define i32 @__nvvm_thread_idx_x() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  ret i32 %0
-}
-
-define i32 @__nvvm_thread_idx_y() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
-  ret i32 %0
-}
-
-define i32 @__nvvm_thread_idx_z() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
-  ret i32 %0
-}
-
-; block dimension ----
-
-define i32 @__nvvm_block_dim_x() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  ret i32 %0
-}
-
-define i32 @__nvvm_block_dim_y() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
-  ret i32 %0
-}
-
-define i32 @__nvvm_block_dim_z() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
-  ret i32 %0
-}
-
-; block idx ----
-
-define i32 @__nvvm_block_idx_x() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-  ret i32 %0
-}
-
-define i32 @__nvvm_block_idx_y() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
-  ret i32 %0
-}
-
-define i32 @__nvvm_block_idx_z() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
-  ret i32 %0
-}
-
-; grid dimension ---- 
-
-define i32 @__nvvm_grid_dim_x() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
-  ret i32 %0
-}
-
-define i32 @__nvvm_grid_dim_y() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
-  ret i32 %0
-}
-
-define i32 @__nvvm_grid_dim_z() #0 {
-start:
-  %0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
-  ret i32 %0
-}
-
 ; warp ----
 
 define i32 @__nvvm_warp_size() #0 {
@@ -96,18 +16,6 @@ start:
   ret i32 %0
 }
 
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
-declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
-declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
-declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
-declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
-declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
-declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
-declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
 declare i32 @llvm.nvvm.read.ptx.sreg.warpsize()
 
 ; other ----