From e98fec3184550e017141244700c3bddb35105288 Mon Sep 17 00:00:00 2001 From: Christian Legnitto Date: Sun, 25 May 2025 15:20:57 -0400 Subject: [PATCH 1/4] Allow address spaces to propagate to LLVM This looks like it was code that wasn't deleted after the refactor in https://github.com/Rust-GPU/Rust-CUDA/commit/decda8784d10fc4e4a9b9e385e2a4322cfd07d79 --- crates/rustc_codegen_nvvm/src/builder.rs | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/crates/rustc_codegen_nvvm/src/builder.rs b/crates/rustc_codegen_nvvm/src/builder.rs index 7dac242b..ac2075f2 100644 --- a/crates/rustc_codegen_nvvm/src/builder.rs +++ b/crates/rustc_codegen_nvvm/src/builder.rs @@ -1154,18 +1154,7 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> { impl<'ll> StaticBuilderMethods for Builder<'_, 'll, '_> { fn get_static(&mut self, def_id: DefId) -> &'ll Value { - unsafe { - let mut g = self.cx.get_static(def_id); - let llty = self.val_ty(g); - let addrspace = AddressSpace(llvm::LLVMGetPointerAddressSpace(llty)); - if addrspace != AddressSpace::DATA { - trace!("Remapping global address space of global {:?}", g); - let llty = llvm::LLVMGetElementType(llty); - let ty = self.type_ptr_to_ext(llty, AddressSpace::DATA); - g = llvm::LLVMBuildAddrSpaceCast(self.llbuilder, g, ty, unnamed()); - } - g - } + self.cx.get_static(def_id) } } From 6a3b08bfaa3c3441f7e3d8a522ad444d794e35b5 Mon Sep 17 00:00:00 2001 From: Christian Legnitto Date: Sun, 25 May 2025 18:10:30 -0400 Subject: [PATCH 2/4] Spill large statics from constant to global memory This isn't fully correct, as ideally we keep track of what we have put into constant memory and when it is filled up spill instead of only spilling when a static is big. But, this is materially better than what is there (which is a runtime error). An argument can be made to just _always_ use global memory and we don't have to worry about getting the packing right. 
Fixes https://github.com/Rust-GPU/Rust-CUDA/issues/208. See also the debugging and discussion in https://github.com/Rust-GPU/Rust-CUDA/pull/216 --- crates/rustc_codegen_nvvm/src/context.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/crates/rustc_codegen_nvvm/src/context.rs b/crates/rustc_codegen_nvvm/src/context.rs index 077d8887..43942005 100644 --- a/crates/rustc_codegen_nvvm/src/context.rs +++ b/crates/rustc_codegen_nvvm/src/context.rs @@ -22,7 +22,7 @@ use rustc_errors::DiagMessage; use rustc_hash::FxHashMap; use rustc_middle::dep_graph::DepContext; use rustc_middle::ty::layout::{ - FnAbiError, FnAbiOf, FnAbiRequest, HasTyCtxt, HasTypingEnv, LayoutError, + FnAbiError, FnAbiOf, FnAbiRequest, HasTyCtxt, HasTypingEnv, LayoutError, LayoutOf, }; use rustc_middle::ty::layout::{FnAbiOfHelpers, LayoutOfHelpers}; use rustc_middle::ty::{Ty, TypeVisitableExt}; @@ -40,6 +40,10 @@ use rustc_target::callconv::FnAbi; use rustc_target::spec::{HasTargetSpec, Target}; use tracing::{debug, trace}; +/// "There is a total of 64 KB constant memory on a device." 
+/// +const CONSTANT_MEMORY_SIZE_LIMIT_BYTES: u64 = 64 * 1024; + pub(crate) struct CodegenCx<'ll, 'tcx> { pub tcx: TyCtxt<'tcx>, @@ -267,7 +271,18 @@ impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> { } if !is_mutable && self.type_is_freeze(ty) { - AddressSpace(4) + let layout = self.layout_of(ty); + if layout.size.bytes() > CONSTANT_MEMORY_SIZE_LIMIT_BYTES { + self.tcx.sess.dcx().warn(format!( + "static `{}` exceeds the constant-memory limit; placing in global memory (performance may be reduced)", + instance + )); + // Global memory + AddressSpace(1) + } else { + // Constant memory + AddressSpace(4) + } } else { AddressSpace::DATA } From 81936e1049edb9af498597b8be1634f262e17bf9 Mon Sep 17 00:00:00 2001 From: Christian Legnitto Date: Mon, 26 May 2025 12:45:01 -0400 Subject: [PATCH 3/4] Add `--use-constant-memory-space` flag, off by default --- crates/cuda_builder/src/lib.rs | 29 ++++++++++++++++++++ crates/rustc_codegen_nvvm/src/context.rs | 34 +++++++++++++++++------- 2 files changed, 54 insertions(+), 9 deletions(-) diff --git a/crates/cuda_builder/src/lib.rs b/crates/cuda_builder/src/lib.rs index 12f8c0f0..84a95d3c 100644 --- a/crates/cuda_builder/src/lib.rs +++ b/crates/cuda_builder/src/lib.rs @@ -130,6 +130,18 @@ pub struct CudaBuilder { /// /// `true` by default. pub override_libm: bool, + /// If `true`, the codegen will attempt to place `static` variables in CUDA's + /// constant memory, which is fast but limited in size (~64KB total across all + /// statics). The codegen avoids placing any single item too large, but it does not + /// track cumulative size. Exceeding the limit may cause `IllegalAddress` runtime + /// errors (CUDA error code: `700`). + /// + /// The default is `false`, which places all statics in global memory. This avoids + /// such errors but may reduce performance and use more general memory. + /// + /// Future versions may support smarter placement and user-controlled + /// packing/spilling strategies. 
+ pub use_constant_memory_space: bool, /// Whether to generate any debug info and what level of info to generate. pub debug: DebugInfo, /// Additional arguments passed to cargo during `cargo build`. @@ -155,6 +167,7 @@ impl CudaBuilder { emit: None, optix: false, override_libm: true, + use_constant_memory_space: false, debug: DebugInfo::None, build_args: vec![], final_module_path: None, @@ -284,6 +297,22 @@ impl CudaBuilder { self } + /// If `true`, the codegen will attempt to place `static` variables in CUDA's + /// constant memory, which is fast but limited in size (~64KB total across all + /// statics). The codegen avoids placing any single item too large, but it does not + /// track cumulative size. Exceeding the limit may cause `IllegalAddress` runtime + /// errors (CUDA error code: `700`). + /// + /// If `false`, all statics are placed in global memory. This avoids such errors but + /// may reduce performance and use more general memory. + /// + /// Future versions may support smarter placement and user-controlled + /// packing/spilling strategies. + pub fn use_constant_memory_space(mut self, use_constant_memory_space: bool) -> Self { + self.use_constant_memory_space = use_constant_memory_space; + self + } + /// An optional path where to dump LLVM IR of the final output the codegen will feed to libnvvm. Usually /// used for debugging. 
pub fn final_module_path(mut self, path: impl AsRef) -> Self { diff --git a/crates/rustc_codegen_nvvm/src/context.rs b/crates/rustc_codegen_nvvm/src/context.rs index 43942005..a185a101 100644 --- a/crates/rustc_codegen_nvvm/src/context.rs +++ b/crates/rustc_codegen_nvvm/src/context.rs @@ -271,17 +271,30 @@ impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> { } if !is_mutable && self.type_is_freeze(ty) { - let layout = self.layout_of(ty); - if layout.size.bytes() > CONSTANT_MEMORY_SIZE_LIMIT_BYTES { - self.tcx.sess.dcx().warn(format!( - "static `{}` exceeds the constant-memory limit; placing in global memory (performance may be reduced)", - instance - )); - // Global memory + if !self.codegen_args.use_constant_memory_space { + // We aren't using constant memory, so put the instance in global memory. AddressSpace(1) } else { - // Constant memory - AddressSpace(4) + // We are using constant memory, see if the instance will fit. + // + // FIXME(@LegNeato) ideally we keep track of what we have put into + // constant memory and when it is filled up spill instead of only + // spilling when a static is big. We'll probably want some packing + // strategy controlled by the user...for example, if you have one large + // static and many small ones, you might want the small ones to all be + // in constant memory or just the big one depending on your workload. + let layout = self.layout_of(ty); + if layout.size.bytes() > CONSTANT_MEMORY_SIZE_LIMIT_BYTES { + self.tcx.sess.dcx().warn(format!( + "static `{}` exceeds the constant memory limit; placing in global memory (performance may be reduced)", + instance + )); + // Place instance in global memory if it is too big for constant memory. + AddressSpace(1) + } else { + // Place instance in constant memory if it fits. 
+ AddressSpace(4) + } } } else { AddressSpace::DATA @@ -534,6 +547,7 @@ impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> { pub struct CodegenArgs { pub nvvm_options: Vec, pub override_libm: bool, + pub use_constant_memory_space: bool, pub final_module_path: Option, } @@ -552,6 +566,8 @@ impl CodegenArgs { cg_args.nvvm_options.push(flag); } else if arg == "--override-libm" { cg_args.override_libm = true; + } else if arg == "--use-constant-memory-space" { + cg_args.use_constant_memory_space = true; } else if arg == "--final-module-path" { cg_args.final_module_path = Some(PathBuf::from( args.get(idx + 1).expect("No path for --final-module-path"), From 5f81c65950166abd48cd3b94b01564538a281008 Mon Sep 17 00:00:00 2001 From: Christian Legnitto Date: Mon, 26 May 2025 13:15:42 -0400 Subject: [PATCH 4/4] Make it clear that `#[cuda_std::address_space(constant)]` still works --- crates/cuda_builder/src/lib.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/crates/cuda_builder/src/lib.rs b/crates/cuda_builder/src/lib.rs index 84a95d3c..1a056d1b 100644 --- a/crates/cuda_builder/src/lib.rs +++ b/crates/cuda_builder/src/lib.rs @@ -137,7 +137,10 @@ pub struct CudaBuilder { /// errors (CUDA error code: `700`). /// /// The default is `false`, which places all statics in global memory. This avoids - /// such errors but may reduce performance and use more general memory. + /// such errors but may reduce performance and use more general memory. When set to + /// `false`, you can still annotate `static` variables with + /// `#[cuda_std::address_space(constant)]` to place them in constant memory + /// manually. This option only affects automatic placement. /// /// Future versions may support smarter placement and user-controlled /// packing/spilling strategies. @@ -304,7 +307,9 @@ impl CudaBuilder { /// errors (CUDA error code: `700`). /// /// If `false`, all statics are placed in global memory. 
This avoids such errors but - /// may reduce performance and use more general memory. + /// may reduce performance and use more general memory. You can still annotate + /// `static` variables with `#[cuda_std::address_space(constant)]` to place them in + /// constant memory manually as this option only affects automatic placement. /// /// Future versions may support smarter placement and user-controlled /// packing/spilling strategies.