diff --git a/Cargo.lock b/Cargo.lock index d7b47e8ef2..45004612e5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -604,6 +604,26 @@ dependencies = [ "serde", ] +[[package]] +name = "bincode" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" +dependencies = [ + "bincode_derive", + "serde", + "unty", +] + +[[package]] +name = "bincode_derive" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" +dependencies = [ + "virtue", +] + [[package]] name = "bindgen" version = "0.71.1" @@ -2136,7 +2156,7 @@ dependencies = [ "async_zmq", "axum 0.8.4", "axum-server", - "bincode", + "bincode 2.0.1", "bitflags 2.9.4", "blake3", "bs62", @@ -2273,7 +2293,7 @@ dependencies = [ "async-trait", "async_zmq", "axum 0.8.4", - "bincode", + "bincode 1.3.3", "blake3", "bytes", "chrono", @@ -5575,9 +5595,9 @@ checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" [[package]] name = "ordered-float" -version = "5.0.0" +version = "5.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2c1f9f56e534ac6a9b8a4600bdf0f530fb393b5f393e7b4d03489c3cf0c3f01" +checksum = "7f4779c6901a562440c3786d08192c6fbda7c1c2060edd10006b05ee35d10f2d" dependencies = [ "num-traits", ] @@ -7899,9 +7919,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "symphonia" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "815c942ae7ee74737bb00f965fa5b5a2ac2ce7b6c01c0cc169bbeaf7abd5f5a9" +checksum = "5773a4c030a19d9bfaa090f49746ff35c75dfddfa700df7a5939d5e076a57039" dependencies = [ "lazy_static", "symphonia-bundle-flac", @@ -7917,9 +7937,9 @@ dependencies = [ [[package]] name = "symphonia-bundle-flac" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72e34f34298a7308d4397a6c7fbf5b84c5d491231ce3dd379707ba673ab3bd97" +checksum = "c91565e180aea25d9b80a910c546802526ffd0072d0b8974e3ebe59b686c9976" dependencies = [ "log", "symphonia-core", @@ -7929,9 +7949,9 @@ dependencies = [ [[package]] name = "symphonia-bundle-mp3" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c01c2aae70f0f1fb096b6f0ff112a930b1fb3626178fba3ae68b09dce71706d4" +checksum = "4872dd6bb56bf5eac799e3e957aa1981086c3e613b27e0ac23b176054f7c57ed" dependencies = [ "lazy_static", "log", @@ -7941,9 +7961,9 @@ dependencies = [ [[package]] name = "symphonia-codec-pcm" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f395a67057c2ebc5e84d7bb1be71cce1a7ba99f64e0f0f0e303a03f79116f89b" +checksum = "4e89d716c01541ad3ebe7c91ce4c8d38a7cf266a3f7b2f090b108fb0cb031d95" dependencies = [ "log", "symphonia-core", @@ -7951,9 +7971,9 @@ dependencies = [ [[package]] name = "symphonia-codec-vorbis" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a98765fb46a0a6732b007f7e2870c2129b6f78d87db7987e6533c8f164a9f30" +checksum = "f025837c309cd69ffef572750b4a2257b59552c5399a5e49707cc5b1b85d1c73" dependencies = [ "log", "symphonia-core", @@ -7962,9 +7982,9 @@ dependencies = [ [[package]] name = "symphonia-core" -version = "0.5.4" +version = "0.5.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "798306779e3dc7d5231bd5691f5a813496dc79d3f56bf82e25789f2094e022c3" +checksum = "ea00cc4f79b7f6bb7ff87eddc065a1066f3a43fe1875979056672c9ef948c2af" dependencies = [ "arrayvec", "bitflags 1.3.2", @@ -7975,9 +7995,9 @@ dependencies = [ [[package]] name = "symphonia-format-isomp4" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abfdf178d697e50ce1e5d9b982ba1b94c47218e03ec35022d9f0e071a16dc844" +checksum = "243739585d11f81daf8dac8d9f3d18cc7898f6c09a259675fc364b382c30e0a5" dependencies = [ "encoding_rs", "log", @@ -7988,9 +8008,9 @@ dependencies = [ [[package]] name = "symphonia-format-ogg" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ada3505789516bcf00fc1157c67729eded428b455c27ca370e41f4d785bfa931" +checksum = "2b4955c67c1ed3aa8ae8428d04ca8397fbef6a19b2b051e73b5da8b1435639cb" dependencies = [ "log", "symphonia-core", @@ -8000,9 +8020,9 @@ dependencies = [ [[package]] name = "symphonia-format-riff" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05f7be232f962f937f4b7115cbe62c330929345434c834359425e043bfd15f50" +checksum = "c2d7c3df0e7d94efb68401d81906eae73c02b40d5ec1a141962c592d0f11a96f" dependencies = [ "extended", "log", @@ -8012,9 +8032,9 @@ dependencies = [ [[package]] name = "symphonia-metadata" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc622b9841a10089c5b18e99eb904f4341615d5aa55bbf4eedde1be721a4023c" +checksum = "36306ff42b9ffe6e5afc99d49e121e0bd62fe79b9db7b9681d48e29fa19e6b16" dependencies = [ "encoding_rs", "lazy_static", @@ -8024,9 +8044,9 @@ dependencies = [ [[package]] name = "symphonia-utils-xiph" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "484472580fa49991afda5f6550ece662237b00c6f562c7d9638d1b086ed010fe" +checksum = "ee27c85ab799a338446b68eec77abf42e1a6f1bb490656e121c6e27bfbab9f16" dependencies = [ "symphonia-core", "symphonia-metadata", @@ -9281,6 +9301,12 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "unty" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" + [[package]] name = "ureq" version = "2.12.1" @@ -9515,6 +9541,12 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "virtue" +version = "0.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" + [[package]] name = "vob" version = "3.0.6" @@ -9730,9 +9762,9 @@ dependencies = [ [[package]] name = "widestring" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd7cf3379ca1aac9eea11fba24fd7e315d621f8dfe35c8d7d2be8b793726e07d" +checksum = "72069c3113ab32ab29e5584db3c6ec55d416895e60715417b5b883a357c3e471" [[package]] name = "winapi" diff --git a/Cargo.toml b/Cargo.toml index 256248beb4..e9c3dc8376 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -63,6 +63,7 @@ chrono = { version = "0.4", 
default-features = false, features = [ "now", "serde", ] } +cudarc = { version = "0.17.1", features = ["cuda-12020"] } derive_builder = { version = "0.20" } derive-getters = { version = "0.5" } either = { version = "1.13", features = ["serde"] } diff --git a/lib/bindings/python/Cargo.lock b/lib/bindings/python/Cargo.lock index 4586591b15..7576ead2c0 100644 --- a/lib/bindings/python/Cargo.lock +++ b/lib/bindings/python/Cargo.lock @@ -515,6 +515,26 @@ dependencies = [ "serde", ] +[[package]] +name = "bincode" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" +dependencies = [ + "bincode_derive", + "serde", + "unty", +] + +[[package]] +name = "bincode_derive" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" +dependencies = [ + "virtue", +] + [[package]] name = "bindgen" version = "0.69.5" @@ -1103,15 +1123,6 @@ dependencies = [ "typenum", ] -[[package]] -name = "cudarc" -version = "0.16.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17200eb07e7d85a243aa1bf4569a7aa998385ba98d14833973a817a63cc86e92" -dependencies = [ - "libloading", -] - [[package]] name = "cudarc" version = "0.17.2" @@ -1452,6 +1463,15 @@ dependencies = [ "uuid", ] +[[package]] +name = "dynamo-kvbm-kernels" +version = "0.6.0" +dependencies = [ + "cc", + "cudarc", + "once_cell", +] + [[package]] name = "dynamo-llm" version = "0.6.0" @@ -1459,6 +1479,7 @@ dependencies = [ "ahash", "aho-corasick", "akin", + "aligned-vec", "anyhow", "async-nats", "async-stream", @@ -1466,7 +1487,7 @@ dependencies = [ "async_zmq", "axum", "axum-server", - "bincode", + "bincode 2.0.1", "bitflags 2.9.3", "blake3", "bs62", @@ -1474,12 +1495,13 @@ dependencies = [ "bytes", "candle-core", "chrono", - "cudarc 0.17.2", + "cudarc", "dashmap", "derive-getters", "derive_builder", "dialoguer", "dynamo-async-openai", + "dynamo-kvbm-kernels", "dynamo-parsers", "dynamo-runtime", "either", @@ -1560,7 +1582,7 @@ dependencies = [ "anyhow", "async-stream", "async-trait", - "cudarc 0.16.6", + "cudarc", "derive-getters", "dlpark", "dynamo-async-openai", @@ -1602,7 +1624,7 @@ dependencies = [ "async-trait", "async_zmq", "axum", - "bincode", + "bincode 1.3.3", "blake3", "bytes", "chrono", @@ -6825,6 +6847,12 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "unty" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" + [[package]] name = "ureq" version = "2.12.1" @@ -6981,6 +7009,12 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "virtue" +version = "0.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" + [[package]] name = "walkdir" version = "2.5.0" diff --git a/lib/bindings/python/Cargo.toml b/lib/bindings/python/Cargo.toml index bb978a37ee..459663c5af 100644 --- a/lib/bindings/python/Cargo.toml +++ b/lib/bindings/python/Cargo.toml @@ -73,7 +73,7 @@ pyo3-async-runtimes = { version = "0.23.0", default-features = false, features = 
pythonize = "0.23" dlpark = { version = "0.5", features = ["pyo3", "half"], optional = true } -cudarc = { version = "0.16.2", features = ["cuda-12020"], optional = true } +cudarc = { version = "0.17.1", features = ["cuda-12020"], optional = true } prometheus = "0.14.0" diff --git a/lib/llm/Cargo.toml b/lib/llm/Cargo.toml index d74056150b..cebf062be6 100644 --- a/lib/llm/Cargo.toml +++ b/lib/llm/Cargo.toml @@ -21,7 +21,7 @@ testing-full = ["testing-cuda", "testing-nixl"] testing-cuda = ["dep:cudarc"] testing-nixl = ["dep:nixl-sys"] testing-etcd = [] -block-manager = ["dep:nixl-sys", "dep:cudarc", "dep:ndarray", "dep:nix"] +block-manager = ["dep:nixl-sys", "dep:cudarc", "dep:ndarray", "dep:nix", "dep:aligned-vec"] cuda = ["dep:cudarc"] integration = ["dynamo-runtime/integration"] @@ -85,7 +85,7 @@ offset-allocator = "0.2" regex = "1" rayon = "1" dashmap = { version = "5.5.3" } -bincode = "1" +bincode = { version = "2.0.1", features = ["serde", "derive"] } # input/text dialoguer = { version = "0.11", default-features = false, features = [ @@ -94,11 +94,13 @@ dialoguer = { version = "0.11", default-features = false, features = [ ] } # block_manager +aligned-vec = { version = "0.6.4", optional = true } nixl-sys = { version = "=0.6.0", optional = true } -cudarc = { version = "0.17.1", features = ["cuda-12020"], optional = true } +cudarc = { workspace = true, optional = true } ndarray = { version = "0.16", optional = true } nix = { version = "0.26", optional = true } + # protocols unicode-segmentation = "1.12" @@ -163,7 +165,7 @@ insta = { version = "1.41", features = [ "redactions", "filters", ] } -aligned-vec = "0.6.4" + lazy_static = "1.4" [build-dependencies] diff --git a/lib/llm/benches/transfer_context_v2.rs b/lib/llm/benches/transfer_context_v2.rs index 22a76b01cc..69e3d20f8e 100644 --- a/lib/llm/benches/transfer_context_v2.rs +++ b/lib/llm/benches/transfer_context_v2.rs @@ -7,7 +7,7 @@ mod benchmarks { use criterion::{BenchmarkId, Criterion, criterion_group}; use cudarc::driver::{CudaContext, CudaStream}; - use nixl_sys; + use tokio::runtime::Runtime; use tokio_util::task::TaskTracker; diff --git a/lib/llm/src/block_manager.rs b/lib/llm/src/block_manager.rs index 0670f95e2d..edab3e71ec 100644 --- a/lib/llm/src/block_manager.rs +++ b/lib/llm/src/block_manager.rs @@ -20,6 +20,7 @@ pub mod numa_allocator; pub mod offload; pub mod pool; pub mod storage; +pub mod v2; // dynamo rt integration pub mod controller; @@ -326,18 +327,6 @@ mod tests { .unwrap() } - pub async fn create_reference_block_manager_with_counts( - device: usize, - host: usize, - disk: usize, - ) -> ReferenceBlockManager { - ReferenceBlockManager::new(create_reference_block_manager_config_with_counts( - device, host, disk, - )) - .await - .unwrap() - } - #[tokio::test] async fn test_reference_block_manager_inherited_async_runtime() { dynamo_runtime::logging::init(); diff --git a/lib/llm/src/block_manager/block/transfer/context.rs b/lib/llm/src/block_manager/block/transfer/context.rs index 36ad83a4c0..7d8e97c340 100644 --- a/lib/llm/src/block_manager/block/transfer/context.rs +++ b/lib/llm/src/block_manager/block/transfer/context.rs @@ -563,11 +563,11 @@ pub mod v2 { tracker.spawn(async move { let event = ctx_clone .record_event() - .expect(&format!("Failed to record event {}", i)); + .unwrap_or_else(|_| panic!("Failed to record event {}", i)); event .synchronize() .await - .expect(&format!("Failed to sync event {}", i)); + .unwrap_or_else(|_| panic!("Failed to sync event {}", i)); }); } @@ -575,26 +575,6 @@ pub mod v2 { 
tracker.wait().await; } - #[tokio::test] - async fn test_performance_baseline() { - let ctx = setup_context(); - let start = std::time::Instant::now(); - - // Test a reasonable number of synchronizations - for _ in 0..10 { - let event = ctx.record_event().expect("Failed to record event"); - event.synchronize().await.expect("Sync failed"); - } - - let duration = start.elapsed(); - // Should complete 10 synchronizations in reasonable time (< 1ms total) - assert!( - duration < std::time::Duration::from_millis(1), - "Performance regression: took {:?} for 10 syncs", - duration - ); - } - #[tokio::test] async fn test_error_handling() { let ctx = setup_context(); diff --git a/lib/llm/src/block_manager/distributed/worker.rs b/lib/llm/src/block_manager/distributed/worker.rs index 8b3890e0d2..2fc927db2c 100644 --- a/lib/llm/src/block_manager/distributed/worker.rs +++ b/lib/llm/src/block_manager/distributed/worker.rs @@ -185,10 +185,13 @@ struct WorkerMetadataHandler { #[async_trait] impl Handler for WorkerMetadataHandler { async fn handle(&self, mut message: MessageHandle) -> anyhow::Result<()> { - let payload = bincode::serialize(&WorkerMetadata { - num_device_blocks: self.num_device_blocks, - bytes_per_block: self.bytes_per_block, - })?; + let payload = bincode::serde::encode_to_vec( + &WorkerMetadata { + num_device_blocks: self.num_device_blocks, + bytes_per_block: self.bytes_per_block, + }, + bincode::config::standard(), + )?; message .reply(ZMQ_WORKER_METADATA_MESSAGE, &[payload]) .await?; @@ -226,8 +229,11 @@ impl Handler for LeaderMetadataHandler { ); return Ok(()); } - let leader_meta: LeaderMetadata = match bincode::deserialize(&message.data[0]) { - Ok(m) => m, + let leader_meta: LeaderMetadata = match bincode::serde::decode_from_slice( + &message.data[0], + bincode::config::standard(), + ) { + Ok((m, _)) => m, Err(e) => { tracing::error!("leader_metadata: bad payload: {e:#}"); return Ok(()); diff --git a/lib/llm/src/block_manager/distributed/zmq.rs b/lib/llm/src/block_manager/distributed/zmq.rs index d2e19322fd..5a48bb5f3d 100644 --- a/lib/llm/src/block_manager/distributed/zmq.rs +++ b/lib/llm/src/block_manager/distributed/zmq.rs @@ -166,14 +166,18 @@ impl ZmqActiveMessageLeader { } }; - let workers: Vec = workers_payloads - .into_iter() - .map(|b| bincode::deserialize::(&b)) - .collect::>()?; + let mut workers: Vec = Vec::with_capacity(workers_payloads.len()); + + for payload in workers_payloads { + let worker: WorkerMetadata = + bincode::serde::decode_from_slice(&payload, bincode::config::standard())?.0; + workers.push(worker); + } // 2) Compute & broadcast LeaderMetadata; wait for ALL acks in the SAME round. 
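+        // bincode 2.x note: serde types are encoded/decoded via bincode::serde::encode_to_vec
+        // and decode_from_slice, and an explicit config (standard() here) must be passed;
+        // decode_from_slice also returns the number of bytes consumed alongside the value.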
let leader_meta = make_leader_meta(&workers); - let leader_meta_bytes = bincode::serialize(&leader_meta)?; + let leader_meta_bytes = + bincode::serde::encode_to_vec(&leader_meta, bincode::config::standard())?; loop { if Instant::now() >= deadline { diff --git a/lib/llm/src/block_manager/offload.rs b/lib/llm/src/block_manager/offload.rs index 57f3553bf2..1ad258d002 100644 --- a/lib/llm/src/block_manager/offload.rs +++ b/lib/llm/src/block_manager/offload.rs @@ -693,7 +693,7 @@ impl OffloadFiltersBuilder { } } -#[cfg(all(test, feature = "testing-cuda"))] +#[cfg(all(test, feature = "testing-cuda", feature = "testing-nixl"))] mod tests { use super::*; @@ -713,8 +713,7 @@ mod tests { use nixl_sys::{MemoryRegion, NixlDescriptor}; use aligned_vec::avec; - use cudarc::runtime::sys::{cudaMemcpy, cudaMemcpyKind, cudaMemset}; - use prometheus::Registry; + use cudarc::runtime::sys::{cudaDeviceSynchronize, cudaMemcpy, cudaMemcpyKind, cudaMemset}; use rstest::*; use std::fs::File; use std::io::{Read, Seek, SeekFrom, Write}; @@ -1286,6 +1285,8 @@ mod tests { // Check that this is the same block. check_block_contents(&immutable_host_block, &device_blocks[0], 42)?; + tokio::time::sleep(std::time::Duration::from_millis(100)).await; + Ok(()) } diff --git a/lib/llm/src/block_manager/v2.rs b/lib/llm/src/block_manager/v2.rs new file mode 100644 index 0000000000..51eb8e2a8f --- /dev/null +++ b/lib/llm/src/block_manager/v2.rs @@ -0,0 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +pub mod kernels; +pub mod memory; +pub mod physical; diff --git a/lib/llm/src/block_manager/v2/kernels/mod.rs b/lib/llm/src/block_manager/v2/kernels/mod.rs new file mode 100644 index 0000000000..5db3a820ee --- /dev/null +++ b/lib/llm/src/block_manager/v2/kernels/mod.rs @@ -0,0 +1,56 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Safe-ish wrappers around the CUDA block/universal packing kernels. +//! +//! The core ideas: +//! * A “block” represents the stack of `nl * no` tensors arranged either as NHD +//! (inner axes `[nt, nh, hd]`) or HND (inner axes `[nh, nt, hd]`). +//! * A “universal” tensor is `[nh, nl, no, nt, hd]` stored contiguously. +//! * An “operational” tensor is `[nl, no, inner]` with `inner = nt * nh * hd`. +//! +//! Host code calls these helpers with flattened pointer tables so a single +//! launch can move many logical blocks in one go. + +#![allow(dead_code)] +#![allow(clippy::missing_safety_doc)] + +/// Numeric tags passed across the FFI boundary to select the CUDA template. +#[repr(i32)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum TensorDataType { + F16 = 0, + BF16 = 1, + F32 = 2, + F64 = 3, +} + +/// Identifies how each `[nt, nh, hd]` chunk is laid out in device memory. +#[repr(i32)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum BlockLayout { + NHD = 0, + HND = 1, +} + +/// Direction flag for copying between block stacks and operational buffers. +#[repr(i32)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum OperationalCopyDirection { + BlockToOperational = 0, + OperationalToBlock = 1, +} + +/// Selects how the operational copy should move data. +#[repr(i32)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum OperationalCopyBackend { + /// Try cudaMemcpyBatchAsync, fall back to cudaMemcpyAsync, then the kernel. + Auto = 0, + /// Force the custom CUDA kernel path. 
+ KernelOnly = 1, + /// Issue one cudaMemcpyAsync per chunk. + MemcpyAsync = 2, + /// Invoke cudaMemcpyBatchAsync directly. + MemcpyBatch = 3, +} diff --git a/lib/llm/src/block_manager/v2/memory/actions.rs b/lib/llm/src/block_manager/v2/memory/actions.rs new file mode 100644 index 0000000000..98fdbce61b --- /dev/null +++ b/lib/llm/src/block_manager/v2/memory/actions.rs @@ -0,0 +1,221 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Storage actions. + +use super::{MemoryRegion, StorageError}; + +/// Extension trait for storage types that support memory setting operations +pub trait Memset: MemoryRegion { + /// Sets a region of memory to a specific value + /// + /// # Arguments + /// * `value` - The value to set (will be truncated to u8) + /// * `offset` - Offset in bytes from the start of the storage + /// * `size` - Number of bytes to set + /// + /// # Safety + /// The caller must ensure: + /// - offset + size <= self.size() + /// - No other references exist to the memory region being set + fn memset(&mut self, value: u8, offset: usize, size: usize) -> Result<(), StorageError>; +} + +/// Extension trait for storage types that support slicing operations +pub trait Slice { + /// Returns an immutable byte slice view of the entire storage region + /// + /// # Safety + /// The caller must ensure: + /// - The memory region is valid and initialized + /// - No concurrent mutable access occurs while the slice is in use + fn as_slice(&self) -> Result<&[u8], StorageError>; + + /// Returns an immutable byte slice view of a subregion + /// + /// # Arguments + /// * `offset` - Offset in bytes from the start of the storage + /// * `len` - Number of bytes to slice + /// + /// # Safety + /// The caller must ensure: + /// - offset + len <= self.size() + /// - The memory region is valid and initialized + /// - No concurrent mutable access occurs while the slice is in use + fn slice(&self, offset: usize, len: usize) -> Result<&[u8], StorageError> { + let slice = self.as_slice()?; + + // validate offset and len + if offset.saturating_add(len) > slice.len() { + return Err(StorageError::Unsupported("slice out of bounds".into())); + } + + slice + .get(offset..offset.saturating_add(len)) + .ok_or_else(|| StorageError::Unsupported("slice out of bounds".into())) + } + + /// Returns a typed immutable slice view of the entire storage region + /// + /// # Safety + /// The caller must ensure: + /// - The memory region is valid and initialized + /// - The memory is properly aligned for type T + /// - The size is a multiple of `size_of::()` + /// - No concurrent mutable access occurs while the slice is in use + /// - The data represents valid values of type T + fn as_slice_typed(&self) -> Result<&[T], StorageError> { + let bytes = self.as_slice()?; + let ptr = bytes.as_ptr() as *const T; + let len = bytes.len() / std::mem::size_of::(); + + if !(bytes.as_ptr() as usize).is_multiple_of(std::mem::align_of::()) { + return Err(StorageError::Unsupported(format!( + "memory not aligned for type (required alignment: {})", + std::mem::align_of::() + ))); + } + + if bytes.len() % std::mem::size_of::() != 0 { + return Err(StorageError::Unsupported(format!( + "size {} is not a multiple of type size {}", + bytes.len(), + std::mem::size_of::() + ))); + } + + // SAFETY: Caller guarantees memory is valid, aligned, and properly initialized for T + Ok(unsafe { std::slice::from_raw_parts(ptr, len) }) + } + + /// Returns a typed 
immutable slice view of a subregion + /// + /// # Arguments + /// * `offset` - Offset in bytes from the start of the storage + /// * `len` - Number of elements of type T to slice + /// + /// # Safety + /// The caller must ensure: + /// - offset + (len * size_of::()) <= self.size() + /// - offset is properly aligned for type T + /// - The memory region is valid and initialized + /// - No concurrent mutable access occurs while the slice is in use + /// - The data represents valid values of type T + fn slice_typed(&self, offset: usize, len: usize) -> Result<&[T], StorageError> { + let type_size = std::mem::size_of::(); + let byte_len = len + .checked_mul(type_size) + .ok_or_else(|| StorageError::Unsupported("length overflow".into()))?; + + let bytes = self.slice(offset, byte_len)?; + let ptr = bytes.as_ptr() as *const T; + + if !(bytes.as_ptr() as usize).is_multiple_of(std::mem::align_of::()) { + return Err(StorageError::Unsupported(format!( + "memory not aligned for type (required alignment: {})", + std::mem::align_of::() + ))); + } + + // SAFETY: Caller guarantees memory is valid, aligned, and properly initialized for T + Ok(unsafe { std::slice::from_raw_parts(ptr, len) }) + } +} + +pub trait SliceMut { + /// Returns a mutable byte slice view of the entire storage region + /// + /// # Safety + /// The caller must ensure: + /// - The memory region is valid + /// - No other references (mutable or immutable) exist to this memory region + fn as_slice_mut(&mut self) -> Result<&mut [u8], StorageError>; + + /// Returns a mutable byte slice view of a subregion + /// + /// # Arguments + /// * `offset` - Offset in bytes from the start of the storage + /// * `len` - Number of bytes to slice + /// + /// # Safety + /// The caller must ensure: + /// - offset + len <= self.size() + /// - The memory region is valid + /// - No other references (mutable or immutable) exist to this memory region + fn slice_mut(&mut self, offset: usize, len: usize) -> Result<&mut [u8], StorageError> { + let slice = self.as_slice_mut()?; + + // validate offset and len + if offset.saturating_add(len) > slice.len() { + return Err(StorageError::Unsupported("slice out of bounds".into())); + } + + slice + .get_mut(offset..offset.saturating_add(len)) + .ok_or_else(|| StorageError::Unsupported("slice out of bounds".into())) + } + + /// Returns a typed mutable slice view of the entire storage region + /// + /// # Safety + /// The caller must ensure: + /// - The memory region is valid + /// - The memory is properly aligned for type T + /// - The size is a multiple of `size_of::()` + /// - No other references (mutable or immutable) exist to this memory region + fn as_slice_typed_mut(&mut self) -> Result<&mut [T], StorageError> { + let bytes = self.as_slice_mut()?; + let ptr = bytes.as_mut_ptr() as *mut T; + let len = bytes.len() / std::mem::size_of::(); + + if !(bytes.as_ptr() as usize).is_multiple_of(std::mem::align_of::()) { + return Err(StorageError::Unsupported(format!( + "memory not aligned for type (required alignment: {})", + std::mem::align_of::() + ))); + } + + if bytes.len() % std::mem::size_of::() != 0 { + return Err(StorageError::Unsupported(format!( + "size {} is not a multiple of type size {}", + bytes.len(), + std::mem::size_of::() + ))); + } + + // SAFETY: Caller guarantees memory is valid, aligned, and no aliasing + Ok(unsafe { std::slice::from_raw_parts_mut(ptr, len) }) + } + + /// Returns a typed mutable slice view of a subregion + /// + /// # Arguments + /// * `offset` - Offset in bytes from the start of the storage + 
/// * `len` - Number of elements of type T to slice + /// + /// # Safety + /// The caller must ensure: + /// - offset + (len * size_of::()) <= self.size() + /// - offset is properly aligned for type T + /// - The memory region is valid + /// - No other references (mutable or immutable) exist to this memory region + fn slice_typed_mut(&mut self, offset: usize, len: usize) -> Result<&mut [T], StorageError> { + let type_size = std::mem::size_of::(); + let byte_len = len + .checked_mul(type_size) + .ok_or_else(|| StorageError::Unsupported("length overflow".into()))?; + + let bytes = self.slice_mut(offset, byte_len)?; + let ptr = bytes.as_mut_ptr() as *mut T; + + if !(bytes.as_ptr() as usize).is_multiple_of(std::mem::align_of::()) { + return Err(StorageError::Unsupported(format!( + "memory not aligned for type (required alignment: {})", + std::mem::align_of::() + ))); + } + + // SAFETY: Caller guarantees memory is valid, aligned, and no aliasing + Ok(unsafe { std::slice::from_raw_parts_mut(ptr, len) }) + } +} diff --git a/lib/llm/src/block_manager/v2/memory/device.rs b/lib/llm/src/block_manager/v2/memory/device.rs new file mode 100644 index 0000000000..6f8a88daa4 --- /dev/null +++ b/lib/llm/src/block_manager/v2/memory/device.rs @@ -0,0 +1,115 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! CUDA device memory storage. + +use super::{MemoryRegion, Result, StorageError, StorageKind}; +use cudarc::driver::CudaContext; +use std::any::Any; +use std::collections::HashMap; +use std::sync::{Arc, Mutex, OnceLock}; + +/// Get or create a CUDA context for the given device. +fn cuda_context(device_id: u32) -> Result> { + static CONTEXTS: OnceLock>>> = OnceLock::new(); + let mut map = CONTEXTS.get_or_init(Default::default).lock().unwrap(); + + if let Some(existing) = map.get(&device_id) { + return Ok(existing.clone()); + } + + let ctx = CudaContext::new(device_id as usize)?; + map.insert(device_id, ctx.clone()); + Ok(ctx) +} + +/// CUDA device memory allocated via cudaMalloc. +#[derive(Debug)] +pub struct DeviceStorage { + ctx: Arc, + ptr: u64, + device_id: u32, + len: usize, +} + +unsafe impl Send for DeviceStorage {} +unsafe impl Sync for DeviceStorage {} + +impl DeviceStorage { + /// Allocate new device memory of the given size. + /// + /// # Arguments + /// * `len` - Size in bytes to allocate + /// * `device_id` - CUDA device on which to allocate + pub fn new(len: usize, device_id: u32) -> Result { + if len == 0 { + return Err(StorageError::AllocationFailed( + "zero-sized allocations are not supported".into(), + )); + } + + let ctx = cuda_context(device_id)?; + ctx.bind_to_thread().map_err(StorageError::Cuda)?; + let ptr = unsafe { cudarc::driver::result::malloc_sync(len).map_err(StorageError::Cuda)? }; + + Ok(Self { + ctx, + ptr, + device_id, + len, + }) + } + + /// Get the device pointer value. + pub fn device_ptr(&self) -> u64 { + self.ptr + } + + /// Get the CUDA device ID this memory is allocated on. 
+ pub fn device_id(&self) -> u32 { + self.device_id + } +} + +impl Drop for DeviceStorage { + fn drop(&mut self) { + if let Err(e) = self.ctx.bind_to_thread() { + tracing::debug!("failed to bind CUDA context for free: {e}"); + } + unsafe { + if let Err(e) = cudarc::driver::result::free_sync(self.ptr) { + tracing::debug!("failed to free device memory: {e}"); + } + }; + } +} + +impl MemoryRegion for DeviceStorage { + fn addr(&self) -> usize { + self.device_ptr() as usize + } + + fn size(&self) -> usize { + self.len + } + + fn storage_kind(&self) -> StorageKind { + StorageKind::Device(self.device_id) + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +// Support for NIXL registration +impl super::registered::NixlCompatible for DeviceStorage { + fn nixl_params(&self) -> (*const u8, usize, nixl_sys::MemType, u64) { + ( + self.ptr as *const u8, + self.len, + nixl_sys::MemType::Vram, + self.device_id as u64, + ) + } +} diff --git a/lib/llm/src/block_manager/v2/memory/disk.rs b/lib/llm/src/block_manager/v2/memory/disk.rs new file mode 100644 index 0000000000..a0ce440746 --- /dev/null +++ b/lib/llm/src/block_manager/v2/memory/disk.rs @@ -0,0 +1,362 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Disk-backed memory storage using memory-mapped files. + +use super::{MemoryRegion, Result, StorageError, StorageKind}; +use std::any::Any; +use std::path::{Path, PathBuf}; + +use core::ffi::c_char; +use nix::fcntl::{FallocateFlags, fallocate}; +use nix::unistd::unlink; +use std::ffi::CString; + +const DISK_CACHE_KEY: &str = "DYN_KVBM_DISK_CACHE_DIR"; +const DEFAULT_DISK_CACHE_DIR: &str = "/tmp/"; + +#[derive(Debug)] +pub struct DiskStorage { + fd: u64, + path: PathBuf, + size: usize, + unlinked: bool, +} + +impl DiskStorage { + pub fn new(size: usize) -> Result { + // We need to open our file with some special flags that aren't supported by the tempfile crate. + // Instead, we'll use the mkostemp function to create a temporary file with the correct flags. 
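+        // mkostemp replaces the trailing "XXXXXX" in the template with a unique suffix and
+        // opens the file; O_DIRECT is requested because GDS-style transfers need unbuffered,
+        // page-aligned I/O against the backing file.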
+ + let specified_dir = + std::env::var(DISK_CACHE_KEY).unwrap_or_else(|_| DEFAULT_DISK_CACHE_DIR.to_string()); + let file_path = Path::new(&specified_dir).join("dynamo-kvbm-disk-cache-XXXXXX"); + + Self::new_at(file_path, size) + } + + pub fn new_at(path: impl AsRef, len: usize) -> Result { + if len == 0 { + return Err(StorageError::AllocationFailed( + "zero-sized allocations are not supported".into(), + )); + } + + let file_path = path.as_ref().to_path_buf(); + + if !file_path.exists() { + std::fs::create_dir_all(file_path.parent().unwrap()).unwrap(); + } + + tracing::debug!("Allocating disk cache file at {}", file_path.display()); + + let path_str = file_path.to_str().unwrap(); + let is_template = path_str.contains("XXXXXX"); + + let (raw_fd, actual_path) = if is_template { + // Template path - use mkostemp to generate unique filename + let template = CString::new(path_str).unwrap(); + let mut template_bytes = template.into_bytes_with_nul(); + + let fd = unsafe { + nix::libc::mkostemp( + template_bytes.as_mut_ptr() as *mut c_char, + nix::libc::O_RDWR | nix::libc::O_DIRECT, + ) + }; + + if fd == -1 { + return Err(StorageError::AllocationFailed(format!( + "mkostemp failed: {}", + std::io::Error::last_os_error() + ))); + } + + // Extract the actual path created by mkostemp + let actual = PathBuf::from( + CString::from_vec_with_nul(template_bytes) + .unwrap() + .to_str() + .unwrap(), + ); + + (fd, actual) + } else { + // Specific path - use open with O_CREAT + let path_cstr = CString::new(path_str).unwrap(); + let fd = unsafe { + nix::libc::open( + path_cstr.as_ptr(), + nix::libc::O_CREAT | nix::libc::O_RDWR | nix::libc::O_DIRECT, + 0o644, + ) + }; + + if fd == -1 { + return Err(StorageError::AllocationFailed(format!( + "open failed: {}", + std::io::Error::last_os_error() + ))); + } + + (fd, file_path) + }; + + // We need to use fallocate to actually allocate the storage and create the blocks on disk. + fallocate(raw_fd, FallocateFlags::empty(), 0, len as i64).map_err(|e| { + StorageError::AllocationFailed(format!("Failed to allocate temp file: {}", e)) + })?; + + Ok(Self { + fd: raw_fd as u64, + path: actual_path, + size: len, + unlinked: false, + }) + } + + pub fn fd(&self) -> u64 { + self.fd + } + + pub fn path(&self) -> &Path { + self.path.as_path() + } + + /// Unlink our temp file. + /// This means that when this process terminates, the file will be automatically deleted by the OS. + /// Unfortunately, GDS requires that files we try to register must be linked. + /// To get around this, we unlink the file only after we've registered it with NIXL. 
+ pub fn unlink(&mut self) -> Result<()> { + if self.unlinked { + return Ok(()); + } + + unlink(self.path.as_path()) + .map_err(|e| StorageError::AllocationFailed(format!("Failed to unlink file: {}", e)))?; + self.unlinked = true; + Ok(()) + } + + pub fn unlinked(&self) -> bool { + self.unlinked + } +} + +impl Drop for DiskStorage { + fn drop(&mut self) { + let _ = self.unlink(); + } +} + +impl MemoryRegion for DiskStorage { + fn addr(&self) -> usize { + 0 + } + + fn size(&self) -> usize { + self.size + } + + fn storage_kind(&self) -> StorageKind { + StorageKind::Disk(self.fd) + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +// Support for NIXL registration +impl super::registered::NixlCompatible for DiskStorage { + fn nixl_params(&self) -> (*const u8, usize, nixl_sys::MemType, u64) { + #[cfg(unix)] + { + // Use file descriptor as device_id for MemType::File + ( + std::ptr::null(), + self.size, + nixl_sys::MemType::File, + self.fd, + ) + } + + #[cfg(not(unix))] + { + // On non-Unix systems, we can't get the file descriptor easily + // Return device_id as 0 - registration will fail on these systems + ( + self.mmap.as_ptr(), + self.mmap.len(), + nixl_sys::MemType::File, + 0, + ) + } + } +} + +// mod mmap { +// use super::*; + +// #[cfg(unix)] +// use std::os::unix::io::AsRawFd; + +// use memmap2::{MmapMut, MmapOptions}; +// use std::fs::{File, OpenOptions}; +// use tempfile::NamedTempFile; + +// /// Disk-backed storage using memory-mapped files. +// #[derive(Debug)] +// pub struct MemMappedFileStorage { +// _file: File, // Keep file alive for the lifetime of the mmap +// mmap: MmapMut, +// path: PathBuf, +// #[cfg(unix)] +// fd: i32, +// } + +// unsafe impl Send for MemMappedFileStorage {} +// unsafe impl Sync for MemMappedFileStorage {} + +// impl MemMappedFileStorage { +// /// Create new disk storage with a temporary file. +// pub fn new_temp(len: usize) -> Result { +// if len == 0 { +// return Err(StorageError::AllocationFailed( +// "zero-sized allocations are not supported".into(), +// )); +// } + +// // Create temporary file +// let temp_file = NamedTempFile::new()?; +// let path = temp_file.path().to_path_buf(); +// let file = temp_file.into_file(); + +// // Set file size +// file.set_len(len as u64)?; + +// #[cfg(unix)] +// let fd = file.as_raw_fd(); + +// // Memory map the file +// let mmap = unsafe { MmapOptions::new().len(len).map_mut(&file)? }; + +// Ok(Self { +// _file: file, +// mmap, +// path, +// #[cfg(unix)] +// fd, +// }) +// } + +// /// Create new disk storage with a specific file path. +// pub fn new_at(path: impl AsRef, len: usize) -> Result { +// if len == 0 { +// return Err(StorageError::AllocationFailed( +// "zero-sized allocations are not supported".into(), +// )); +// } + +// let path = path.as_ref().to_path_buf(); + +// // Create or open file +// let file = OpenOptions::new() +// .read(true) +// .write(true) +// .create(true) +// .open(&path)?; + +// // Set file size +// file.set_len(len as u64)?; + +// #[cfg(unix)] +// let fd = file.as_raw_fd(); + +// // Memory map the file +// let mmap = unsafe { MmapOptions::new().len(len).map_mut(&file)? }; + +// Ok(Self { +// _file: file, +// mmap, +// path, +// #[cfg(unix)] +// fd, +// }) +// } + +// /// Get the path to the backing file. +// pub fn path(&self) -> &Path { +// &self.path +// } + +// /// Get the file descriptor (Unix only). +// #[cfg(unix)] +// pub fn fd(&self) -> i32 { +// self.fd +// } + +// /// Get a pointer to the memory-mapped region. 
+// /// +// /// # Safety +// /// The caller must ensure the pointer is not used after this storage is dropped. +// pub unsafe fn as_ptr(&self) -> *const u8 { +// self.mmap.as_ptr() +// } + +// /// Get a mutable pointer to the memory-mapped region. +// /// +// /// # Safety +// /// The caller must ensure the pointer is not used after this storage is dropped +// /// and that there are no other references to this memory. +// pub unsafe fn as_mut_ptr(&mut self) -> *mut u8 { +// self.mmap.as_mut_ptr() +// } +// } + +// impl MemoryRegion for MemMappedFileStorage { +// fn addr(&self) -> usize { +// self.mmap.as_ptr() as usize +// } + +// fn size(&self) -> usize { +// self.mmap.len() +// } + +// fn storage_kind(&self) -> StorageKind { +// StorageKind::Disk +// } + +// fn as_any(&self) -> &dyn Any { +// self +// } +// } + +// // Support for NIXL registration +// impl super::super::registered::NixlCompatible for MemMappedFileStorage { +// fn nixl_params(&self) -> (*const u8, usize, nixl_sys::MemType, u64) { +// #[cfg(unix)] +// { +// // Use file descriptor as device_id for MemType::File +// ( +// self.mmap.as_ptr(), +// self.mmap.len(), +// nixl_sys::MemType::File, +// self.fd as u64, +// ) +// } + +// #[cfg(not(unix))] +// { +// // On non-Unix systems, we can't get the file descriptor easily +// // Return device_id as 0 - registration will fail on these systems +// ( +// self.mmap.as_ptr(), +// self.mmap.len(), +// nixl_sys::MemType::File, +// 0, +// ) +// } +// } +// } +// } diff --git a/lib/llm/src/block_manager/v2/memory/mod.rs b/lib/llm/src/block_manager/v2/memory/mod.rs new file mode 100644 index 0000000000..b9ae29358e --- /dev/null +++ b/lib/llm/src/block_manager/v2/memory/mod.rs @@ -0,0 +1,206 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Clean, minimal storage API for v2 block manager. +//! +//! This module provides a simplified storage abstraction with: +//! - Single trait for type erasure (`MemoryRegion`) +//! - Concrete storage types (no trait implementations required) +//! - Composition-based NIXL registration via `NixlRegistered` wrapper +//! - RAII with proper drop ordering (registration handle drops before memory) + +pub mod actions; + +mod device; +mod disk; +mod pinned; +mod registered; +mod system; +mod torch; + +#[cfg(test)] +mod tests; + +pub use device::DeviceStorage; +pub use disk::DiskStorage; +pub use pinned::PinnedStorage; +pub use registered::{ + NixlCompatible, NixlDescriptor, NixlRegistered, RegisteredView, register_with_nixl, +}; +pub use system::SystemStorage; +pub use torch::{TorchDevice, TorchTensor}; + +use serde::{Deserialize, Serialize}; +use std::any::Any; +use std::fmt; +use std::sync::Arc; +use thiserror::Error; + +/// Result type for storage operations. +pub type Result = std::result::Result; + +/// Errors that can occur during storage operations. +#[derive(Debug, Error)] +pub enum StorageError { + #[error("allocation failed: {0}")] + AllocationFailed(String), + + #[error("registration failed: {0}")] + RegistrationFailed(String), + + #[error("operation failed: {0}")] + OperationFailed(String), + + #[error("unsupported operation: {0}")] + Unsupported(String), + + #[error("I/O error: {0}")] + Io(#[from] std::io::Error), + + // #[cfg(feature = "cuda")] + #[error("CUDA error: {0}")] + Cuda(#[from] cudarc::driver::DriverError), + + #[error("NIXL error: {0}")] + Nixl(#[from] nixl_sys::NixlError), +} + +/// Storage type classification. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum StorageKind { + /// System memory (malloc) + System, + + /// CUDA pinned host memory + // #[cfg(feature = "cuda")] + Pinned, + + /// CUDA device memory with device ID + // #[cfg(feature = "cuda")] + Device(u32), + + /// Disk-backed memory (mmap) + Disk(u64), +} + +/// Core trait for memory regions that can be type-erased. +/// +/// This is the only trait in the storage API. Concrete storage types +/// implement this trait to enable type erasure via `Arc`. +pub trait MemoryRegion: Send + Sync + fmt::Debug { + /// Base address of the memory region. + fn addr(&self) -> usize; + + /// Size of the memory region in bytes. + fn size(&self) -> usize; + + /// Type of storage backing this region. + fn storage_kind(&self) -> StorageKind; + + /// Enable downcasting to concrete type. + fn as_any(&self) -> &dyn Any; + + /// Get the NIXL descriptor for this memory region. + fn nixl_descriptor(&self) -> Option { + None + } +} + +/// Type-erased memory region for use in layouts. +pub type OwnedMemoryRegion = Arc; + +/// Helper function to convert concrete storage to type-erased form. +pub fn erase_storage(storage: S) -> OwnedMemoryRegion { + Arc::new(storage) +} + +/// Simple memory region descriptor. +#[derive(Debug)] +pub struct OffsetMemoryRegion { + base: OwnedMemoryRegion, + offset: usize, + len: usize, +} + +impl OffsetMemoryRegion { + /// Create a new offset view into an existing memory region. + /// + /// Returns an error if the offset and length exceed the bounds of the base region. + pub fn new(base: OwnedMemoryRegion, offset: usize, len: usize) -> Result { + let end = offset + .checked_add(len) + .ok_or_else(|| StorageError::Unsupported("offset overflow".into()))?; + if end > base.size() { + return Err(StorageError::Unsupported( + "offset region exceeds base allocation bounds".into(), + )); + } + Ok(Self { base, offset, len }) + } + + /// Get the offset relative to the base mapping. + pub fn offset(&self) -> usize { + self.offset + } + + /// Get the length of the offset region. + pub fn len(&self) -> usize { + self.len + } + + /// Check if the offset region is empty. + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Access the underlying base region. + pub fn base(&self) -> &OwnedMemoryRegion { + &self.base + } +} + +impl MemoryRegion for OffsetMemoryRegion { + fn addr(&self) -> usize { + self.base.addr() + self.offset + } + + fn size(&self) -> usize { + self.len + } + + fn storage_kind(&self) -> StorageKind { + self.base.storage_kind() + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub struct MemoryDescriptor { + pub addr: usize, + pub size: usize, +} + +impl MemoryDescriptor { + pub fn new(addr: usize, size: usize) -> Self { + Self { addr, size } + } + + #[inline] + pub fn addr(&self) -> usize { + self.addr + } + + #[inline] + pub fn size(&self) -> usize { + self.size + } +} + +impl actions::Slice for MemoryDescriptor { + fn as_slice(&self) -> Result<&[u8]> { + Ok(unsafe { std::slice::from_raw_parts(self.addr as *const u8, self.size) }) + } +} diff --git a/lib/llm/src/block_manager/v2/memory/pinned.rs b/lib/llm/src/block_manager/v2/memory/pinned.rs new file mode 100644 index 0000000000..e6d83174ac --- /dev/null +++ b/lib/llm/src/block_manager/v2/memory/pinned.rs @@ -0,0 +1,139 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +//! CUDA pinned host memory storage. + +use super::{MemoryRegion, Result, StorageError, StorageKind, actions}; +use cudarc::driver::CudaContext; +use cudarc::driver::sys; +use std::any::Any; +use std::collections::HashMap; +use std::sync::{Arc, Mutex, OnceLock}; + +/// Get or create a CUDA context for the given device. +fn cuda_context(device_id: u32) -> Result> { + static CONTEXTS: OnceLock>>> = OnceLock::new(); + let mut map = CONTEXTS.get_or_init(Default::default).lock().unwrap(); + + if let Some(existing) = map.get(&device_id) { + return Ok(existing.clone()); + } + + let ctx = CudaContext::new(device_id as usize)?; + map.insert(device_id, ctx.clone()); + Ok(ctx) +} + +/// CUDA pinned host memory allocated via cudaHostAlloc. +#[derive(Debug)] +pub struct PinnedStorage { + ptr: usize, + len: usize, + ctx: Arc, +} + +unsafe impl Send for PinnedStorage {} +unsafe impl Sync for PinnedStorage {} + +impl PinnedStorage { + /// Allocate new pinned memory of the given size. + /// + /// # Arguments + /// * `len` - Size in bytes to allocate + /// * `device_id` - CUDA device to associate with the allocation + pub fn new(len: usize) -> Result { + if len == 0 { + return Err(StorageError::AllocationFailed( + "zero-sized allocations are not supported".into(), + )); + } + + let ctx = cuda_context(0)?; + let ptr = unsafe { + ctx.bind_to_thread().map_err(StorageError::Cuda)?; + + let ptr = cudarc::driver::result::malloc_host(len, sys::CU_MEMHOSTALLOC_WRITECOMBINED) + .map_err(StorageError::Cuda)?; + + let ptr = ptr as *mut u8; + assert!(!ptr.is_null(), "Failed to allocate pinned memory"); + assert!(ptr.is_aligned(), "Pinned memory is not aligned"); + assert!(len < isize::MAX as usize); + + ptr as usize + }; + + Ok(Self { ptr, len, ctx }) + } + + /// Get a pointer to the underlying memory. + /// + /// # Safety + /// The caller must ensure the pointer is not used after this storage is dropped. + pub unsafe fn as_ptr(&self) -> *const u8 { + self.ptr as *const u8 + } + + /// Get a mutable pointer to the underlying memory. + /// + /// # Safety + /// The caller must ensure the pointer is not used after this storage is dropped + /// and that there are no other references to this memory. 
+ pub unsafe fn as_mut_ptr(&mut self) -> *mut u8 { + self.ptr as *mut u8 + } +} + +impl Drop for PinnedStorage { + fn drop(&mut self) { + if let Err(e) = self.ctx.bind_to_thread() { + tracing::debug!("failed to bind CUDA context for free: {e}"); + } + unsafe { + if let Err(e) = cudarc::driver::result::free_host(self.ptr as _) { + tracing::debug!("failed to free pinned memory: {e}"); + } + }; + } +} + +impl MemoryRegion for PinnedStorage { + fn addr(&self) -> usize { + unsafe { self.as_ptr() as usize } + } + + fn size(&self) -> usize { + self.len + } + + fn storage_kind(&self) -> StorageKind { + StorageKind::Pinned + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +// Support for NIXL registration +impl super::registered::NixlCompatible for PinnedStorage { + fn nixl_params(&self) -> (*const u8, usize, nixl_sys::MemType, u64) { + let ptr = unsafe { self.as_ptr() }; + (ptr, self.len, nixl_sys::MemType::Dram, 0) + } +} + +impl actions::Memset for PinnedStorage { + fn memset(&mut self, value: u8, offset: usize, size: usize) -> Result<()> { + if offset + size > self.len { + return Err(StorageError::OperationFailed( + "memset: offset + size > storage size".into(), + )); + } + unsafe { + let ptr = (self.ptr as *mut u8).add(offset); + std::ptr::write_bytes(ptr, value, size); + } + Ok(()) + } +} diff --git a/lib/llm/src/block_manager/v2/memory/registered.rs b/lib/llm/src/block_manager/v2/memory/registered.rs new file mode 100644 index 0000000000..010088198a --- /dev/null +++ b/lib/llm/src/block_manager/v2/memory/registered.rs @@ -0,0 +1,195 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! NIXL registration wrapper for storage types. + +use super::{MemoryRegion, StorageKind}; +use nixl_sys::{Agent as NixlAgent, MemType, OptArgs, RegistrationHandle}; +use std::any::Any; +use std::fmt; + +/// Trait for storage types that can be registered with NIXL. +pub trait NixlCompatible { + /// Get parameters needed for NIXL registration. + /// + /// Returns (ptr, size, mem_type, device_id) + fn nixl_params(&self) -> (*const u8, usize, MemType, u64); +} + +/// NIXL descriptor containing registration information. +#[derive(Debug, Clone)] +pub struct NixlDescriptor { + pub addr: u64, + pub size: usize, + pub mem_type: MemType, + pub device_id: u64, +} + +impl nixl_sys::MemoryRegion for NixlDescriptor { + unsafe fn as_ptr(&self) -> *const u8 { + self.addr as *const u8 + } + + fn size(&self) -> usize { + self.size + } +} + +impl nixl_sys::NixlDescriptor for NixlDescriptor { + fn mem_type(&self) -> MemType { + self.mem_type + } + + fn device_id(&self) -> u64 { + self.device_id + } +} + +/// View trait for accessing registration information without unwrapping. +pub trait RegisteredView { + /// Get the name of the NIXL agent that registered this memory. + fn agent_name(&self) -> &str; + + /// Get the NIXL descriptor for this registered memory. + fn descriptor(&self) -> NixlDescriptor; +} + +/// Wrapper for storage that has been registered with NIXL. +/// +/// This wrapper ensures proper drop order: the registration handle is +/// dropped before the storage, ensuring deregistration happens before +/// the memory is freed. 
+pub struct NixlRegistered { + storage: S, + handle: Option, + agent_name: String, +} + +impl Drop for NixlRegistered { + fn drop(&mut self) { + // Explicitly drop the registration handle first + drop(self.handle.take()); + // Storage drops naturally after + } +} + +impl fmt::Debug for NixlRegistered { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("NixlRegistered") + .field("storage", &self.storage) + .field("agent_name", &self.agent_name) + .field("handle", &self.handle.is_some()) + .finish() + } +} + +impl MemoryRegion for NixlRegistered { + fn addr(&self) -> usize { + self.storage.addr() + } + + fn size(&self) -> usize { + self.storage.size() + } + + fn storage_kind(&self) -> StorageKind { + self.storage.storage_kind() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn nixl_descriptor(&self) -> Option { + Some(self.descriptor()) + } +} + +impl RegisteredView for NixlRegistered { + fn agent_name(&self) -> &str { + &self.agent_name + } + + fn descriptor(&self) -> NixlDescriptor { + let (ptr, size, mem_type, device_id) = self.storage.nixl_params(); + NixlDescriptor { + addr: ptr as u64, + size, + mem_type, + device_id, + } + } +} + +impl NixlRegistered { + /// Get a reference to the underlying storage. + pub fn storage(&self) -> &S { + &self.storage + } + + /// Get a mutable reference to the underlying storage. + pub fn storage_mut(&mut self) -> &mut S { + &mut self.storage + } + + /// Check if the registration handle is still valid. + pub fn is_registered(&self) -> bool { + self.handle.is_some() + } + + /// Consume this wrapper and return the underlying storage. + /// + /// This will deregister the storage from NIXL. + pub fn into_storage(mut self) -> S { + // Manually drop the handle first + self.handle = None; + // Now we can move out the storage + // We need to use mem::forget to prevent Drop from running + let storage = std::mem::replace(&mut self.storage, unsafe { std::mem::zeroed() }); + std::mem::forget(self); + storage + } +} + +/// Register storage with a NIXL agent. +/// +/// This consumes the storage and returns a `NixlRegistered` wrapper that +/// manages the registration lifetime. The registration handle will be +/// automatically dropped when the wrapper is dropped, ensuring proper +/// cleanup order. +/// +/// # Arguments +/// * `storage` - The storage to register (consumed) +/// * `agent` - The NIXL agent to register with +/// * `opt` - Optional arguments for registration +/// +/// # Returns +/// A `NixlRegistered` wrapper containing the storage and registration handle. +pub fn register_with_nixl( + storage: S, + agent: &NixlAgent, + opt: Option<&OptArgs>, +) -> std::result::Result, S> +where + S: MemoryRegion + NixlCompatible, +{ + // Get NIXL parameters + let (ptr, size, mem_type, device_id) = storage.nixl_params(); + + // Create a NIXL descriptor for registration + let descriptor = NixlDescriptor { + addr: ptr as u64, + size, + mem_type, + device_id, + }; + + match agent.register_memory(&descriptor, opt) { + Ok(handle) => Ok(NixlRegistered { + storage, + handle: Some(handle), + agent_name: agent.name().to_string(), + }), + Err(_) => Err(storage), + } +} diff --git a/lib/llm/src/block_manager/v2/memory/system.rs b/lib/llm/src/block_manager/v2/memory/system.rs new file mode 100644 index 0000000000..f51abeb6bf --- /dev/null +++ b/lib/llm/src/block_manager/v2/memory/system.rs @@ -0,0 +1,131 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +//! System memory storage backed by malloc. + +use super::{MemoryRegion, Result, StorageError, StorageKind, actions}; +use std::any::Any; +use std::ptr::NonNull; + +use nix::libc; + +/// System memory allocated via malloc. +#[derive(Debug)] +pub struct SystemStorage { + ptr: NonNull, + len: usize, +} + +unsafe impl Send for SystemStorage {} +unsafe impl Sync for SystemStorage {} + +impl SystemStorage { + /// Allocate new system memory of the given size. + pub fn new(len: usize) -> Result { + if len == 0 { + return Err(StorageError::AllocationFailed( + "zero-sized allocations are not supported".into(), + )); + } + + let mut ptr: *mut libc::c_void = std::ptr::null_mut(); + + // We need 4KB alignment here for NIXL disk transfers to work. + // The O_DIRECT flag is required for GDS. + // However, a limitation of this flag is that all operations involving disk + // (both read and write) must be page-aligned. + // Pinned memory is already page-aligned, so we only need to align system memory. + // TODO(jthomson04): Is page size always 4KB? + + // SAFETY: malloc returns suitably aligned memory or null on failure. + let result = unsafe { libc::posix_memalign(&mut ptr, 4096, len) }; + if result != 0 { + return Err(StorageError::AllocationFailed(format!( + "posix_memalign failed for size {}", + len + ))); + } + let ptr = NonNull::new(ptr as *mut u8).ok_or_else(|| { + StorageError::AllocationFailed(format!("malloc failed for size {}", len)) + })?; + + // Zero-initialize the memory + unsafe { + std::ptr::write_bytes(ptr.as_ptr(), 0, len); + } + + Ok(Self { ptr, len }) + } + + /// Get a pointer to the underlying memory. + /// + /// # Safety + /// The caller must ensure the pointer is not used after this storage is dropped. + pub unsafe fn as_ptr(&self) -> *const u8 { + self.ptr.as_ptr() + } + + /// Get a mutable pointer to the underlying memory. + /// + /// # Safety + /// The caller must ensure the pointer is not used after this storage is dropped + /// and that there are no other references to this memory. + pub unsafe fn as_mut_ptr(&mut self) -> *mut u8 { + self.ptr.as_ptr() + } +} + +impl Drop for SystemStorage { + fn drop(&mut self) { + // SAFETY: pointer was allocated by malloc. 
+ unsafe { + libc::free(self.ptr.as_ptr() as *mut libc::c_void); + } + } +} + +impl MemoryRegion for SystemStorage { + fn addr(&self) -> usize { + self.ptr.as_ptr() as usize + } + + fn size(&self) -> usize { + self.len + } + + fn storage_kind(&self) -> StorageKind { + StorageKind::System + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +// Support for NIXL registration +impl super::registered::NixlCompatible for SystemStorage { + fn nixl_params(&self) -> (*const u8, usize, nixl_sys::MemType, u64) { + (self.ptr.as_ptr(), self.len, nixl_sys::MemType::Dram, 0) + } +} + +impl actions::Memset for SystemStorage { + fn memset(&mut self, value: u8, offset: usize, size: usize) -> Result<()> { + if offset + size > self.len { + return Err(StorageError::OperationFailed( + "memset: offset + size > storage size".into(), + )); + } + unsafe { + let ptr = self.ptr.as_ptr().add(offset); + std::ptr::write_bytes(ptr, value, size); + } + Ok(()) + } +} + +impl actions::Slice for SystemStorage { + fn as_slice(&self) -> Result<&[u8]> { + Ok(unsafe { std::slice::from_raw_parts(self.ptr.as_ptr(), self.len) }) + } +} diff --git a/lib/llm/src/block_manager/v2/memory/tests.rs b/lib/llm/src/block_manager/v2/memory/tests.rs new file mode 100644 index 0000000000..f354e72124 --- /dev/null +++ b/lib/llm/src/block_manager/v2/memory/tests.rs @@ -0,0 +1,129 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Tests for the storage-next module. + +use super::*; + +#[test] +fn test_system_storage() { + let storage = SystemStorage::new(1024).unwrap(); + assert_eq!(storage.size(), 1024); + assert_eq!(storage.storage_kind(), StorageKind::System); + assert!(storage.addr() != 0); + + // Test that we can create multiple allocations + let storage2 = SystemStorage::new(2048).unwrap(); + assert_eq!(storage2.size(), 2048); + assert_ne!(storage.addr(), storage2.addr()); +} + +#[test] +fn test_system_storage_zero_size() { + let result = SystemStorage::new(0); + assert!(result.is_err()); + assert!(matches!( + result.unwrap_err(), + StorageError::AllocationFailed(_) + )); +} + +#[test] +fn test_disk_storage_temp() { + let storage = DiskStorage::new(4096).unwrap(); + assert_eq!(storage.size(), 4096); + assert!(matches!(storage.storage_kind(), StorageKind::Disk(_))); + // Disk storage is file-backed, so addr() returns 0 (no memory address) + assert_eq!(storage.addr(), 0); + assert!(storage.path().exists()); +} + +#[test] +fn test_disk_storage_at_path() { + let temp_dir = tempfile::tempdir().unwrap(); + let path = temp_dir.path().join("test.bin"); + + let storage = DiskStorage::new_at(&path, 8192).unwrap(); + assert_eq!(storage.size(), 8192); + assert!(matches!(storage.storage_kind(), StorageKind::Disk(_))); + assert!(path.exists()); +} + +#[test] +fn test_type_erasure() { + let storage = SystemStorage::new(1024).unwrap(); + let erased: OwnedMemoryRegion = erase_storage(storage); + + assert_eq!(erased.size(), 1024); + assert_eq!(erased.storage_kind(), StorageKind::System); +} + +#[test] +fn test_memory_descriptor() { + let desc = MemoryDescriptor::new(0x1000, 4096); + assert_eq!(desc.addr, 0x1000); + assert_eq!(desc.size, 4096); +} + +#[cfg(feature = "testing-cuda")] +mod cuda_tests { + use super::*; + + #[test] + fn test_pinned_storage() { + let storage = PinnedStorage::new(2048).unwrap(); + assert_eq!(storage.size(), 2048); + assert_eq!(storage.storage_kind(), StorageKind::Pinned); + assert!(storage.addr() != 0); + } + + #[test] + fn 
test_pinned_storage_zero_size() { + let storage = PinnedStorage::new(0); + assert!(storage.is_err()); + assert!(matches!( + storage.unwrap_err(), + StorageError::AllocationFailed(_) + )); + } + + #[test] + fn test_device_storage() { + let storage = DeviceStorage::new(4096, 0).unwrap(); + assert_eq!(storage.size(), 4096); + assert_eq!(storage.storage_kind(), StorageKind::Device(0)); + assert!(storage.addr() != 0); + assert_eq!(storage.device_id(), 0); + } + + #[test] + fn test_device_storage_zero_size() { + let result = DeviceStorage::new(0, 0); + assert!(result.is_err()); + assert!(matches!( + result.unwrap_err(), + StorageError::AllocationFailed(_) + )); + } +} + +// Tests for NIXL registration would require a real NIXL agent, +// so we'll skip those for now. In practice, you'd mock the agent +// or use integration tests. +#[cfg(feature = "testing-nixl")] +mod nixl_tests { + use super::super::registered::register_with_nixl; + use super::*; + use nixl_sys::Agent as NixlAgent; + + // These tests would require a mock NIXL agent or real NIXL setup + // Placeholder for now + + #[test] + fn test_nixl_registration() { + let pinned = PinnedStorage::new(2048).unwrap(); + let agent = NixlAgent::new("test_agent").unwrap(); + let registered = register_with_nixl(pinned, &agent, None).unwrap(); + assert_eq!(registered.agent_name(), "test_agent"); + } +} diff --git a/lib/llm/src/block_manager/v2/memory/torch.rs b/lib/llm/src/block_manager/v2/memory/torch.rs new file mode 100644 index 0000000000..c60f5e2b31 --- /dev/null +++ b/lib/llm/src/block_manager/v2/memory/torch.rs @@ -0,0 +1,29 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum TorchDevice { + Cuda(usize), + Other(String), +} + +impl TorchDevice { + pub fn is_cuda(&self) -> bool { + matches!(self, TorchDevice::Cuda(_)) + } + + pub fn cuda_device_index(&self) -> Option { + match self { + TorchDevice::Cuda(index) => Some(*index), + TorchDevice::Other(_) => None, + } + } +} + +pub trait TorchTensor: std::fmt::Debug + Send + Sync { + fn device(&self) -> TorchDevice; + fn data_ptr(&self) -> u64; + fn size_bytes(&self) -> usize; + fn shape(&self) -> Vec; + fn stride(&self) -> Vec; +} diff --git a/lib/llm/src/block_manager/v2/physical/layout/builder.rs b/lib/llm/src/block_manager/v2/physical/layout/builder.rs new file mode 100644 index 0000000000..80b9b7c419 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/layout/builder.rs @@ -0,0 +1,864 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Typed builder for constructing [`PhysicalLayout`](crate::block_manager::v2::layout::PhysicalLayout) +//! instances with strongly-typed configuration, layout selection, and memory provisioning. +//! +//! The builder enforces the three steps required to materialize a physical layout: +//! 1. Provide a [`LayoutConfig`] +//! 2. Select a concrete layout (fully contiguous or layer separate) +//! 3. Specify memory backing (either by allocating or by supplying existing regions) +//! +//! NIXL registration is always enabled. Callers must provide a [`nixl_sys::Agent`], and any memory +//! supplied to the builder must implement [`NixlCompatible`]. 
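+//!
+//! A minimal usage sketch (assumes an already-constructed [`NixlAgent`] named `agent` and a
+//! valid [`LayoutConfig`] named `config`; the other `allocate_*`/`with_*_regions` memory
+//! options slot into the same chain):
+//!
+//! ```ignore
+//! let physical = PhysicalLayoutBuilder::new(agent)
+//!     .with_config(config)
+//!     .fully_contiguous()
+//!     .allocate_system()
+//!     .build()?;
+//! ```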
+ +use crate::block_manager::v2::physical::layout::physical::PhysicalLayout; + +use super::{ + BlockDimension, FullyContiguousLayout, LayerSeparateLayout, Layout, LayoutConfig, MemoryRegion, + physical::NixlMetadata, +}; + +use crate::block_manager::v2::memory::{ + DiskStorage, NixlCompatible, NixlDescriptor, OffsetMemoryRegion, OwnedMemoryRegion, + RegisteredView, StorageKind, SystemStorage, register_with_nixl, +}; +use anyhow::{Result, anyhow, bail}; +#[allow(unused_imports)] +use nixl_sys::Agent as RawNixlAgent; +use nixl_sys::MemType; +use std::marker::PhantomData; +use std::path::PathBuf; +use std::sync::Arc; + +use crate::block_manager::v2::memory::{DeviceStorage, PinnedStorage}; + +use crate::block_manager::v2::physical::transfer::nixl_agent::NixlAgent; + +const REGION_ALIGNMENT: usize = 512; + +/// Layout selection exposed by the builder. +#[derive(Debug, Clone)] +pub enum LayoutKind { + FullyContiguous, + LayerSeparate { block_dim: BlockDimension }, +} + +/// Allocation strategies for builder-managed memory. +#[derive(Debug, Clone)] +enum AllocationKind { + System, + Pinned { numa_aware: bool }, + + Device { device_id: u32 }, + Disk { path: Option }, +} + +/// Memory provisioning plan (either provided regions or an allocation request). +#[derive(Debug, Clone)] +enum MemoryPlan { + Provided(Vec), + Allocate(AllocationKind), +} + +/// Memory tenancy captured during the build process. +#[derive(Debug, Clone)] +struct MemoryEntry { + region: OwnedMemoryRegion, + descriptor: Option, +} + +impl MemoryEntry { + fn new(region: OwnedMemoryRegion, descriptor: Option) -> Self { + Self { region, descriptor } + } + + fn ensure_registered(mut self) -> Result { + if self.descriptor.is_none() { + self.descriptor = self.region.nixl_descriptor(); + } + + #[cfg(not(test))] + { + // In production, require NIXL registration + if self.descriptor.is_none() { + bail!( + "memory region {} is not registered with NIXL", + self.region.addr() + ); + } + } + + // In test builds, allow None descriptors for local-only layouts + Ok(self) + } +} + +/// Marker types for the builder state machine. +pub struct NoConfig; +pub struct HasConfig; + +pub struct NoLayout; +pub struct HasLayout; + +pub struct NoMemory; +pub struct HasMemory; + +/// Default builder state type alias. +pub type PhysicalLayoutBuilderDefault = PhysicalLayoutBuilder; + +/// Typed builder enforcing configuration, layout selection, and memory provisioning phases. +pub struct PhysicalLayoutBuilder { + agent: NixlAgent, + config: Option, + layout_kind: Option, + memory_plan: Option, + _config: PhantomData, + _layout: PhantomData, + _memory: PhantomData, +} + +impl PhysicalLayoutBuilder { + /// Create a new builder in its initial state. + pub fn new(agent: NixlAgent) -> Self { + Self { + agent, + config: None, + layout_kind: None, + memory_plan: None, + _config: PhantomData, + _layout: PhantomData, + _memory: PhantomData, + } + } +} + +impl PhysicalLayoutBuilder { + fn into_parts( + self, + ) -> ( + NixlAgent, + Option, + Option, + Option, + ) { + (self.agent, self.config, self.layout_kind, self.memory_plan) + } + + fn from_parts( + agent: NixlAgent, + config: Option, + layout_kind: Option, + memory_plan: Option, + ) -> PhysicalLayoutBuilder { + PhysicalLayoutBuilder { + agent, + config, + layout_kind, + memory_plan, + _config: PhantomData, + _layout: PhantomData, + _memory: PhantomData, + } + } +} + +impl PhysicalLayoutBuilder { + /// Attach the [`LayoutConfig`] required to size the layout and allocations. 
+ pub fn with_config(self, config: LayoutConfig) -> PhysicalLayoutBuilder { + let (agent, _config, layout_kind, memory_plan) = self.into_parts(); + PhysicalLayoutBuilder::::from_parts( + agent, + Some(config), + layout_kind, + memory_plan, + ) + } +} + +impl PhysicalLayoutBuilder { + /// Select the fully contiguous layout variant. + pub fn fully_contiguous(self) -> PhysicalLayoutBuilder { + let (agent, config, _layout, memory_plan) = self.into_parts(); + PhysicalLayoutBuilder::::from_parts( + agent, + config, + Some(LayoutKind::FullyContiguous), + memory_plan, + ) + } + + /// Select the layer-separate layout variant with the provided block dimension ordering. + pub fn layer_separate( + self, + block_dim: BlockDimension, + ) -> PhysicalLayoutBuilder { + let (agent, config, _layout, memory_plan) = self.into_parts(); + PhysicalLayoutBuilder::::from_parts( + agent, + config, + Some(LayoutKind::LayerSeparate { block_dim }), + memory_plan, + ) + } +} + +impl PhysicalLayoutBuilder { + fn set_memory_plan( + self, + plan: MemoryPlan, + ) -> PhysicalLayoutBuilder { + let (agent, config, layout_kind, _memory) = self.into_parts(); + PhysicalLayoutBuilder::::from_parts( + agent, + config, + layout_kind, + Some(plan), + ) + } + + pub fn allocate_system(self) -> PhysicalLayoutBuilder { + self.set_memory_plan(MemoryPlan::Allocate(AllocationKind::System)) + } + + /// Allocate pinned (page-locked) host memory. + pub fn allocate_pinned( + self, + numa_aware: bool, + ) -> PhysicalLayoutBuilder { + self.set_memory_plan(MemoryPlan::Allocate(AllocationKind::Pinned { numa_aware })) + } + + /// Allocate device memory on the specified CUDA device (or the context device if `None`). + pub fn allocate_device( + self, + device_id: u32, + ) -> PhysicalLayoutBuilder { + self.set_memory_plan(MemoryPlan::Allocate(AllocationKind::Device { device_id })) + } + + /// Allocate disk-backed storage. When `path` is `None`, a temporary file is used. + pub fn allocate_disk( + self, + path: Option, + ) -> PhysicalLayoutBuilder { + self.set_memory_plan(MemoryPlan::Allocate(AllocationKind::Disk { path })) + } + + /// Use existing NIXL-compatible memory regions supplied by the caller. + pub fn with_memory_regions( + self, + regions: Vec, + ) -> Result> + where + S: MemoryRegion + NixlCompatible + 'static, + { + let (agent, config, layout_kind, _memory) = self.into_parts(); + let entries = register_existing_regions(&agent, regions)?; + Ok( + PhysicalLayoutBuilder::::from_parts( + agent, + config, + layout_kind, + Some(MemoryPlan::Provided(entries)), + ), + ) + } + + /// Use pre-registered memory regions (already wrapped in `Arc`). + /// + /// All regions must already expose a NIXL descriptor. + pub fn with_registered_regions( + self, + regions: Vec, + ) -> Result> { + let entries = regions + .into_iter() + .enumerate() + .map(|(index, region)| { + let descriptor = region.nixl_descriptor().ok_or_else(|| { + anyhow!( + "provided memory region at index {} is not NIXL registered", + index + ) + })?; + Ok(MemoryEntry::new(region, Some(descriptor))) + }) + .collect::>>()?; + + let (agent, config, layout_kind, _memory) = self.into_parts(); + Ok( + PhysicalLayoutBuilder::::from_parts( + agent, + config, + layout_kind, + Some(MemoryPlan::Provided(entries)), + ), + ) + } +} + +impl PhysicalLayoutBuilder { + /// Finalize the builder, constructing the [`PhysicalLayout`]. 
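+    ///
+    /// Internally this computes the required allocation sizes, resolves the memory plan
+    /// (allocating or validating the provided regions), checks region sizes, derives the
+    /// storage kind and NIXL metadata, and then constructs the layout.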
+ pub fn build(self) -> Result { + let (agent, config, layout_kind, memory_plan) = self.into_parts(); + + let config = config.ok_or_else(|| anyhow!("layout config missing despite type state"))?; + let layout_kind = + layout_kind.ok_or_else(|| anyhow!("layout kind missing despite type state"))?; + let memory_plan = + memory_plan.ok_or_else(|| anyhow!("memory plan missing despite type state"))?; + + let required_sizes = compute_allocation_sizes(&config, &layout_kind)?; + let entries = resolve_memory_plan(&agent, memory_plan, &required_sizes)?; + + validate_memory_sizes(&entries, &required_sizes)?; + let kind = derive_storage_kind(&entries)?; + let metadata = derive_nixl_metadata(&agent, &entries)?; + + let layout: Arc = match layout_kind { + LayoutKind::FullyContiguous => { + let entry = entries.first().ok_or_else(|| { + anyhow!("fully contiguous layout requires a single memory region") + })?; + let layout = FullyContiguousLayout::new(config.clone(), Arc::clone(&entry.region))?; + Arc::new(layout) + } + LayoutKind::LayerSeparate { block_dim } => { + let regions: Vec = entries + .iter() + .map(|entry| Arc::clone(&entry.region)) + .collect(); + let layout = LayerSeparateLayout::new(config.clone(), regions, block_dim)?; + Arc::new(layout) + } + }; + + Ok(PhysicalLayout::new_local(layout, kind, metadata)) + } +} + +fn register_existing_regions(agent: &NixlAgent, regions: Vec) -> Result> +where + S: MemoryRegion + NixlCompatible + 'static, +{ + regions + .into_iter() + .map(|region| register_storage(region, agent)) + .collect() +} + +fn resolve_memory_plan( + agent: &NixlAgent, + plan: MemoryPlan, + sizes: &[usize], +) -> Result> { + match plan { + MemoryPlan::Provided(entries) => { + if entries.len() != sizes.len() { + bail!( + "provided memory count ({}) does not match required allocations ({})", + entries.len(), + sizes.len() + ); + } + entries + .into_iter() + .map(MemoryEntry::ensure_registered) + .collect() + } + MemoryPlan::Allocate(strategy) => allocate_regions(agent, strategy, sizes), + } +} + +fn allocate_regions( + agent: &NixlAgent, + strategy: AllocationKind, + sizes: &[usize], +) -> Result> { + if sizes.is_empty() { + return Ok(Vec::new()); + } + + let reserve_size = total_allocation_size(sizes, REGION_ALIGNMENT)?; + + let base_entry = match strategy { + AllocationKind::System => allocate_system_entry(reserve_size, agent)?, + AllocationKind::Pinned { numa_aware } => { + allocate_pinned_entry(reserve_size, agent, numa_aware)? + } + + AllocationKind::Device { device_id } => { + allocate_device_entry(reserve_size, agent, device_id)? 
+ } + AllocationKind::Disk { path } => allocate_disk_entry(reserve_size, agent, path)?, + }; + + create_offset_entries(base_entry, sizes, REGION_ALIGNMENT) +} + +fn allocate_system_entry(size: usize, agent: &NixlAgent) -> Result { + let storage = SystemStorage::new(size) + .map_err(|e| anyhow!("failed to allocate system memory ({size} bytes): {e}"))?; + register_storage(storage, agent) +} + +fn allocate_pinned_entry(size: usize, agent: &NixlAgent, _numa_aware: bool) -> Result { + let storage = PinnedStorage::new(size) + .map_err(|e| anyhow!("failed to allocate pinned memory ({size} bytes): {e}"))?; + register_storage(storage, agent) +} + +fn allocate_device_entry(size: usize, agent: &NixlAgent, device_id: u32) -> Result { + let storage = DeviceStorage::new(size, device_id).map_err(|e| { + anyhow!("failed to allocate device memory ({size} bytes) on device {device_id}: {e}") + })?; + register_storage(storage, agent) +} + +fn allocate_disk_entry( + size: usize, + agent: &NixlAgent, + path: Option, +) -> Result { + let storage = if let Some(path) = path { + DiskStorage::new_at(&path, size) + .map_err(|e| anyhow!("failed to allocate disk storage at {}: {e}", path.display()))? + } else { + DiskStorage::new(size).map_err(|e| anyhow!("failed to allocate disk storage: {e}"))? + }; + register_storage(storage, agent) +} + +// When testing, we allow unregistered layouts to help with test time. NIXL + UCX is very expensive to setup +// so we only use that backend when it's needed. +#[cfg(test)] +fn register_storage(storage: S, agent: &NixlAgent) -> Result +where + S: MemoryRegion + NixlCompatible + 'static, +{ + let storage_kind = storage.storage_kind(); + + // Determine if registration is needed based on storage type and available backends + let should_register = match storage_kind { + StorageKind::System | StorageKind::Pinned => { + // System/Pinned memory needs UCX for remote transfers + agent.has_backend("UCX") || agent.has_backend("POSIX") + } + StorageKind::Device(_) => { + // Device memory needs UCX for remote transfers OR GDS for direct disk transfers + agent.has_backend("UCX") || agent.has_backend("GDS_MT") + } + StorageKind::Disk(_) => { + // Disk storage needs POSIX for regular I/O OR GDS for GPU direct I/O + agent.has_backend("POSIX") || agent.has_backend("GDS_MT") + } + }; + + if !should_register { + // Skip registration - only local non-NIXL transfers will be used + let region: OwnedMemoryRegion = Arc::new(storage); + return Ok(MemoryEntry::new(region, None)); + } + + // Register with NIXL using the appropriate backend + match register_with_nixl(storage, agent.raw_agent(), None) { + Ok(registered) => { + let descriptor = registered.descriptor(); + let region: OwnedMemoryRegion = Arc::new(registered); + Ok(MemoryEntry::new(region, Some(descriptor))) + } + Err(_storage) => bail!("failed to register memory with NIXL agent {}", agent.name()), + } +} + +// Production builds always register +#[cfg(not(test))] +fn register_storage(storage: S, agent: &NixlAgent) -> Result +where + S: MemoryRegion + NixlCompatible + 'static, +{ + // Production builds always register for safety + match register_with_nixl(storage, agent.raw_agent(), None) { + Ok(registered) => { + let descriptor = registered.descriptor(); + let region: OwnedMemoryRegion = Arc::new(registered); + Ok(MemoryEntry::new(region, Some(descriptor))) + } + Err(_storage) => bail!("failed to register memory with NIXL agent {}", agent.name()), + } +} + +fn create_offset_entries( + base_entry: MemoryEntry, + sizes: &[usize], + alignment: usize, 
+) -> Result> { + if sizes.is_empty() { + return Ok(Vec::new()); + } + + let base_region = base_entry.region; + let base_descriptor = base_entry.descriptor; + let base_addr = base_region.addr(); + let base_len = base_region.size(); + + let mut entries = Vec::with_capacity(sizes.len()); + let mut offset = 0usize; + + for (index, &size) in sizes.iter().enumerate() { + let region = if index == 0 && offset == 0 && size == base_len && sizes.len() == 1 { + Arc::clone(&base_region) + } else { + let view = OffsetMemoryRegion::new(Arc::clone(&base_region), offset, size) + .map_err(|e| anyhow!("failed to create offset region: {e}"))?; + Arc::new(view) as OwnedMemoryRegion + }; + + let descriptor = base_descriptor + .as_ref() + .map(|descriptor| derive_descriptor(descriptor, offset, size)) + .transpose()?; + + entries.push(MemoryEntry::new(region, descriptor)); + + offset = offset + .checked_add(size) + .ok_or_else(|| anyhow!("offset computation overflow"))?; + + if index + 1 < sizes.len() && alignment > 1 { + let current_addr = base_addr + .checked_add(offset) + .ok_or_else(|| anyhow!("address computation overflow"))?; + let aligned_addr = align_up(current_addr, alignment)?; + offset = aligned_addr + .checked_sub(base_addr) + .ok_or_else(|| anyhow!("alignment subtraction overflow"))?; + } + } + + if offset > base_len { + bail!( + "allocated base region ({base_len} bytes) is insufficient for {offset} bytes with padding" + ); + } + + Ok(entries) +} + +fn derive_descriptor(base: &NixlDescriptor, offset: usize, size: usize) -> Result { + let mut descriptor = base.clone(); + descriptor.size = size; + if descriptor.mem_type != MemType::File { + descriptor.addr = descriptor + .addr + .checked_add(offset as u64) + .ok_or_else(|| anyhow!("descriptor address overflow"))?; + } + Ok(descriptor) +} + +fn compute_allocation_sizes(config: &LayoutConfig, kind: &LayoutKind) -> Result> { + match kind { + LayoutKind::FullyContiguous => { + let factors = [ + config.num_blocks, + config.num_layers, + config.outer_dim, + config.page_size, + config.inner_dim, + config.dtype_width_bytes, + ]; + let total = mul_chain(&factors)?; + Ok(vec![total]) + } + LayoutKind::LayerSeparate { .. 
} => { + let factors = [ + config.num_blocks, + config.outer_dim, + config.page_size, + config.inner_dim, + config.dtype_width_bytes, + ]; + let per_layer = mul_chain(&factors)?; + Ok(vec![per_layer; config.num_layers]) + } + } +} + +fn mul_chain(factors: &[usize]) -> Result { + factors.iter().try_fold(1usize, |acc, &value| { + acc.checked_mul(value) + .ok_or_else(|| anyhow!("allocation size overflow during layout computation")) + }) +} + +fn total_allocation_size(sizes: &[usize], alignment: usize) -> Result { + if sizes.is_empty() { + return Ok(0); + } + + let mut total = *sizes + .first() + .ok_or_else(|| anyhow!("allocation requires at least one region"))?; + + for size in sizes.iter().skip(1) { + total = total + .checked_add(*size) + .ok_or_else(|| anyhow!("allocation size overflow during aggregation"))?; + if alignment > 1 { + total = total + .checked_add(alignment - 1) + .ok_or_else(|| anyhow!("allocation alignment padding overflow"))?; + } + } + + Ok(total) +} + +fn align_up(value: usize, alignment: usize) -> Result { + if alignment <= 1 { + return Ok(value); + } + let remainder = value % alignment; + if remainder == 0 { + Ok(value) + } else { + value + .checked_add(alignment - remainder) + .ok_or_else(|| anyhow!("alignment overflow")) + } +} + +fn validate_memory_sizes(entries: &[MemoryEntry], required: &[usize]) -> Result<()> { + for (entry, &required_size) in entries.iter().zip(required.iter()) { + if entry.region.size() < required_size { + bail!( + "memory region too small: required {} bytes, available {} bytes", + required_size, + entry.region.size() + ); + } + } + Ok(()) +} + +fn derive_storage_kind(entries: &[MemoryEntry]) -> Result { + let first = entries + .first() + .ok_or_else(|| anyhow!("no memory regions available to determine storage location"))?; + let first_kind = first.region.storage_kind(); + + for entry in entries.iter().skip(1) { + let kind = entry.region.storage_kind(); + if kind != first_kind { + bail!( + "all memory regions must share the same storage location (found {:?} and {:?})", + first_kind, + kind + ); + } + } + + Ok(first_kind) +} + +fn derive_nixl_metadata(agent: &NixlAgent, entries: &[MemoryEntry]) -> Result { + // Try to find a descriptor from entries + let descriptor_opt = entries.iter().find_map(|entry| entry.descriptor.clone()); + + #[cfg(test)] + { + // In test builds, allow layouts without NIXL registration + // Use defaults for local-only transfers + if let Some(descriptor) = descriptor_opt { + Ok(NixlMetadata::new( + agent.name().to_string(), + descriptor.mem_type, + descriptor.device_id, + )) + } else { + // Use placeholder metadata for unregistered layouts + let first_entry = entries + .first() + .ok_or_else(|| anyhow!("no memory entries"))?; + let storage_kind = first_entry.region.storage_kind(); + let (mem_type, device_id) = match storage_kind { + StorageKind::System => (MemType::Dram, 0), + StorageKind::Pinned => (MemType::Dram, 0), + StorageKind::Device(id) => (MemType::Vram, id as u64), + StorageKind::Disk(id) => (MemType::File, id), + }; + Ok(NixlMetadata::new( + agent.name().to_string(), + mem_type, + device_id, + )) + } + } + + #[cfg(not(test))] + { + let descriptor = descriptor_opt + .ok_or_else(|| anyhow!("memory entries missing NIXL registration metadata"))?; + Ok(NixlMetadata::new( + agent.name().to_string(), + descriptor.mem_type, + descriptor.device_id, + )) + } +} + +#[cfg(all(test, feature = "testing-nixl"))] +mod tests { + use super::super::{BlockDimension, LayoutConfig}; + use super::*; + + use 
crate::block_manager::v2::memory::{MemoryRegion, OwnedMemoryRegion, StorageKind}; + use nixl_sys::MemType; + use std::any::Any; + use std::sync::Arc; + + #[derive(Debug)] + struct TestRegisteredRegion { + data: Vec, + kind: StorageKind, + descriptor: NixlDescriptor, + } + + impl TestRegisteredRegion { + fn new(size: usize, kind: StorageKind, mem_type: MemType, device_id: u64) -> Self { + let data = vec![0u8; size]; + let addr = data.as_ptr() as u64; + let descriptor = NixlDescriptor { + addr, + size, + mem_type, + device_id, + }; + Self { + data, + kind, + descriptor, + } + } + } + + impl MemoryRegion for TestRegisteredRegion { + fn addr(&self) -> usize { + self.data.as_ptr() as usize + } + + fn size(&self) -> usize { + self.data.len() + } + + fn storage_kind(&self) -> StorageKind { + self.kind + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn nixl_descriptor(&self) -> Option { + Some(self.descriptor.clone()) + } + } + + fn make_layout_config() -> LayoutConfig { + LayoutConfig::builder() + .num_blocks(2) + .num_layers(3) + .outer_dim(2) + .page_size(4) + .inner_dim(8) + .dtype_width_bytes(2) + .build() + .unwrap() + } + + fn fully_contiguous_size(cfg: &LayoutConfig) -> usize { + cfg.num_blocks + * cfg.num_layers + * cfg.outer_dim + * cfg.page_size + * cfg.inner_dim + * cfg.dtype_width_bytes + } + + fn per_layer_size(cfg: &LayoutConfig) -> usize { + cfg.num_blocks * cfg.outer_dim * cfg.page_size * cfg.inner_dim * cfg.dtype_width_bytes + } + + #[test] + fn builds_fully_contiguous_from_registered_regions() { + let agent = NixlAgent::require_backends("builder-test-fully", &[]) + .expect("failed to create wrapped agent"); + let cfg = make_layout_config(); + + let required = fully_contiguous_size(&cfg); + let region = Arc::new(TestRegisteredRegion::new( + required, + StorageKind::System, + MemType::Dram, + 0, + )) as OwnedMemoryRegion; + + let physical = PhysicalLayoutBuilder::new(agent.clone()) + .with_config(cfg.clone()) + .fully_contiguous() + .with_registered_regions(vec![region]) + .expect("registered regions accepted") + .build() + .expect("builder should succeed"); + + assert_eq!(physical.location(), StorageKind::System); + assert!(physical.layout().as_ref().is_fully_contiguous()); + assert_eq!(physical.layout().config().num_blocks, cfg.num_blocks); + assert_eq!(physical.layout().config().num_layers, cfg.num_layers); + + let metadata = physical.nixl_metadata(); + assert_eq!(metadata.agent_name(), agent.name()); + assert_eq!(metadata.mem_type(), MemType::Dram); + } + + #[test] + fn builds_layer_separate_from_registered_regions() { + let agent = NixlAgent::require_backends("builder-test-layer", &[]) + .expect("failed to create wrapped agent"); + let cfg = make_layout_config(); + + let per_layer = per_layer_size(&cfg); + let regions: Vec = (0..cfg.num_layers) + .map(|_| { + Arc::new(TestRegisteredRegion::new( + per_layer, + StorageKind::System, + MemType::Dram, + 0, + )) as OwnedMemoryRegion + }) + .collect(); + + let physical = PhysicalLayoutBuilder::new(agent.clone()) + .with_config(cfg.clone()) + .layer_separate(BlockDimension::BlockIsFirstDim) + .with_registered_regions(regions) + .expect("registered layer regions accepted") + .build() + .expect("builder should succeed"); + + assert_eq!(physical.location(), StorageKind::System); + assert!(!physical.layout().as_ref().is_fully_contiguous()); + assert_eq!(physical.layout().config().num_layers, cfg.num_layers); + + let metadata = physical.nixl_metadata(); + assert_eq!(metadata.agent_name(), agent.name()); + 
assert_eq!(metadata.mem_type(), MemType::Dram); + } +} + +// fn context_device_id(ctx: &TransferContext) -> u32 { +// ctx.stream().context().ordinal() as u32 +// } diff --git a/lib/llm/src/block_manager/v2/physical/layout/config.rs b/lib/llm/src/block_manager/v2/physical/layout/config.rs new file mode 100644 index 0000000000..e4900478db --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/layout/config.rs @@ -0,0 +1,101 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +use derive_builder::Builder; +use serde::{Deserialize, Serialize}; +use validator::{Validate, ValidationError}; + +use super::InnerShape; + +/// Configuration for block layouts +#[derive(Debug, Clone, Builder, Validate, Serialize, Deserialize, PartialEq, Eq)] +pub struct LayoutConfig { + /// Number of blocks + #[validate(range(min = 1))] + pub num_blocks: usize, + + /// Number of layers + #[validate(range(min = 1))] + pub num_layers: usize, + + /// Number of outer dimensions + #[validate(range(min = 1, max = 2))] + pub outer_dim: usize, + + /// Page size + #[validate(range(min = 1))] + pub page_size: usize, + + /// Inner dimension + #[validate(range(min = 1))] + pub inner_dim: usize, + + /// Alignment + #[validate(custom(function = "validate_power_of_2"))] + #[builder(default = "1")] + pub alignment: usize, + + /// Data type + #[validate(custom(function = "validate_dtype_width_bytes"))] + #[builder(default = "2")] + pub dtype_width_bytes: usize, + + /// Inner shape format (NHD, HND, or Unknown) + #[builder(default = "InnerShape::Unknown")] + pub inner_shape: InnerShape, +} + +impl LayoutConfig { + /// Builder for LayoutConfig + pub fn builder() -> LayoutConfigBuilder { + LayoutConfigBuilder::default() + } + + pub fn required_bytes(&self) -> usize { + self.num_blocks + .saturating_mul(self.num_layers) + .saturating_mul(self.outer_dim) + .saturating_mul(self.page_size) + .saturating_mul(self.inner_dim) + .saturating_mul(self.dtype_width_bytes) + } +} + +/// The first two dimensions of the tensor, `shape[0]` and `shape[1]`, one of those corresponds to the +/// block dimension, while the other corresponds to the outer dimension. +/// +/// The outer dimension is typically: +/// - 1: MLA or K and V stored together, +/// - 2: K and V stored separately, +/// +/// The block dimension tell us the number of blocks. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum BlockDimension { + /// The block dimension is the first dimension of the tensor, `[n_blocks, outer_dim, inner_dim]` + BlockIsFirstDim, + + /// The block dimension is the second dimension of the tensor, `[outer_dim, n_blocks, inner_dim]` + /// This is a replacement for v1's `outer_contiguous` is true. + BlockIsSecondDim, +} + +/// Validation function for Option to check if it's Some(power_of_2). 
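+/// (The builder default of 1 is a power of two and therefore passes.)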
+pub fn validate_power_of_2(alignment: usize) -> Result<(), ValidationError> { + if !alignment.is_power_of_two() { + // Return validation error if alignment is not a power of 2 + return Err(validator::ValidationError::new( + "alignment_must_be_power_of_2", + )); + } + // Passes validation if alignment is a power of 2 + Ok(()) +} + +pub fn validate_dtype_width_bytes(dtype_width_bytes: usize) -> Result<(), ValidationError> { + if !dtype_width_bytes.is_power_of_two() || !(2..=8).contains(&dtype_width_bytes) { + return Err(validator::ValidationError::new( + "dtype_width_bytes_must_be_power_of_two_and_less_than_8_bytes", + )); + } + Ok(()) +} diff --git a/lib/llm/src/block_manager/v2/physical/layout/fully_contiguous.rs b/lib/llm/src/block_manager/v2/physical/layout/fully_contiguous.rs new file mode 100644 index 0000000000..7a438c8a42 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/layout/fully_contiguous.rs @@ -0,0 +1,271 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Fully contiguous layout implementation. +//! +//! This layout stores all blocks in a single contiguous memory allocation +//! with the shape: [num_blocks, num_layers, outer_dim, page_size, inner_dim]. + +use anyhow::{Result, anyhow}; +use std::sync::Arc; +use validator::Validate; + +use super::serialize::{BlockFormat, FullyContiguousDetails, LayoutTypeDetails}; +use super::{Layout, LayoutConfig, MemoryDescriptor, MemoryRegion, OwnedMemoryRegion}; + +/// Fully contiguous layout where all blocks are in a single allocation. +#[derive(Debug)] +pub struct FullyContiguousLayout { + config: LayoutConfig, + /// Base address of the allocation + base_addr: usize, + /// Stride between blocks in bytes + block_stride: usize, + /// Stride between layers in bytes + layer_stride: usize, + /// Stride between outer dimensions in bytes + outer_stride: usize, + /// Size of each memory region (page) in bytes + region_size: usize, + /// Owned memory region backing this layout + memory: Arc, + /// Format of blocks in memory + block_format: BlockFormat, +} + +impl FullyContiguousLayout { + /// Create a new fully contiguous layout. + /// + /// # Arguments + /// * `config` - Layout configuration + /// * `memory` - Owned memory region that backs this layout + /// + /// # Returns + /// A new FullyContiguousLayout instance + pub fn new(config: LayoutConfig, memory: Arc) -> Result { + config.validate()?; + + let base_addr = memory.addr(); + + // Calculate strides + let region_size = config.page_size * config.inner_dim * config.dtype_width_bytes; + let outer_stride = region_size; + let layer_stride = outer_stride * config.outer_dim; + let block_stride = layer_stride * config.num_layers; + + // Validate that the memory region is large enough + let required_size = block_stride * config.num_blocks; + if memory.size() < required_size { + return Err(anyhow!( + "Memory region too small for layout. Required: {} bytes, got: {} bytes", + required_size, + memory.size() + )); + } + + Ok(Self { + config, + base_addr, + block_stride, + layer_stride, + outer_stride, + region_size, + memory, + block_format: BlockFormat::default(), + }) + } + + /// Create a new fully contiguous layout with a specific block format. 
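+    /// (Delegates to [`Self::new`] and then overrides the default [`BlockFormat`].)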
+ /// + /// # Arguments + /// * `config` - Layout configuration + /// * `memory` - Owned memory region that backs this layout + /// * `block_format` - Format of blocks in memory + /// + /// # Returns + /// A new FullyContiguousLayout instance + pub(crate) fn new_with_format( + config: LayoutConfig, + memory: Arc, + block_format: BlockFormat, + ) -> Result { + let mut layout = Self::new(config, memory)?; + layout.block_format = block_format; + Ok(layout) + } + + /// Get the block format. + pub fn block_format(&self) -> BlockFormat { + self.block_format + } + + /// Calculate the address of a specific memory region. + fn calculate_address( + &self, + block_id: usize, + layer_id: usize, + outer_id: usize, + ) -> Result { + if block_id >= self.config.num_blocks { + return Err(anyhow!( + "Block ID {} out of range (max: {})", + block_id, + self.config.num_blocks + )); + } + if layer_id >= self.config.num_layers { + return Err(anyhow!( + "Layer ID {} out of range (max: {})", + layer_id, + self.config.num_layers + )); + } + if outer_id >= self.config.outer_dim { + return Err(anyhow!( + "Outer ID {} out of range (max: {})", + outer_id, + self.config.outer_dim + )); + } + + Ok(self.base_addr + + block_id * self.block_stride + + layer_id * self.layer_stride + + outer_id * self.outer_stride) + } + + /// Get mutable reference to the memory Arc for NIXL registration. + pub fn memory_arc_mut(&mut self) -> &mut Arc { + &mut self.memory + } +} + +impl Layout for FullyContiguousLayout { + fn config(&self) -> &LayoutConfig { + &self.config + } + + fn memory_regions(&self) -> &[OwnedMemoryRegion] { + std::slice::from_ref(&self.memory) + } + + fn memory_region( + &self, + block_id: usize, + layer_id: usize, + outer_id: usize, + ) -> Result { + let addr = self.calculate_address(block_id, layer_id, outer_id)?; + Ok(MemoryDescriptor::new(addr, self.region_size)) + } + + fn required_allocations(&self) -> Vec { + // Single contiguous allocation + vec![self.block_stride * self.config.num_blocks] + } + + fn is_fully_contiguous(&self) -> bool { + true + } + + fn num_blocks(&self) -> usize { + self.config.num_blocks + } + + fn num_layers(&self) -> usize { + self.config.num_layers + } + + fn outer_dim(&self) -> usize { + self.config.outer_dim + } + + fn page_size(&self) -> usize { + self.config.page_size + } + + fn inner_dim(&self) -> usize { + self.config.inner_dim + } + + fn dtype_width_bytes(&self) -> usize { + self.config.dtype_width_bytes + } + + fn serialization_details(&self) -> LayoutTypeDetails { + LayoutTypeDetails::FullyContiguous(FullyContiguousDetails { + block_format: self.block_format, + }) + } +} + +#[cfg(test)] +mod tests { + use super::super::tests::*; + use super::*; + + #[test] + fn test_fully_contiguous_layout_creation() { + let config = LayoutConfig::builder() + .num_blocks(10) + .num_layers(4) + .outer_dim(2) + .page_size(16) + .inner_dim(128) + .dtype_width_bytes(2) + .build() + .unwrap(); + + let required_bytes = config.required_bytes(); + assert_eq!(required_bytes, 10 * 4 * 2 * 16 * 128 * 2); + + let memory = MockMemory::new(0x1000, required_bytes); + + let layout = FullyContiguousLayout::new(config, memory).unwrap(); + assert_eq!(layout.num_blocks(), 10); + assert!(layout.is_fully_contiguous()); + } + + #[test] + fn test_memory_region() { + let config = LayoutConfig::builder() + .num_blocks(2) + .num_layers(2) + .outer_dim(2) + .page_size(16) + .inner_dim(128) + .dtype_width_bytes(2) + .build() + .unwrap(); + + let required_size = config.required_bytes(); + let memory = MockMemory::new(0x1000, 
required_size); + let layout = FullyContiguousLayout::new(config.clone(), memory).unwrap(); + + // Test accessing specific memory regions + let region_size = config.page_size * config.inner_dim * config.dtype_width_bytes; + + // Block 0, Layer 0, Outer 0 + let region = layout.memory_region(0, 0, 0).unwrap(); + assert_eq!(region.addr, 0x1000); + assert_eq!(region.size, region_size); + + // Block 0, Layer 0, Outer 1 + let region = layout.memory_region(0, 0, 1).unwrap(); + assert_eq!(region.addr, 0x1000 + region_size); + assert_eq!(region.size, region_size); + + // Block 0, Layer 1, Outer 0 + let region = layout.memory_region(0, 1, 0).unwrap(); + assert_eq!(region.addr, 0x1000 + 2 * region_size); + assert_eq!(region.size, region_size); + + // Block 1, Layer 0, Outer 0 + let region = layout.memory_region(1, 0, 0).unwrap(); + assert_eq!( + region.addr, + 0x1000 + (config.outer_dim * config.num_layers * region_size) + ); + assert_eq!(region.size, region_size); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/layout/integration_tests.rs b/lib/llm/src/block_manager/v2/physical/layout/integration_tests.rs new file mode 100644 index 0000000000..e09a71cef5 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/layout/integration_tests.rs @@ -0,0 +1,401 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Integration tests comparing v1 and v2 layout implementations. +//! +//! These tests validate that the new v2 layout system produces identical +//! memory region **addresses** as the proven v1 implementation. +//! +//! **Note on Size Differences**: V1's `memory_region()` returns `layer_stride` as the +//! size (covering all outer dimensions), while V2 returns `outer_stride` (single page). +//! This is an intentional API difference - V2 provides more granular access. +//! Therefore, these tests only compare addresses, not sizes. + +#![cfg(test)] + +use anyhow::Result; +use std::{any::Any, sync::Arc}; + +use crate::block_manager::{ + layout::{ + BlockDimension, BlockLayout, BlockLayoutConfig, GenericBlockLayout, LayoutConfig, + LayoutType, + tests::{setup_layer_separate_layout, setup_layout}, + }, + storage::{Storage, tests::NullDeviceStorage}, + v2::storage::StorageKind, +}; + +use super::{ + FullyContiguousLayout, LayerSeparateLayout, Layout, LayoutConfig as V2LayoutConfig, + MemoryRegion, +}; + +// Test constants matching v1 tests +const NUM_BLOCKS: usize = 7; +const NUM_LAYERS: usize = 5; +const OUTER_DIM: usize = 2; +const PAGE_SIZE: usize = 4; +const INNER_DIM: usize = 13; +const DTYPE_WIDTH_BYTES: usize = 4; + +/// Wrapper to make v1 NullDeviceStorage compatible with v2 MemoryRegion trait. 
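+/// (Forwards `addr()`/`size()` from the wrapped v1 storage and reports `StorageKind::System`.)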
+#[derive(Debug)] +struct V1StorageWrapper { + storage: NullDeviceStorage, +} + +impl MemoryRegion for V1StorageWrapper { + fn addr(&self) -> usize { + self.storage.addr() as usize + } + + fn size(&self) -> usize { + self.storage.size() + } + + fn storage_kind(&self) -> StorageKind { + StorageKind::System + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +/// Create v1 layout configuration +fn create_v1_config() -> LayoutConfig { + LayoutConfig::builder() + .num_blocks(NUM_BLOCKS) + .num_layers(NUM_LAYERS) + .outer_dim(OUTER_DIM) + .page_size(PAGE_SIZE) + .inner_dim(INNER_DIM) + .alignment(1) + .dtype_width_bytes(DTYPE_WIDTH_BYTES) + .build() + .unwrap() +} + +/// Create v2 layout configuration (equivalent to v1) +fn create_v2_config() -> V2LayoutConfig { + create_v1_config() +} + +#[test] +fn test_v1_v2_fully_contiguous_equivalence() -> Result<()> { + // Create v1 layout + let v1_layout = setup_layout(None)?; + + // Create v2 layout with same configuration + let v2_config = create_v2_config(); + let required_size = + NUM_BLOCKS * NUM_LAYERS * OUTER_DIM * PAGE_SIZE * INNER_DIM * DTYPE_WIDTH_BYTES; + let v1_storage = NullDeviceStorage::new(required_size as u64); + let memory = Arc::new(V1StorageWrapper { + storage: v1_storage, + }) as Arc; + let v2_layout = FullyContiguousLayout::new(v2_config, memory)?; + + // Compare all memory regions + for block_id in 0..NUM_BLOCKS { + for layer_id in 0..NUM_LAYERS { + for outer_id in 0..OUTER_DIM { + let v1_region = v1_layout.memory_region(block_id, layer_id, outer_id)?; + let v2_region = v2_layout.memory_region(block_id, layer_id, outer_id)?; + + assert_eq!( + v1_region.addr(), + v2_region.addr, + "Address mismatch at block={}, layer={}, outer={}", + block_id, + layer_id, + outer_id + ); + assert_eq!( + v1_region.size(), + v2_region.size, + "Size mismatch at block={}, layer={}, outer={}", + block_id, + layer_id, + outer_id + ); + } + } + } + + // Verify metadata + assert_eq!(v1_layout.num_blocks(), v2_layout.num_blocks()); + assert_eq!(v1_layout.num_layers(), v2_layout.num_layers()); + assert_eq!(v1_layout.outer_dim(), v2_layout.outer_dim()); + assert_eq!(v1_layout.page_size(), v2_layout.page_size()); + assert_eq!(v1_layout.inner_dim(), v2_layout.inner_dim()); + + Ok(()) +} + +#[test] +fn test_v1_v2_layer_separate_block_contiguous_equivalence() -> Result<()> { + // Create v1 layout (block contiguous = !outer_contiguous) + let v1_layout = setup_layer_separate_layout(None, BlockDimension::BlockIsFirstDim)?; + + // Create v2 layout with same configuration + let v2_config = create_v2_config(); + let per_layer_size = NUM_BLOCKS * OUTER_DIM * PAGE_SIZE * INNER_DIM * DTYPE_WIDTH_BYTES; + + let memory: Vec> = (0..NUM_LAYERS) + .map(|_| { + Arc::new(V1StorageWrapper { + storage: NullDeviceStorage::new(per_layer_size as u64), + }) as Arc + }) + .collect(); + + let v2_layout = LayerSeparateLayout::new(v2_config, memory, BlockDimension::BlockIsFirstDim)?; + + // Verify metadata + assert_eq!(v1_layout.num_blocks(), v2_layout.num_blocks()); + assert_eq!(v1_layout.num_layers(), v2_layout.num_layers()); + assert_eq!(v1_layout.outer_dim(), v2_layout.outer_dim()); + assert_eq!(v1_layout.page_size(), v2_layout.page_size()); + assert_eq!(v1_layout.inner_dim(), v2_layout.inner_dim()); + + // Compare all memory regions + for block_id in 0..NUM_BLOCKS { + for layer_id in 0..NUM_LAYERS { + for outer_id in 0..OUTER_DIM { + let v1_region = v1_layout.memory_region(block_id, layer_id, outer_id)?; + let v2_region = v2_layout.memory_region(block_id, layer_id, outer_id)?; + + 
assert_eq!( + v1_region.addr(), + v2_region.addr, + "Address mismatch at block={}, layer={}, outer={} (block_contiguous)", + block_id, + layer_id, + outer_id + ); + assert_eq!( + v1_region.size(), + v2_region.size, + "Size mismatch at block={}, layer={}, outer={} (block_contiguous)", + block_id, + layer_id, + outer_id + ); + } + } + } + + // Verify layout type + assert!(!v2_layout.is_fully_contiguous()); + + assert_eq!( + v1_layout.layout_type(), + LayoutType::LayerSeparate { + block_dim: BlockDimension::BlockIsFirstDim, + } + ); + + Ok(()) +} + +#[test] +fn test_v1_v2_layer_separate_outer_contiguous_equivalence() -> Result<()> { + // Create v1 layout (outer contiguous) + let v1_layout = setup_layer_separate_layout(None, BlockDimension::BlockIsSecondDim)?; + + // Create v2 layout with same configuration + let v2_config = create_v2_config(); + let per_layer_size = NUM_BLOCKS * OUTER_DIM * PAGE_SIZE * INNER_DIM * DTYPE_WIDTH_BYTES; + + let memory: Vec> = (0..NUM_LAYERS) + .map(|_| { + Arc::new(V1StorageWrapper { + storage: NullDeviceStorage::new(per_layer_size as u64), + }) as Arc + }) + .collect(); + + let v2_layout = LayerSeparateLayout::new(v2_config, memory, BlockDimension::BlockIsSecondDim)?; + + // Compare all memory regions + for block_id in 0..NUM_BLOCKS { + for layer_id in 0..NUM_LAYERS { + for outer_id in 0..OUTER_DIM { + let v1_region = v1_layout.memory_region(block_id, layer_id, outer_id)?; + let v2_region = v2_layout.memory_region(block_id, layer_id, outer_id)?; + + assert_eq!( + v1_region.addr(), + v2_region.addr, + "Address mismatch at block={}, layer={}, outer={} (outer_contiguous)", + block_id, + layer_id, + outer_id + ); + assert_eq!( + v1_region.size(), + v2_region.size, + "Size mismatch at block={}, layer={}, outer={} (outer_contiguous)", + block_id, + layer_id, + outer_id + ); + } + } + } + + // Verify layout type + assert!(!v2_layout.is_fully_contiguous()); + assert_eq!( + v1_layout.layout_type(), + LayoutType::LayerSeparate { + block_dim: BlockDimension::BlockIsSecondDim, + } + ); + + Ok(()) +} + +#[test] +fn test_v1_v2_stride_calculations() -> Result<()> { + // Test with a specific pattern to verify stride calculations + let _v1_layout = setup_layout(None)?; + let v2_config = create_v2_config(); + let required_size = + NUM_BLOCKS * NUM_LAYERS * OUTER_DIM * PAGE_SIZE * INNER_DIM * DTYPE_WIDTH_BYTES; + let v1_storage = NullDeviceStorage::new(required_size as u64); + let memory = Arc::new(V1StorageWrapper { + storage: v1_storage, + }) as Arc; + let v2_layout = FullyContiguousLayout::new(v2_config, memory)?; + + // Calculate expected strides + let region_size = PAGE_SIZE * INNER_DIM * DTYPE_WIDTH_BYTES; + let outer_stride = region_size; + let layer_stride = outer_stride * OUTER_DIM; + let block_stride = layer_stride * NUM_LAYERS; + + // Test stride consistency across blocks + for block_id in 0..NUM_BLOCKS - 1 { + let region_b0 = v2_layout.memory_region(block_id, 0, 0)?; + let region_b1 = v2_layout.memory_region(block_id + 1, 0, 0)?; + assert_eq!( + region_b1.addr - region_b0.addr, + block_stride, + "Block stride mismatch between blocks {} and {}", + block_id, + block_id + 1 + ); + } + + // Test stride consistency across layers + for layer_id in 0..NUM_LAYERS - 1 { + let region_l0 = v2_layout.memory_region(0, layer_id, 0)?; + let region_l1 = v2_layout.memory_region(0, layer_id + 1, 0)?; + assert_eq!( + region_l1.addr - region_l0.addr, + layer_stride, + "Layer stride mismatch between layers {} and {}", + layer_id, + layer_id + 1 + ); + } + + // Test stride consistency 
across outer dimensions + for outer_id in 0..OUTER_DIM - 1 { + let region_o0 = v2_layout.memory_region(0, 0, outer_id)?; + let region_o1 = v2_layout.memory_region(0, 0, outer_id + 1)?; + assert_eq!( + region_o1.addr - region_o0.addr, + outer_stride, + "Outer stride mismatch between outer dims {} and {}", + outer_id, + outer_id + 1 + ); + } + + Ok(()) +} + +#[test] +fn test_v1_v2_edge_case_single_block() -> Result<()> { + // Test with minimal configuration: single block + let v1_config = LayoutConfig::builder() + .num_blocks(1) + .num_layers(NUM_LAYERS) + .outer_dim(OUTER_DIM) + .page_size(PAGE_SIZE) + .inner_dim(INNER_DIM) + .dtype_width_bytes(DTYPE_WIDTH_BYTES) + .build() + .unwrap(); + + let v1_layout = crate::block_manager::layout::FullyContiguous::allocate( + v1_config.clone(), + &crate::block_manager::storage::tests::NullDeviceAllocator, + )?; + + let v2_config = v1_config.clone(); + + let required_size = 1 * NUM_LAYERS * OUTER_DIM * PAGE_SIZE * INNER_DIM * DTYPE_WIDTH_BYTES; + let v1_storage = NullDeviceStorage::new(required_size as u64); + let memory = Arc::new(V1StorageWrapper { + storage: v1_storage, + }) as Arc; + let v2_layout = FullyContiguousLayout::new(v2_config, memory)?; + + // Compare the single block across all layers and outer dims + for layer_id in 0..NUM_LAYERS { + for outer_id in 0..OUTER_DIM { + let v1_region = v1_layout.memory_region(0, layer_id, outer_id)?; + let v2_region = v2_layout.memory_region(0, layer_id, outer_id)?; + + assert_eq!(v1_region.addr(), v2_region.addr); + assert_eq!(v1_region.size(), v2_region.size); + } + } + + Ok(()) +} + +#[test] +fn test_v1_v2_edge_case_single_layer() -> Result<()> { + // Test with minimal configuration: single layer + let v1_config = LayoutConfig::builder() + .num_blocks(NUM_BLOCKS) + .num_layers(1) + .outer_dim(OUTER_DIM) + .page_size(PAGE_SIZE) + .inner_dim(INNER_DIM) + .dtype_width_bytes(DTYPE_WIDTH_BYTES) + .build()?; + + let v1_layout = crate::block_manager::layout::FullyContiguous::allocate( + v1_config.clone(), + &crate::block_manager::storage::tests::NullDeviceAllocator, + )?; + + let v2_config = v1_config.clone(); + + let required_size = NUM_BLOCKS * 1 * OUTER_DIM * PAGE_SIZE * INNER_DIM * DTYPE_WIDTH_BYTES; + let v1_storage = NullDeviceStorage::new(required_size as u64); + let memory = Arc::new(V1StorageWrapper { + storage: v1_storage, + }) as Arc; + let v2_layout = FullyContiguousLayout::new(v2_config, memory)?; + + // Compare the single layer across all blocks and outer dims + for block_id in 0..NUM_BLOCKS { + for outer_id in 0..OUTER_DIM { + let v1_region = v1_layout.memory_region(block_id, 0, outer_id)?; + let v2_region = v2_layout.memory_region(block_id, 0, outer_id)?; + + assert_eq!(v1_region.addr(), v2_region.addr); + assert_eq!(v1_region.size(), v2_region.size); + } + } + + Ok(()) +} diff --git a/lib/llm/src/block_manager/v2/physical/layout/layer_separate.rs b/lib/llm/src/block_manager/v2/physical/layout/layer_separate.rs new file mode 100644 index 0000000000..035dbb18b1 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/layout/layer_separate.rs @@ -0,0 +1,311 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Layer-separate layout implementation. +//! +//! This layout stores each layer in its own allocation, which is the typical +//! vLLM layout. Each layer can be either block-contiguous or outer-contiguous: +//! - Block-contiguous: [num_blocks, outer_dim, page_size, inner_dim] +//! 
- Outer-contiguous: [outer_dim, num_blocks, page_size, inner_dim] + +use anyhow::{Result, anyhow}; +use std::sync::Arc; +use validator::Validate; + +use super::serialize::{LayerSeparateDetails, LayoutTypeDetails}; +use super::{ + BlockDimension, Layout, LayoutConfig, MemoryDescriptor, MemoryRegion, OwnedMemoryRegion, +}; + +/// Layer-separate layout where each layer has its own allocation. +#[derive(Debug)] +pub struct LayerSeparateLayout { + config: LayoutConfig, + /// Base addresses for each layer + layer_base_addrs: Vec, + /// Whether the outer dimension is contiguous (vs block dimensionl + block_dim: BlockDimension, + /// Stride between blocks in bytes + block_stride: usize, + /// Stride between outer dimensions in bytes + outer_stride: usize, + /// Size of each memory region (page) in bytes + region_size: usize, + /// Owned memory regions backing this layout (one per layer) + memory_regions: Vec>, +} + +impl LayerSeparateLayout { + /// Create a new layer-separate layout. + /// + /// # Arguments + /// - `config` - Layout configuration + /// - `memory` - Vector of owned memory regions (one per layer) + /// - `outer_contiguous` - If true, outer dimension is contiguous with the inner dimension, i.e. (num_blocks, outer_dim, ...); + /// if false, block dimension is contiguous with the inner dimension, i.e. (outer_dim, num_blocks, ...). + /// + /// # Returns + /// A new LayerSeparateLayout instance + pub fn new( + config: LayoutConfig, + memory: Vec>, + block_dim: BlockDimension, + ) -> Result { + config.validate()?; + + if memory.len() != config.num_layers { + return Err(anyhow!( + "Memory region count ({}) must match num_layers ({})", + memory.len(), + config.num_layers + )); + } + + // Calculate strides + let region_size = config.page_size * config.inner_dim * config.dtype_width_bytes; + + let (block_stride, outer_stride) = if block_dim == BlockDimension::BlockIsSecondDim { + // Layout: [outer_dim, num_blocks, page_size, inner_dim] + let block_stride = region_size; + let outer_stride = block_stride * config.num_blocks; + (block_stride, outer_stride) + } else { + // Layout: [num_blocks, outer_dim, page_size, inner_dim] + let outer_stride = region_size; + let block_stride = outer_stride * config.outer_dim; + (block_stride, outer_stride) + }; + + // Extract base addresses and validate sizes + let mut layer_base_addrs = Vec::with_capacity(config.num_layers); + let required_size = config.num_blocks * config.outer_dim * region_size; + + for (i, mem) in memory.iter().enumerate() { + if mem.size() < required_size { + return Err(anyhow!( + "Memory region {} too small for layout. Required: {} bytes, got: {} bytes", + i, + required_size, + mem.size() + )); + } + layer_base_addrs.push(mem.addr()); + } + + Ok(Self { + config, + layer_base_addrs, + block_dim, + block_stride, + outer_stride, + region_size, + memory_regions: memory, + }) + } + + /// Calculate the address of a specific memory region. 
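+    /// (Roughly `layer_base_addrs[layer_id] + block_id * block_stride + outer_id * outer_stride`,
+    /// after bounds-checking each index.)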
+ fn calculate_address( + &self, + block_id: usize, + layer_id: usize, + outer_id: usize, + ) -> Result { + if block_id >= self.config.num_blocks { + return Err(anyhow!( + "Block ID {} out of range (max: {})", + block_id, + self.config.num_blocks + )); + } + if layer_id >= self.config.num_layers { + return Err(anyhow!( + "Layer ID {} out of range (max: {})", + layer_id, + self.config.num_layers + )); + } + if outer_id >= self.config.outer_dim { + return Err(anyhow!( + "Outer ID {} out of range (max: {})", + outer_id, + self.config.outer_dim + )); + } + + let base_addr = self.layer_base_addrs[layer_id]; + let offset = block_id * self.block_stride + outer_id * self.outer_stride; + + Ok(base_addr + offset) + } + + pub fn block_dim(&self) -> BlockDimension { + self.block_dim + } + + /// Get mutable reference to the memory regions for NIXL registration. + pub fn memory_regions_mut(&mut self) -> &mut [Arc] { + &mut self.memory_regions + } +} + +impl Layout for LayerSeparateLayout { + fn config(&self) -> &LayoutConfig { + &self.config + } + + fn memory_regions(&self) -> &[OwnedMemoryRegion] { + &self.memory_regions + } + + fn memory_region( + &self, + block_id: usize, + layer_id: usize, + outer_id: usize, + ) -> Result { + let addr = self.calculate_address(block_id, layer_id, outer_id)?; + Ok(MemoryDescriptor::new(addr, self.region_size)) + } + + fn required_allocations(&self) -> Vec { + // One allocation per layer + let per_layer_size = self.config.num_blocks * self.config.outer_dim * self.region_size; + vec![per_layer_size; self.config.num_layers] + } + + fn is_fully_contiguous(&self) -> bool { + false + } + + fn num_blocks(&self) -> usize { + self.config.num_blocks + } + + fn num_layers(&self) -> usize { + self.config.num_layers + } + + fn outer_dim(&self) -> usize { + self.config.outer_dim + } + + fn page_size(&self) -> usize { + self.config.page_size + } + + fn inner_dim(&self) -> usize { + self.config.inner_dim + } + + fn dtype_width_bytes(&self) -> usize { + self.config.dtype_width_bytes + } + + fn serialization_details(&self) -> LayoutTypeDetails { + LayoutTypeDetails::LayerSeparate(LayerSeparateDetails { + block_dim: self.block_dim, + }) + } +} + +#[cfg(test)] +mod tests { + use super::super::tests::*; + use super::*; + + #[test] + fn test_layer_separate_block_contiguous() { + let config = LayoutConfig::builder() + .num_blocks(10) + .num_layers(4) + .outer_dim(2) + .page_size(16) + .inner_dim(128) + .dtype_width_bytes(2) + .build() + .unwrap(); + + let per_layer_size = 10 * 2 * 16 * 128 * 2; + let memory: Vec> = (0..4) + .map(|i| { + MockMemory::new(0x1000 + i * per_layer_size, per_layer_size) + as Arc + }) + .collect(); + + let layout = + LayerSeparateLayout::new(config, memory, BlockDimension::BlockIsFirstDim).unwrap(); + + assert_eq!(layout.num_blocks(), 10); + assert!(!layout.is_fully_contiguous()); + assert_eq!(layout.required_allocations().len(), 4); + } + + #[test] + fn test_layer_separate_outer_contiguous() { + let config = LayoutConfig::builder() + .num_blocks(10) + .num_layers(4) + .outer_dim(2) + .page_size(16) + .inner_dim(128) + .dtype_width_bytes(2) + .build() + .unwrap(); + + let per_layer_size = 10 * 2 * 16 * 128 * 2; + let memory: Vec> = (0..4) + .map(|i| { + MockMemory::new(0x1000 + i * per_layer_size, per_layer_size) + as Arc + }) + .collect(); + + let layout = + LayerSeparateLayout::new(config, memory, BlockDimension::BlockIsSecondDim).unwrap(); + assert_eq!(layout.num_blocks(), 10); + assert!(!layout.is_fully_contiguous()); + } + + #[test] + fn test_memory_region() { 
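+        // 2 blocks x 2 layers x 2 outer dims, block-first ordering: the layer index selects the
+        // backing region; block/outer offsets are applied within it.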
+ let config = LayoutConfig::builder() + .num_blocks(2) + .num_layers(2) + .outer_dim(2) + .page_size(16) + .inner_dim(128) + .dtype_width_bytes(2) + .build() + .unwrap(); + + let per_layer_size = 2 * 2 * 16 * 128 * 2; + let memory: Vec> = (0..2) + .map(|i| { + MockMemory::new(0x1000 + i * per_layer_size, per_layer_size) + as Arc + }) + .collect(); + + let layout = + LayerSeparateLayout::new(config, memory, BlockDimension::BlockIsFirstDim).unwrap(); + + // Test accessing specific memory regions + let region_size = 16 * 128 * 2; + + // Block 0, Layer 0, Outer 0 - should be at layer 0's base address + let region = layout.memory_region(0, 0, 0).unwrap(); + assert_eq!(region.addr, 0x1000); + assert_eq!(region.size, region_size); + + // Block 0, Layer 1, Outer 0 - should be at layer 1's base address + let region = layout.memory_region(0, 1, 0).unwrap(); + assert_eq!(region.addr, 0x1000 + per_layer_size); + assert_eq!(region.size, region_size); + + // Block 0, Layer 0, Outer 1 - should be offset within layer 0 + let region = layout.memory_region(0, 0, 1).unwrap(); + assert_eq!(region.addr, 0x1000 + region_size); + assert_eq!(region.size, region_size); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/layout/mod.rs b/lib/llm/src/block_manager/v2/physical/layout/mod.rs new file mode 100644 index 0000000000..1b125a41b0 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/layout/mod.rs @@ -0,0 +1,135 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Decoupled layout system for block management. +//! +//! This module provides a simplified layout abstraction that: +//! - Maps block IDs to physical memory regions (address + size) +//! - Decouples memory regions from storage type information +//! - Specifies allocation requirements without performing allocation +//! - Uses trait objects for memory ownership + +pub(crate) mod builder; + +mod config; +mod fully_contiguous; +mod layer_separate; +mod physical; +mod serialize; +mod validation; + +#[cfg(test)] +pub(super) mod tests; + +// #[cfg(test)] +// mod integration_tests; + +pub use builder::{LayoutKind, PhysicalLayoutBuilder}; +pub use config::{BlockDimension, LayoutConfig}; +pub use fully_contiguous::FullyContiguousLayout; +pub use layer_separate::LayerSeparateLayout; +pub use physical::{NixlMetadata, PhysicalLayout}; +pub use serialize::{ + BlockFormat, FullyContiguousDetails, LayerSeparateDetails, LayoutDescriptor, LayoutTypeDetails, +}; +pub use validation::{TensorFormat, validate_tensor_shapes, validate_tensor_strides}; + +// mod registration; +// pub use registration::{RegisteredLayout, RegisteredStorageMetadata, RegistrationManager}; + +use anyhow::Result; +use serde::{Deserialize, Serialize}; + +pub use crate::block_manager::v2::memory::{MemoryDescriptor, MemoryRegion, OwnedMemoryRegion}; + +/// Core layout trait for mapping block IDs to memory regions. +/// +/// Layouts specify how KV cache blocks are organized in memory without +/// performing allocation themselves. They provide: +/// - Memory region lookup for specific blocks +/// - Allocation requirements for external allocators +/// - Metadata about block organization +pub trait Layout: Send + Sync + std::fmt::Debug { + /// Get the configuration for this layout. + fn config(&self) -> &LayoutConfig; + + /// Get the root memory regions backing this layout. + /// + /// These regions correspond to the concrete allocations that store the layout's data. 
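The expectations in `test_memory_region` follow directly from the layer-separate addressing scheme. The sketch below spells out that arithmetic as plain functions; the stride derivation (`outer_stride = region_size`, `block_stride = outer_dim * region_size`) is an assumption that mirrors what `calculate_address` appears to do for `BlockIsFirstDim`, and the constants are the ones used in the test.

```rust
// Standalone sketch of the layer-separate ("block is first dim") address math the test
// above exercises. Stride names mirror the fields used by calculate_address; the
// concrete numbers come from test_memory_region and are assumptions here.
fn layer_separate_addr(
    layer_base: usize, // base address of this layer's allocation
    block_id: usize,
    outer_id: usize,
    outer_dim: usize,
    region_size: usize, // page_size * inner_dim * dtype_width_bytes
) -> usize {
    let outer_stride = region_size;              // one contiguous (page, inner) chunk
    let block_stride = outer_dim * region_size;  // one block spans all outer entries
    layer_base + block_id * block_stride + outer_id * outer_stride
}

fn main() {
    let region_size = 16 * 128 * 2;           // page_size * inner_dim * dtype bytes
    let per_layer_size = 2 * 2 * region_size;  // num_blocks * outer_dim * region_size
    let layer0 = 0x1000;
    let layer1 = 0x1000 + per_layer_size;

    assert_eq!(layer_separate_addr(layer0, 0, 0, 2, region_size), 0x1000);
    assert_eq!(layer_separate_addr(layer1, 0, 0, 2, region_size), 0x1000 + per_layer_size);
    assert_eq!(layer_separate_addr(layer0, 0, 1, 2, region_size), 0x1000 + region_size);
    println!("addresses match the expectations in test_memory_region");
}
```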
+ /// Implementations that derive memory procedurally can return an empty slice. + fn memory_regions(&self) -> &[OwnedMemoryRegion]; + + /// Get memory regions for a specific block_id, layer_id, outer_id. + /// + /// Returns a [MemoryRegion] for the continuous region specified by the given block_id, + /// layer_id, outer_id. + /// + /// # Arguments + /// * `block_id` - The ID of the block to query (0..num_blocks) + /// * `layer_id` - The ID of the layer to query (0..num_layers) + /// * `outer_id` - The ID of the outer dimension to query (0..outer_dim) + fn memory_region( + &self, + block_id: usize, + layer_id: usize, + outer_id: usize, + ) -> Result; + + /// Get the allocation requirements for this layout. + /// + /// Returns a vector of allocation sizes needed to back this layout. + /// For fully contiguous layouts, this will be a single size. + /// For layer-separate layouts, this will contain one size per layer. + /// + /// # Returns + /// Vector of allocation sizes in bytes. + fn required_allocations(&self) -> Vec; + + /// Check if this layout uses fully contiguous memory. + /// + /// Fully contiguous layouts have all blocks in a single allocation, + /// which enables certain optimizations. + fn is_fully_contiguous(&self) -> bool; + + /// Get the total number of blocks in this layout. + fn num_blocks(&self) -> usize; + + /// Get the number of layers per block. + fn num_layers(&self) -> usize; + + /// Get the outer dimension size. + /// + /// In typical KV cache layouts, this is often 2 (for K and V), + /// but can be 1 for architectures like MLA. + fn outer_dim(&self) -> usize; + + /// Get the page size (often corresponds to block size in tokens). + fn page_size(&self) -> usize; + + /// Get the inner dimension size. + /// + /// This is typically the hidden size divided by tensor parallel size. + fn inner_dim(&self) -> usize; + + /// Get the data type width in bytes. + fn dtype_width_bytes(&self) -> usize; + + /// Get serialization details for this layout type. + /// + /// This provides the layout-type-specific information needed to serialize + /// and reconstruct the layout on a remote node. + fn serialization_details(&self) -> serialize::LayoutTypeDetails; +} + +/// Inner shape format for tensor layout +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum InnerShape { + /// Unknown shape - fallback when we can't determine the format + Unknown, + /// NHD format: [block_size, num_heads, head_dim] + /// Common for attention layers where N=tokens, H=heads, D=dimension + NHD, + /// HND format: [num_heads, block_size, head_dim] + /// Alternative layout with heads first + HND, +} diff --git a/lib/llm/src/block_manager/v2/physical/layout/physical.rs b/lib/llm/src/block_manager/v2/physical/layout/physical.rs new file mode 100644 index 0000000000..886c791939 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/layout/physical.rs @@ -0,0 +1,290 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Physical layout types that combine abstract layouts with storage location metadata. 
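The `Layout` trait above is a pure lookup interface: given `(block_id, layer_id, outer_id)` it yields an address and size, and callers iterate those indices themselves. The self-contained sketch below shows that consumption pattern with local stand-ins (`Desc`, and a closure in place of a `&dyn Layout`); the fully contiguous ordering used in `main` is an assumption for illustration only.

```rust
use std::collections::BTreeMap;

// Minimal stand-ins for MemoryDescriptor and a Layout-like lookup, showing how a
// consumer enumerates every (block, layer, outer) region exposed by the trait.
#[derive(Debug, Clone, Copy)]
struct Desc {
    addr: usize,
    size: usize,
}

fn enumerate_regions(
    num_blocks: usize,
    num_layers: usize,
    outer_dim: usize,
    lookup: impl Fn(usize, usize, usize) -> Desc,
) -> BTreeMap<(usize, usize, usize), Desc> {
    let mut out = BTreeMap::new();
    for b in 0..num_blocks {
        for l in 0..num_layers {
            for o in 0..outer_dim {
                out.insert((b, l, o), lookup(b, l, o));
            }
        }
    }
    out
}

fn main() {
    // Assumed fully contiguous ordering: block -> layer -> outer, region_size bytes each.
    let region_size = 4 * 8 * 2;
    let (blocks, layers, outer) = (2, 2, 2);
    let regions = enumerate_regions(blocks, layers, outer, |b, l, o| Desc {
        addr: 0x1000 + ((b * layers + l) * outer + o) * region_size,
        size: region_size,
    });
    assert_eq!(regions.len(), blocks * layers * outer);
    println!("first region: {:?}", regions[&(0, 0, 0)]);
}
```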
+ +use super::{ + FullyContiguousLayout, LayerSeparateLayout, Layout, MemoryDescriptor, + builder::{PhysicalLayoutBuilder, PhysicalLayoutBuilderDefault}, + serialize::{LayoutDescriptor, LayoutTypeDetails}, +}; + +use crate::block_manager::v2::memory::{MemoryRegion, StorageKind}; +use anyhow::{Result, anyhow}; +use serde::{Deserialize, Serialize}; +use std::any::Any; +use std::sync::Arc; + +use crate::block_manager::v2::physical::transfer::nixl_agent::NixlAgent; + +/// Runtime representation of a layout with its physical storage location. +/// +/// A `PhysicalLayout` wraps an abstract [`Layout`] with information about where +/// its memory physically resides (GPU, host, disk) and whether it's local or remote. +/// This enables the transfer system to select appropriate copy strategies and build +/// NIXL transfer descriptors. +#[derive(Debug, Clone)] +pub struct PhysicalLayout { + /// The abstract layout defining memory organization + layout: Arc, + + /// Physical storage location (System, Device, Pinned, Disk) + location: StorageKind, + + /// NIXL registration metadata + nixl_metadata: NixlMetadata, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct NixlMetadata { + agent_name: String, + mem_type: nixl_sys::MemType, + device_id: u64, +} + +impl NixlMetadata { + pub fn new(agent_name: String, mem_type: nixl_sys::MemType, device_id: u64) -> Self { + Self { + agent_name, + mem_type, + device_id, + } + } + + pub fn agent_name(&self) -> &str { + &self.agent_name + } + + pub fn mem_type(&self) -> nixl_sys::MemType { + self.mem_type + } + + pub fn device_id(&self) -> u64 { + self.device_id + } +} + +impl PhysicalLayout { + /// Create a typed builder that enforces NIXL registration. + pub fn builder(agent: NixlAgent) -> PhysicalLayoutBuilderDefault { + PhysicalLayoutBuilder::new(agent) + } + + /// Create a new local physical layout. + /// + /// # Arguments + /// * `layout` - The abstract layout to wrap + /// * `location` - Where the layout's memory resides + pub(crate) fn new_local( + layout: Arc, + location: StorageKind, + nixl_metadata: NixlMetadata, + ) -> Self { + Self { + layout, + location, + nixl_metadata, + } + } + + // /// Create a new remote physical layout from a descriptor. + // /// + // /// # Arguments + // /// * `layout` - The abstract layout to wrap + // /// * `location` - Where the layout's memory resides (on remote node) + // /// * `remote_agent` - Name of the NIXL agent on the remote node + // pub fn new_remote( + // layout: Arc, + // location: StorageKind, + // remote_agent: String, + // ) -> Self { + // let metadata = NixlMetadata::new( + // remote_agent.clone(), + // location.to_nixl_mem_type(), + // location.device_id(), + // ); + // let registrations = vec![RegisteredStorageMetadata::new( + // metadata.agent_name().to_string(), + // location, + // )]; + // Self { + // layout, + // location, + // locality: Locality::Remote(remote_agent), + // nixl_metadata: Some(metadata), + // registered: registrations, + // } + // } + + /// Get the underlying layout. + pub fn layout(&self) -> &Arc { + &self.layout + } + + /// Get the storage location. + pub fn location(&self) -> StorageKind { + self.location + } + + /// Get the NIXL metadata. + pub fn nixl_metadata(&self) -> &NixlMetadata { + &self.nixl_metadata + } + + /// Get a memory region with location information. 
+ /// + /// # Arguments + /// * `block_id` - Block identifier + /// * `layer_id` - Layer identifier + /// * `outer_id` - Outer dimension identifier + pub fn memory_region( + &self, + block_id: usize, + layer_id: usize, + outer_id: usize, + ) -> Result { + self.layout.memory_region(block_id, layer_id, outer_id) + } + + /// Serialize this physical layout for transmission to remote nodes. + /// + /// This converts the runtime `PhysicalLayout` into a `LayoutDescriptor` that + /// contains all information needed to reconstruct the layout on a remote node, + /// including layout configuration, memory descriptors, NIXL metadata, and + /// layout-type-specific details. + /// + /// # Returns + /// A serializable representation of this layout + pub fn to_descriptor(&self) -> Result { + // Extract memory descriptors + let memory_descriptors = self + .layout + .memory_regions() + .iter() + .map(|region| MemoryDescriptor { + addr: region.addr(), + size: region.size(), + }) + .collect(); + + // Get layout type details from the layout itself + let layout_type_details = self.layout.serialization_details(); + + Ok(LayoutDescriptor { + version: LayoutDescriptor::CURRENT_VERSION, + layout_config: self.layout.config().clone(), + location: self.location, + nixl_metadata: self.nixl_metadata.clone(), + memory_descriptors, + layout_type_details, + }) + } + + /// Reconstruct a physical layout from serialized data received from a remote node. + /// + /// This creates a new `PhysicalLayout` from a `LayoutDescriptor`. The reconstructed + /// layout will have memory descriptors that point to the remote node's memory, + /// allowing NIXL to build RDMA descriptors for remote access. + /// + /// # Arguments + /// * `serialized` - Serialized layout data from a remote node + /// + /// # Returns + /// A new `PhysicalLayout` representing the remote layout + /// + /// # Note + /// The memory regions in the reconstructed layout are not valid for local access; + /// they represent remote memory addresses and are used to build NIXL transfer descriptors. + pub fn from_descriptor(serialized: LayoutDescriptor) -> Result { + // Validate version + if serialized.version > LayoutDescriptor::CURRENT_VERSION { + return Err(anyhow!( + "Unsupported serialization version: {}. 
Maximum supported: {}", + serialized.version, + LayoutDescriptor::CURRENT_VERSION + )); + } + + // Create remote memory regions from descriptors + let remote_regions: Vec> = serialized + .memory_descriptors + .iter() + .map(|desc| { + Arc::new(RemoteMemoryDescriptor { + addr: desc.addr, + size: desc.size, + storage_kind: serialized.location, + }) as Arc + }) + .collect(); + + // Reconstruct the layout based on type + let layout: Arc = match serialized.layout_type_details { + LayoutTypeDetails::FullyContiguous(details) => { + if remote_regions.len() != 1 { + return Err(anyhow!( + "FullyContiguous layout requires exactly 1 memory region, got {}", + remote_regions.len() + )); + } + let layout = FullyContiguousLayout::new_with_format( + serialized.layout_config.clone(), + remote_regions[0].clone(), + details.block_format, + )?; + Arc::new(layout) + } + LayoutTypeDetails::LayerSeparate(details) => { + if remote_regions.len() != serialized.layout_config.num_layers { + return Err(anyhow!( + "LayerSeparate layout requires {} memory regions (one per layer), got {}", + serialized.layout_config.num_layers, + remote_regions.len() + )); + } + let layout = LayerSeparateLayout::new( + serialized.layout_config.clone(), + remote_regions, + details.block_dim, + )?; + Arc::new(layout) + } + }; + + Ok(Self { + layout, + location: serialized.location, + nixl_metadata: serialized.nixl_metadata, + }) + } +} + +/// A memory region that represents remote memory addresses. +/// +/// This type is used when reconstructing layouts from serialized data. +/// The addresses are not valid for local access but can be used to +/// build NIXL transfer descriptors for remote memory access. +#[derive(Debug)] +struct RemoteMemoryDescriptor { + addr: usize, + size: usize, + storage_kind: StorageKind, +} + +impl MemoryRegion for RemoteMemoryDescriptor { + fn addr(&self) -> usize { + self.addr + } + + fn size(&self) -> usize { + self.size + } + + fn storage_kind(&self) -> StorageKind { + self.storage_kind + } + + fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/lib/llm/src/block_manager/v2/physical/layout/serialize.rs b/lib/llm/src/block_manager/v2/physical/layout/serialize.rs new file mode 100644 index 0000000000..997742a075 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/layout/serialize.rs @@ -0,0 +1,268 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Serialization types for physical layouts. +//! +//! This module provides types for serializing and deserializing physical layouts +//! so they can be transmitted to remote nodes and reconstructed there for RDMA operations. + +use super::physical::NixlMetadata; +use super::{BlockDimension, LayoutConfig}; +use crate::block_manager::v2::memory::{MemoryDescriptor, StorageKind}; +use anyhow::Result; +use serde::{Deserialize, Serialize}; + +/// Format of blocks in a fully contiguous layout. +/// +/// This enum describes how the blocks are organized and formatted in memory. +/// Currently only `Operational` is supported, but future variants may include +/// different compression schemes or memory layouts. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum BlockFormat { + /// Standard operational format - blocks are stored in their normal, uncompressed form. + Operational, +} + +impl Default for BlockFormat { + fn default() -> Self { + Self::Operational + } +} + +/// Details specific to fully contiguous layouts. 
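Reconstruction in `from_descriptor` is essentially version gating followed by a dispatch on the layout-type details, with a sanity check that the number of memory descriptors matches the layout type. The sketch below captures just that control flow with local stand-in types (`Descriptor`, `Details`); it is not the crate's API.

```rust
// Local stand-ins illustrating the from_descriptor control flow: reject unknown
// versions, then dispatch on the layout-type details and validate region counts.
#[derive(Debug)]
enum Details {
    FullyContiguous,
    LayerSeparate { num_layers: usize },
}

#[derive(Debug)]
struct Descriptor {
    version: u32,
    regions: usize,
    details: Details,
}

const CURRENT_VERSION: u32 = 1;

fn reconstruct(d: &Descriptor) -> Result<&'static str, String> {
    if d.version > CURRENT_VERSION {
        return Err(format!("unsupported serialization version {}", d.version));
    }
    match &d.details {
        Details::FullyContiguous if d.regions == 1 => Ok("fully contiguous"),
        Details::FullyContiguous => Err(format!("expected 1 region, got {}", d.regions)),
        Details::LayerSeparate { num_layers } if d.regions == *num_layers => Ok("layer separate"),
        Details::LayerSeparate { num_layers } => {
            Err(format!("expected {} regions, got {}", num_layers, d.regions))
        }
    }
}

fn main() {
    let ok = Descriptor { version: 1, regions: 4, details: Details::LayerSeparate { num_layers: 4 } };
    let bad = Descriptor { version: 2, regions: 1, details: Details::FullyContiguous };
    assert_eq!(reconstruct(&ok).unwrap(), "layer separate");
    assert!(reconstruct(&bad).is_err());
}
```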
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FullyContiguousDetails { + /// Format of the blocks in memory + pub block_format: BlockFormat, +} + +/// Details specific to layer-separate layouts. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LayerSeparateDetails { + /// Block dimension ordering (block-first or block-second) + pub block_dim: BlockDimension, +} + +/// Layout-type-specific details. +/// +/// This enum captures the information that differs between layout types +/// and is needed to reconstruct the layout on a remote node. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum LayoutTypeDetails { + /// Fully contiguous layout details + FullyContiguous(FullyContiguousDetails), + /// Layer-separate layout details + LayerSeparate(LayerSeparateDetails), +} + +/// Serializable representation of a physical layout. +/// +/// This structure contains all information needed to reconstruct a layout +/// on a remote node, including: +/// - Layout configuration (dimensions, sizes, etc.) +/// - Storage location and NIXL metadata +/// - Memory descriptors for all regions +/// - Layout-type-specific details +/// +/// The serialized form can be transmitted over the network and used to +/// build NIXL transfer descriptors for remote memory access. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LayoutDescriptor { + /// Serialization format version (for future compatibility) + pub version: u32, + + /// Layout configuration + pub layout_config: LayoutConfig, + + /// Storage location + pub location: StorageKind, + + /// NIXL metadata from the source node + pub nixl_metadata: NixlMetadata, + + /// Memory descriptors for all regions backing this layout + pub memory_descriptors: Vec, + + /// Layout-type-specific details + pub layout_type_details: LayoutTypeDetails, +} + +impl LayoutDescriptor { + /// Current serialization version + pub const CURRENT_VERSION: u32 = 1; + + /// Serialize this layout to a JSON string. + /// + /// # Returns + /// JSON string representation of the layout + pub fn to_json(&self) -> Result { + serde_json::to_string(self) + .map_err(|e| anyhow::anyhow!("failed to serialize layout to JSON: {}", e)) + } + + /// Serialize this layout to JSON bytes. + /// + /// # Returns + /// UTF-8 encoded JSON bytes + pub fn to_json_bytes(&self) -> Result> { + serde_json::to_vec(self) + .map_err(|e| anyhow::anyhow!("failed to serialize layout to JSON bytes: {}", e)) + } + + /// Deserialize a layout from a JSON string. + /// + /// # Arguments + /// * `json` - JSON string representation + /// + /// # Returns + /// Deserialized layout + pub fn from_json(json: &str) -> Result { + serde_json::from_str(json) + .map_err(|e| anyhow::anyhow!("failed to deserialize layout from JSON: {}", e)) + } + + /// Deserialize a layout from JSON bytes. + /// + /// # Arguments + /// * `bytes` - UTF-8 encoded JSON bytes + /// + /// # Returns + /// Deserialized layout + pub fn from_json_bytes(bytes: &[u8]) -> Result { + serde_json::from_slice(bytes) + .map_err(|e| anyhow::anyhow!("failed to deserialize layout from JSON bytes: {}", e)) + } + + /// Get the layout configuration. + pub fn layout_config(&self) -> &LayoutConfig { + &self.layout_config + } + + /// Get the storage location. + pub fn location(&self) -> StorageKind { + self.location + } + + /// Get the NIXL metadata from the source node. + pub fn nixl_metadata(&self) -> &NixlMetadata { + &self.nixl_metadata + } + + /// Get the memory descriptors. 
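The wire format used by the JSON helpers below is plain serde JSON. A minimal round trip with a stand-in struct looks like this; the field names are illustrative, not the real schema, and the sketch assumes `serde` and `serde_json`, which this module already depends on.

```rust
use serde::{Deserialize, Serialize};

// Tiny mirror of the JSON wire shape to show the round trip that to_json/from_json wrap.
#[derive(Debug, PartialEq, Serialize, Deserialize)]
struct MiniDescriptor {
    version: u32,
    num_blocks: usize,
    addr: usize,
    size: usize,
}

fn main() {
    let d = MiniDescriptor { version: 1, num_blocks: 10, addr: 0x1000, size: 4096 };
    let json = serde_json::to_string(&d).expect("serialize");
    let back: MiniDescriptor = serde_json::from_str(&json).expect("deserialize");
    assert_eq!(d, back);
    println!("{json}");
}
```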
+ pub fn memory_descriptors(&self) -> &[MemoryDescriptor] { + &self.memory_descriptors + } + + /// Get the layout type details. + pub fn layout_type_details(&self) -> &LayoutTypeDetails { + &self.layout_type_details + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_test_config() -> LayoutConfig { + LayoutConfig::builder() + .num_blocks(10) + .num_layers(4) + .outer_dim(2) + .page_size(16) + .inner_dim(128) + .dtype_width_bytes(2) + .build() + .unwrap() + } + + #[test] + fn test_block_format_default() { + assert_eq!(BlockFormat::default(), BlockFormat::Operational); + } + + #[test] + fn test_serialized_layout_json_roundtrip() { + let layout = LayoutDescriptor { + version: LayoutDescriptor::CURRENT_VERSION, + layout_config: make_test_config(), + location: StorageKind::System, + nixl_metadata: NixlMetadata::new("test_agent".to_string(), nixl_sys::MemType::Dram, 0), + memory_descriptors: vec![MemoryDescriptor::new(0x1000, 4096)], + layout_type_details: LayoutTypeDetails::FullyContiguous(FullyContiguousDetails { + block_format: BlockFormat::Operational, + }), + }; + + // Test to_json/from_json + let json = layout.to_json().unwrap(); + let deserialized = LayoutDescriptor::from_json(&json).unwrap(); + + assert_eq!(deserialized.version, layout.version); + assert_eq!(deserialized.layout_config, layout.layout_config); + assert_eq!(deserialized.location, layout.location); + assert_eq!( + deserialized.nixl_metadata.agent_name(), + layout.nixl_metadata.agent_name() + ); + assert_eq!(deserialized.memory_descriptors.len(), 1); + } + + #[test] + fn test_serialized_layout_json_bytes_roundtrip() { + let layout = LayoutDescriptor { + version: LayoutDescriptor::CURRENT_VERSION, + layout_config: make_test_config(), + location: StorageKind::System, + nixl_metadata: NixlMetadata::new("test_agent".to_string(), nixl_sys::MemType::Vram, 5), + memory_descriptors: vec![ + MemoryDescriptor::new(0x1000, 2048), + MemoryDescriptor::new(0x2000, 2048), + ], + layout_type_details: LayoutTypeDetails::LayerSeparate(LayerSeparateDetails { + block_dim: BlockDimension::BlockIsFirstDim, + }), + }; + + // Test to_json_bytes/from_json_bytes + let bytes = layout.to_json_bytes().unwrap(); + let deserialized = LayoutDescriptor::from_json_bytes(&bytes).unwrap(); + + assert_eq!(deserialized.version, layout.version); + assert_eq!(deserialized.nixl_metadata.device_id(), 5); + assert_eq!(deserialized.memory_descriptors.len(), 2); + } + + #[test] + fn test_fully_contiguous_details_serialization() { + let details = LayoutTypeDetails::FullyContiguous(FullyContiguousDetails { + block_format: BlockFormat::Operational, + }); + + let json = serde_json::to_string(&details).unwrap(); + let deserialized: LayoutTypeDetails = serde_json::from_str(&json).unwrap(); + + match deserialized { + LayoutTypeDetails::FullyContiguous(d) => { + assert_eq!(d.block_format, BlockFormat::Operational); + } + _ => panic!("Expected FullyContiguous variant"), + } + } + + #[test] + fn test_layer_separate_details_serialization() { + let details = LayoutTypeDetails::LayerSeparate(LayerSeparateDetails { + block_dim: BlockDimension::BlockIsSecondDim, + }); + + let json = serde_json::to_string(&details).unwrap(); + let deserialized: LayoutTypeDetails = serde_json::from_str(&json).unwrap(); + + match deserialized { + LayoutTypeDetails::LayerSeparate(d) => { + assert_eq!(d.block_dim, BlockDimension::BlockIsSecondDim); + } + _ => panic!("Expected LayerSeparate variant"), + } + } +} diff --git a/lib/llm/src/block_manager/v2/physical/layout/tests.rs 
b/lib/llm/src/block_manager/v2/physical/layout/tests.rs new file mode 100644 index 0000000000..f0c763a177 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/layout/tests.rs @@ -0,0 +1,367 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Integration tests for layout serialization. +//! +//! These tests verify the complete serialization and deserialization flow, +//! ensuring that layouts can be transmitted to remote nodes and reconstructed +//! with all necessary metadata intact. + +use crate::block_manager::v2::memory::{ + MemoryRegion, NixlDescriptor, OwnedMemoryRegion, StorageKind, +}; +use crate::block_manager::v2::physical::layout::physical::PhysicalLayout; +use crate::block_manager::v2::physical::layout::{BlockDimension, LayoutConfig, LayoutDescriptor}; +use crate::block_manager::v2::physical::transfer::nixl_agent::NixlAgent; +use std::any::Any; +use std::sync::Arc; + +// Simple mock implementation for testing +#[derive(Debug)] +pub struct MockMemory { + addr: usize, + size: usize, +} + +impl MockMemory { + pub fn new(addr: usize, size: usize) -> Arc { + Arc::new(Self { addr, size }) + } +} + +impl MemoryRegion for MockMemory { + fn addr(&self) -> usize { + self.addr + } + fn size(&self) -> usize { + self.size + } + fn storage_kind(&self) -> StorageKind { + StorageKind::System + } + fn as_any(&self) -> &dyn Any { + self + } +} + +/// Mock memory region for testing serialization +#[derive(Debug)] +struct TestMemoryRegion { + addr: usize, + size: usize, + kind: StorageKind, + descriptor: NixlDescriptor, +} + +impl TestMemoryRegion { + fn new(addr: usize, size: usize, kind: StorageKind) -> Arc { + Arc::new(Self { + addr, + size, + kind, + descriptor: NixlDescriptor { + addr: addr as u64, + size, + mem_type: nixl_sys::MemType::Dram, + device_id: 0, + }, + }) + } +} + +impl MemoryRegion for TestMemoryRegion { + fn addr(&self) -> usize { + self.addr + } + + fn size(&self) -> usize { + self.size + } + + fn storage_kind(&self) -> StorageKind { + self.kind + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn nixl_descriptor(&self) -> Option { + Some(self.descriptor.clone()) + } +} + +fn make_test_config() -> LayoutConfig { + LayoutConfig::builder() + .num_blocks(10) + .num_layers(4) + .outer_dim(2) + .page_size(16) + .inner_dim(128) + .dtype_width_bytes(2) + .build() + .unwrap() +} + +#[test] +fn test_fully_contiguous_layout_serialization_roundtrip() { + let agent = NixlAgent::require_backends("test-fc-serialize", &[]) + .expect("failed to create wrapped agent"); + let config = make_test_config(); + + // Calculate required size + let required_size = config.num_blocks + * config.num_layers + * config.outer_dim + * config.page_size + * config.inner_dim + * config.dtype_width_bytes; + + // Create test memory region + let memory = TestMemoryRegion::new(0x10000, required_size, StorageKind::System); + let regions = vec![memory as OwnedMemoryRegion]; + + // Build physical layout + let original_layout = PhysicalLayout::builder(agent) + .with_config(config.clone()) + .fully_contiguous() + .with_registered_regions(regions) + .expect("failed to provide regions") + .build() + .expect("failed to build layout"); + + // Serialize to LayoutDescriptor + let serialized = original_layout + .to_descriptor() + .expect("failed to serialize layout"); + + // Verify serialized data + assert_eq!(serialized.version, LayoutDescriptor::CURRENT_VERSION); + assert_eq!(serialized.layout_config, config); + 
assert_eq!(serialized.location, StorageKind::System); + assert_eq!(serialized.memory_descriptors.len(), 1); + assert_eq!(serialized.memory_descriptors[0].addr, 0x10000); + assert_eq!(serialized.memory_descriptors[0].size, required_size); + + // Serialize to JSON + let json = serialized.to_json().expect("failed to serialize to JSON"); + assert!(json.contains("\"version\":1")); + assert!(json.contains("\"num_blocks\":10")); + + // Deserialize from JSON + let deserialized = LayoutDescriptor::from_json(&json).expect("failed to deserialize from JSON"); + + // Verify deserialized matches original + assert_eq!(deserialized.version, serialized.version); + assert_eq!(deserialized.layout_config, serialized.layout_config); + assert_eq!(deserialized.location, serialized.location); + assert_eq!( + deserialized.memory_descriptors.len(), + serialized.memory_descriptors.len() + ); + + // Reconstruct layout from serialized data + let reconstructed = + PhysicalLayout::from_descriptor(deserialized).expect("failed to reconstruct layout"); + + // Verify reconstructed layout has same configuration + assert_eq!(reconstructed.layout().config(), &config); + assert_eq!(reconstructed.location(), StorageKind::System); + assert_eq!(reconstructed.layout().num_blocks(), 10); + assert_eq!(reconstructed.layout().num_layers(), 4); + assert!(reconstructed.layout().is_fully_contiguous()); +} + +#[test] +fn test_layer_separate_layout_serialization_roundtrip() { + let agent = NixlAgent::require_backends("test-ls-serialize", &[]) + .expect("failed to create wrapped agent"); + let config = make_test_config(); + + // Calculate per-layer size + let per_layer_size = config.num_blocks + * config.outer_dim + * config.page_size + * config.inner_dim + * config.dtype_width_bytes; + + // Create memory regions (one per layer) + let regions: Vec = (0..config.num_layers) + .map(|i| { + TestMemoryRegion::new( + 0x10000 + i * per_layer_size, + per_layer_size, + StorageKind::System, + ) as OwnedMemoryRegion + }) + .collect(); + + // Build physical layout + let original_layout = PhysicalLayout::builder(agent) + .with_config(config.clone()) + .layer_separate(BlockDimension::BlockIsFirstDim) + .with_registered_regions(regions) + .expect("failed to provide regions") + .build() + .expect("failed to build layout"); + + // Serialize to LayoutDescriptor + let serialized = original_layout + .to_descriptor() + .expect("failed to serialize layout"); + + // Verify serialized data + assert_eq!(serialized.version, LayoutDescriptor::CURRENT_VERSION); + assert_eq!(serialized.layout_config, config); + assert_eq!(serialized.memory_descriptors.len(), 4); // One per layer + + // Verify memory descriptors + for (i, desc) in serialized.memory_descriptors.iter().enumerate() { + assert_eq!(desc.addr, 0x10000 + i * per_layer_size); + assert_eq!(desc.size, per_layer_size); + } + + // Serialize to JSON bytes + let json_bytes = serialized + .to_json_bytes() + .expect("failed to serialize to JSON bytes"); + + // Deserialize from JSON bytes + let deserialized = LayoutDescriptor::from_json_bytes(&json_bytes) + .expect("failed to deserialize from JSON bytes"); + + // Verify deserialized matches original + assert_eq!(deserialized.version, serialized.version); + assert_eq!(deserialized.layout_config, serialized.layout_config); + assert_eq!( + deserialized.memory_descriptors.len(), + serialized.memory_descriptors.len() + ); + + // Reconstruct layout from serialized data + let reconstructed = + PhysicalLayout::from_descriptor(deserialized).expect("failed to reconstruct layout"); 
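The size arithmetic used throughout these tests comes straight from `required_allocations`: a fully contiguous layout needs one buffer covering the product of every dimension, while a layer-separate layout needs `num_layers` buffers that each drop the `num_layers` factor. A small stand-alone check of that relationship, using the same dimensions as `make_test_config`:

```rust
// Sketch of the allocation-size arithmetic the tests above rely on.
#[derive(Clone, Copy)]
struct Dims {
    num_blocks: usize,
    num_layers: usize,
    outer_dim: usize,
    page_size: usize,
    inner_dim: usize,
    dtype_width_bytes: usize,
}

fn fully_contiguous_bytes(d: Dims) -> usize {
    d.num_blocks * d.num_layers * d.outer_dim * d.page_size * d.inner_dim * d.dtype_width_bytes
}

fn layer_separate_bytes(d: Dims) -> Vec<usize> {
    let per_layer = d.num_blocks * d.outer_dim * d.page_size * d.inner_dim * d.dtype_width_bytes;
    vec![per_layer; d.num_layers]
}

fn main() {
    let d = Dims {
        num_blocks: 10,
        num_layers: 4,
        outer_dim: 2,
        page_size: 16,
        inner_dim: 128,
        dtype_width_bytes: 2,
    };
    let total = fully_contiguous_bytes(d);
    let per_layer = layer_separate_bytes(d);
    // Both strategies cover the same number of bytes overall.
    assert_eq!(total, per_layer.iter().sum::<usize>());
    println!("total = {total} bytes, per layer = {} bytes", per_layer[0]);
}
```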
+ + // Verify reconstructed layout has same configuration + assert_eq!(reconstructed.layout().config(), &config); + assert_eq!(reconstructed.location(), StorageKind::System); + assert_eq!(reconstructed.layout().num_blocks(), 10); + assert_eq!(reconstructed.layout().num_layers(), 4); + assert!(!reconstructed.layout().is_fully_contiguous()); +} + +#[test] +fn test_memory_region_calculation_after_deserialization() { + let agent = NixlAgent::require_backends("test-memory-calc", &[]) + .expect("failed to create wrapped agent"); + let config = LayoutConfig::builder() + .num_blocks(2) + .num_layers(2) + .outer_dim(2) + .page_size(4) + .inner_dim(8) + .dtype_width_bytes(2) + .build() + .unwrap(); + + let required_size = config.num_blocks + * config.num_layers + * config.outer_dim + * config.page_size + * config.inner_dim + * config.dtype_width_bytes; + + let memory = TestMemoryRegion::new(0x1000, required_size, StorageKind::System); + let regions = vec![memory as OwnedMemoryRegion]; + + let original_layout = PhysicalLayout::builder(agent) + .with_config(config.clone()) + .fully_contiguous() + .with_registered_regions(regions) + .expect("failed to provide regions") + .build() + .expect("failed to build layout"); + + // Serialize and deserialize + let serialized = original_layout + .to_descriptor() + .expect("failed to serialize"); + let reconstructed = PhysicalLayout::from_descriptor(serialized).expect("failed to reconstruct"); + + // Verify memory region calculations + let region = reconstructed + .memory_region(0, 0, 0) + .expect("failed to get memory region"); + assert_eq!(region.addr, 0x1000); + + let region_size = config.page_size * config.inner_dim * config.dtype_width_bytes; + assert_eq!(region.size, region_size); + + // Test different block/layer/outer indices + let region = reconstructed + .memory_region(1, 1, 1) + .expect("failed to get memory region"); + // Address should be: base + block_stride + layer_stride + outer_stride + let layer_stride = config.outer_dim * region_size; + let block_stride = config.num_layers * layer_stride; + let expected_addr = 0x1000 + block_stride + layer_stride + region_size; + assert_eq!(region.addr, expected_addr); +} + +#[test] +fn test_version_check_on_deserialization() { + let config = make_test_config(); + + // Calculate required size for fully contiguous layout + let required_size = config.num_blocks + * config.num_layers + * config.outer_dim + * config.page_size + * config.inner_dim + * config.dtype_width_bytes; + + let mut serialized = LayoutDescriptor { + version: 999, // Future version + layout_config: config.clone(), + location: StorageKind::System, + nixl_metadata: crate::block_manager::v2::physical::layout::physical::NixlMetadata::new( + "test".to_string(), + nixl_sys::MemType::Dram, + 0, + ), + memory_descriptors: vec![], + layout_type_details: + crate::block_manager::v2::physical::layout::LayoutTypeDetails::FullyContiguous( + crate::block_manager::v2::physical::layout::FullyContiguousDetails { + block_format: + crate::block_manager::v2::physical::layout::BlockFormat::Operational, + }, + ), + }; + + // Should fail with unsupported version + let result = PhysicalLayout::from_descriptor(serialized.clone()); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Unsupported serialization version") + ); + + // Should succeed with supported version + serialized.version = LayoutDescriptor::CURRENT_VERSION; + serialized.memory_descriptors = vec![crate::block_manager::v2::memory::MemoryDescriptor::new( + 0x1000, + 
required_size, + )]; + let result = PhysicalLayout::from_descriptor(serialized); + if let Err(ref e) = result { + eprintln!("Error during deserialization: {}", e); + } + assert!( + result.is_ok(), + "Expected successful deserialization, got error: {:?}", + result.err() + ); +} diff --git a/lib/llm/src/block_manager/v2/physical/layout/validation.rs b/lib/llm/src/block_manager/v2/physical/layout/validation.rs new file mode 100644 index 0000000000..126c7299f1 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/layout/validation.rs @@ -0,0 +1,122 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Tensor validation utilities for layout creation. + +use anyhow::{Result, anyhow}; +use std::sync::Arc; + +use crate::block_manager::v2::memory::TorchTensor; + +/// Format of tensor layout (for future TP translation). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TensorFormat { + /// NHD format: [N, H, D] where N=block_size, H=heads, D=hidden + NHD, + /// HND format: [H, N, D] where H=heads, N=block_size, D=hidden + HND, + /// Unknown or ambiguous format + Unknown, +} + +/// Validate tensor strides and detect format. +/// +/// This function checks that tensor strides are monotonically decreasing, +/// which ensures tensor-contiguous layout. The stride validation is flexible +/// at the inner dimension boundary to accommodate different layouts. +/// +/// Additionally, it attempts to detect whether the layout is NHD or HND format, +/// which is important for future tensor parallel (TP) translation. +/// +/// # Arguments +/// * `tensors` - Slice of tensors to validate +/// +/// # Returns +/// The detected tensor format (NHD, HND, or Unknown) +pub fn validate_tensor_strides(tensors: &[Arc]) -> Result { + if tensors.is_empty() { + return Err(anyhow!("Cannot validate empty tensor list")); + } + + let mut format = TensorFormat::Unknown; + + for tensor in tensors { + let stride = tensor.stride(); + let shape = tensor.shape(); + + if stride.len() < 2 { + return Err(anyhow!( + "Tensor must have at least 2 dimensions, got stride: {:?}", + stride + )); + } + + // Check monotonic decreasing stride + // Note: We're flexible at the combined inner dimension boundary as per requirements + let mut prev_stride = usize::MAX; + for (i, ¤t_stride) in stride.iter().enumerate() { + if current_stride > prev_stride { + return Err(anyhow!( + "Tensor strides must be monotonically decreasing (until inner dimension). \ + Got stride: {:?} at position {}", + stride, + i + )); + } + prev_stride = current_stride; + } + + // Attempt to detect NHD vs HND format based on shape and stride patterns + // This is a heuristic and may need refinement based on actual usage + if shape.len() >= 3 { + // If the first dimension stride is smaller than the second, likely HND + // If the first dimension stride is larger than the second, likely NHD + if stride[0] < stride[1] { + format = TensorFormat::HND; + } else if stride[0] > stride[1] { + format = TensorFormat::NHD; + } + } + } + + Ok(format) +} + +/// Validate that all tensors have consistent shapes. +/// +/// # Arguments +/// * `tensors` - Slice of tensors to validate +/// +/// # Returns +/// The common shape shared by all tensors +pub fn validate_tensor_shapes(tensors: &[Arc]) -> Result> { + if tensors.is_empty() { + return Err(anyhow!("Cannot validate empty tensor list")); + } + + let first_shape = tensors[0].shape(); + + for tensor in &tensors[1..] 
{ + if tensor.shape() != first_shape { + return Err(anyhow!( + "All tensors must have the same shape. Expected {:?}, got {:?}", + first_shape, + tensor.shape() + )); + } + } + + Ok(first_shape) +} + +#[allow(dead_code)] +pub fn determine_compressed_shape(shape: &[usize]) -> usize { + shape.iter().product() +} + +#[cfg(test)] +mod tests { + + // Note: These tests would require mock TorchTensor implementations + // which we can add if needed for testing infrastructure +} diff --git a/lib/llm/src/block_manager/v2/physical/manager/handle.rs b/lib/llm/src/block_manager/v2/physical/manager/handle.rs new file mode 100644 index 0000000000..25bd013227 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/manager/handle.rs @@ -0,0 +1,115 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Layout handle type encoding worker ID and layout ID. + +use bincode::{Decode, Encode}; + +/// Unique handle for a layout combining worker_id and layout_id. +/// +/// The handle encodes: +/// - Bits 0-63: worker_id (u64) +/// - Bits 64-79: layout_id (u16) +/// - Bits 80-127: Reserved (48 bits, currently unused) +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Encode, Decode)] +pub struct LayoutHandle(u128); + +impl LayoutHandle { + /// Create a new layout handle from worker_id and layout_id. + /// + /// # Arguments + /// * `worker_id` - Unique identifier for the worker (0-63 bits) + /// * `layout_id` - Layout identifier within the worker (64-79 bits) + pub fn new(worker_id: u64, layout_id: u16) -> Self { + let handle = (worker_id as u128) | ((layout_id as u128) << 64); + Self(handle) + } + + /// Extract the worker_id from this handle. + pub fn worker_id(&self) -> u64 { + (self.0 & 0xFFFF_FFFF_FFFF_FFFF) as u64 + } + + /// Extract the layout_id from this handle. + pub fn layout_id(&self) -> u16 { + ((self.0 >> 64) & 0xFFFF) as u16 + } + + /// Get the raw u128 value. + pub fn as_u128(&self) -> u128 { + self.0 + } + + /// Create a handle from a raw u128 value. 
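The stride rules in `validate_tensor_strides` reduce to two ideas: strides must not increase from the outermost dimension inwards, and the relative size of the first two strides is used as a heuristic for NHD versus HND. The stand-alone sketch below mirrors those checks with plain slices instead of the crate's `TorchTensor` trait; the example strides are assumptions.

```rust
// Pure-std stand-in for the stride validation: reject increasing strides, then guess
// the format from the first two strides, in the same order as the original checks.
#[derive(Debug, PartialEq)]
enum Format {
    Nhd,
    Hnd,
    Unknown,
}

fn check_strides(stride: &[usize]) -> Result<Format, String> {
    if stride.len() < 2 {
        return Err(format!("need at least 2 dimensions, got stride {stride:?}"));
    }
    if stride.windows(2).any(|w| w[1] > w[0]) {
        return Err(format!("strides must be non-increasing, got {stride:?}"));
    }
    Ok(if stride.len() >= 3 {
        match stride[0].cmp(&stride[1]) {
            std::cmp::Ordering::Greater => Format::Nhd,
            std::cmp::Ordering::Less => Format::Hnd,
            std::cmp::Ordering::Equal => Format::Unknown,
        }
    } else {
        Format::Unknown
    })
}

fn main() {
    // Row-major [N=16, H=8, D=128] tensor: stride[0] > stride[1], so the heuristic says NHD.
    assert_eq!(check_strides(&[8 * 128, 128, 1]).unwrap(), Format::Nhd);
    // Increasing strides are rejected before any format detection runs.
    assert!(check_strides(&[1, 128, 8 * 128]).is_err());
}
```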
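The bit layout documented on `LayoutHandle` (worker_id in bits 0-63, layout_id in bits 64-79, upper 48 bits reserved) can be checked with plain integer arithmetic. This sketch mirrors `new`, `worker_id`, and `layout_id` outside the type:

```rust
// Standalone check of the documented handle bit layout.
fn pack(worker_id: u64, layout_id: u16) -> u128 {
    (worker_id as u128) | ((layout_id as u128) << 64)
}

fn worker_id(handle: u128) -> u64 {
    (handle & 0xFFFF_FFFF_FFFF_FFFF) as u64
}

fn layout_id(handle: u128) -> u16 {
    ((handle >> 64) & 0xFFFF) as u16
}

fn main() {
    let h = pack(0x1234_5678_9ABC_DEF0, 0x4242);
    assert_eq!(worker_id(h), 0x1234_5678_9ABC_DEF0);
    assert_eq!(layout_id(h), 0x4242);
    assert_eq!(h >> 80, 0, "reserved bits stay zero");
    println!("handle = {h:#034x}");
}
```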
+ pub fn from_u128(value: u128) -> Self { + Self(value) + } +} + +impl std::fmt::Display for LayoutHandle { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "LayoutHandle(worker={}, layout={})", + self.worker_id(), + self.layout_id() + ) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_handle_encoding() { + let worker_id = 0x1234_5678_9ABC_DEF0u64; + let layout_id = 0x4242u16; + + let handle = LayoutHandle::new(worker_id, layout_id); + + assert_eq!(handle.worker_id(), worker_id); + assert_eq!(handle.layout_id(), layout_id); + } + + #[test] + fn test_handle_roundtrip() { + let handle = LayoutHandle::new(42, 100); + let raw = handle.as_u128(); + let restored = LayoutHandle::from_u128(raw); + + assert_eq!(handle, restored); + assert_eq!(restored.worker_id(), 42); + assert_eq!(restored.layout_id(), 100); + } + + #[test] + fn test_handle_max_values() { + let max_worker = u64::MAX; + let max_layout = u16::MAX; + + let handle = LayoutHandle::new(max_worker, max_layout); + + assert_eq!(handle.worker_id(), max_worker); + assert_eq!(handle.layout_id(), max_layout); + } + + #[test] + fn test_handle_bincode_roundtrip() { + let handle = LayoutHandle::new(999, 42); + + let encoded = bincode::encode_to_vec(handle, bincode::config::standard()).unwrap(); + let (decoded, _): (LayoutHandle, _) = + bincode::decode_from_slice(&encoded, bincode::config::standard()).unwrap(); + + assert_eq!(handle, decoded); + } + + #[test] + fn test_handle_display() { + let handle = LayoutHandle::new(123, 456); + let display = format!("{}", handle); + assert!(display.contains("123")); + assert!(display.contains("456")); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/manager/local.rs b/lib/llm/src/block_manager/v2/physical/manager/local.rs new file mode 100644 index 0000000000..8157b3671c --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/manager/local.rs @@ -0,0 +1,119 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Local layout wrapper with handle and metadata. + +use std::ops::Deref; + +use super::handle::LayoutHandle; +use crate::block_manager::v2::physical::layout::PhysicalLayout; + +/// A local physical layout with an assigned handle. +/// +/// This wraps a `PhysicalLayout` that exists on the local worker, +/// associating it with a unique handle that combines the worker_id +/// and a locally-assigned layout_id. +/// +/// This type is cheap to clone as `PhysicalLayout` contains `Arc` internally. +#[derive(Debug, Clone)] +pub struct LocalLayout { + handle: LayoutHandle, + layout: PhysicalLayout, +} + +#[allow(dead_code)] +impl LocalLayout { + /// Create a new local layout. + /// + /// # Arguments + /// * `handle` - Unique handle for this layout + /// * `layout` - The physical layout + pub fn new(handle: LayoutHandle, layout: PhysicalLayout) -> Self { + Self { handle, layout } + } + + /// Get the handle for this layout. + pub fn handle(&self) -> LayoutHandle { + self.handle + } + + /// Get a reference to the physical layout. + pub fn layout(&self) -> &PhysicalLayout { + &self.layout + } + + /// Get the worker_id from the handle. + pub fn worker_id(&self) -> u64 { + self.handle.worker_id() + } + + /// Get the layout_id from the handle. + pub fn layout_id(&self) -> u16 { + self.handle.layout_id() + } + + /// Consume this local layout and return the physical layout. 
+ pub fn into_layout(self) -> PhysicalLayout { + self.layout + } +} + +impl Deref for LocalLayout { + type Target = PhysicalLayout; + + fn deref(&self) -> &Self::Target { + &self.layout + } +} + +#[cfg(all(test, feature = "testing-nixl"))] +mod tests { + use super::*; + use crate::block_manager::v2::physical::layout::{LayoutConfig, PhysicalLayout}; + use crate::block_manager::v2::physical::transfer::nixl_agent::NixlAgent; + + fn create_test_agent(name: &str) -> NixlAgent { + NixlAgent::require_backends(name, &[]).expect("failed to create wrapped agent") + } + + fn make_test_layout() -> PhysicalLayout { + let agent = create_test_agent("test-local"); + let config = LayoutConfig::builder() + .num_blocks(2) + .num_layers(2) + .outer_dim(2) + .page_size(4) + .inner_dim(8) + .dtype_width_bytes(2) + .build() + .unwrap(); + + PhysicalLayout::builder(agent) + .with_config(config) + .fully_contiguous() + .allocate_system() + .build() + .unwrap() + } + + #[test] + fn test_local_layout_creation() { + let handle = LayoutHandle::new(42, 100); + let layout = make_test_layout(); + let local = LocalLayout::new(handle, layout); + + assert_eq!(local.handle(), handle); + assert_eq!(local.worker_id(), 42); + assert_eq!(local.layout_id(), 100); + } + + #[test] + fn test_local_layout_into_layout() { + let handle = LayoutHandle::new(1, 2); + let layout = make_test_layout(); + let local = LocalLayout::new(handle, layout); + + let _recovered = local.into_layout(); + // Successfully consumed and returned the layout + } +} diff --git a/lib/llm/src/block_manager/v2/physical/manager/metadata.rs b/lib/llm/src/block_manager/v2/physical/manager/metadata.rs new file mode 100644 index 0000000000..a64144e71f --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/manager/metadata.rs @@ -0,0 +1,239 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Serialization types for exporting/importing layout metadata with NIXL integration. + +use super::handle::LayoutHandle; +use crate::block_manager::v2::physical::layout::LayoutDescriptor; +use anyhow::Result; +use bincode::{Decode, Encode}; +use bytes::Bytes; + +/// Worker identification combining worker_id and NIXL agent name. +#[derive(Debug, Clone, Encode, Decode, PartialEq, Eq)] +pub struct WorkerAddress { + /// Unique identifier for this worker + pub worker_id: u64, + /// NIXL agent name on this worker + pub nixl_agent_name: String, +} + +impl WorkerAddress { + /// Create a new worker address. + pub fn new(worker_id: u64, nixl_agent_name: String) -> Self { + Self { + worker_id, + nixl_agent_name, + } + } +} + +/// Local layout descriptor with its assigned handle from the TransportManager. +#[derive(Debug, Clone, Encode, Decode)] +pub struct LocalLayoutDescriptor { + /// Unique handle for this layout + pub handle: LayoutHandle, + /// Serialized layout data (uses Serde, bridged via bincode) + #[bincode(with_serde)] + pub layout: LayoutDescriptor, +} + +impl LocalLayoutDescriptor { + /// Create a new serialized layout with handle. + pub fn new(handle: LayoutHandle, layout: LayoutDescriptor) -> Self { + Self { handle, layout } + } +} + +/// The set of [`LocalLayoutDescriptor`] that are RDMA enabled. This object packages the detail +/// about the layouts and the NIXL RDMA metadata required to reconstruct the layouts and access +/// the memory via NIXL RDMA. 
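The metadata types that follow mix the two serialization worlds: the outer structs derive bincode's `Encode`/`Decode`, while the serde-only `LayoutDescriptor` field rides along via `#[bincode(with_serde)]`. A minimal, self-contained version of that bridging pattern, assuming bincode 2.x with its `serde` and `derive` features enabled (which the lockfile suggests):

```rust
use bincode::{Decode, Encode};
use serde::{Deserialize, Serialize};

// A serde-only payload, standing in for a type like LayoutDescriptor.
#[derive(Debug, PartialEq, Serialize, Deserialize)]
struct SerdeOnlyPayload {
    name: String,
    size: usize,
}

// The outer type uses bincode derives and bridges the serde field with with_serde.
#[derive(Debug, PartialEq, Encode, Decode)]
struct Wrapper {
    id: u64,
    #[bincode(with_serde)]
    payload: SerdeOnlyPayload,
}

fn main() {
    let original = Wrapper {
        id: 7,
        payload: SerdeOnlyPayload { name: "layer".into(), size: 4096 },
    };
    let bytes = bincode::encode_to_vec(&original, bincode::config::standard())
        .expect("encode should succeed");
    let (decoded, _len): (Wrapper, usize) =
        bincode::decode_from_slice(&bytes, bincode::config::standard())
            .expect("decode should succeed");
    assert_eq!(original, decoded);
}
```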
+#[derive(Debug, Encode, Decode)] +pub struct RdmaLayoutDescriptors { + /// Worker identification + pub worker_address: WorkerAddress, + /// Exported NIXL metadata from nixl_sys::Agent::get_local_md() + pub nixl_metadata: Vec, + /// Serialized layouts (handle + layout data) + pub layouts: Vec, +} + +/// Managed memory metadata package for export/import. +/// +/// This is the wire format for transmitting layout metadata between workers. +/// It contains everything needed to reconstruct remote layouts and load their +/// NIXL registration data. +pub struct SerializedLayout(Bytes); + +impl SerializedLayout { + /// Pack metadata into a serialized form. + /// + /// # Arguments + /// * `worker_address` - Worker identification + /// * `nixl_metadata` - NIXL metadata blob from get_local_md() + /// * `layouts` - Vector of layouts with handles to export + /// + /// # Returns + /// Packed metadata ready for transmission + pub fn pack( + worker_address: WorkerAddress, + nixl_metadata: Vec, + layouts: Vec, + ) -> Result { + let inner = RdmaLayoutDescriptors { + worker_address, + nixl_metadata, + layouts, + }; + let bytes = bincode::encode_to_vec(&inner, bincode::config::standard()) + .map_err(|e| anyhow::anyhow!("failed to encode managed memory metadata: {}", e))?; + Ok(Self(Bytes::from(bytes))) + } + + /// Unpack metadata from serialized form. + /// + /// # Returns + /// Unpacked metadata structure + pub fn unpack(&self) -> Result { + let (inner, _) = bincode::decode_from_slice(&self.0, bincode::config::standard()) + .map_err(|e| anyhow::anyhow!("failed to decode managed memory metadata: {}", e))?; + Ok(inner) + } + + /// Get the raw bytes. + pub fn as_bytes(&self) -> &Bytes { + &self.0 + } + + /// Create from raw bytes. + pub fn from_bytes(bytes: Bytes) -> Self { + Self(bytes) + } + + /// Get the size in bytes. + pub fn len(&self) -> usize { + self.0.len() + } + + /// Check if empty. 
+ pub fn is_empty(&self) -> bool { + self.0.is_empty() + } +} + +impl std::fmt::Debug for SerializedLayout { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SerializedLayout") + .field("size_bytes", &self.len()) + .finish() + } +} + +#[cfg(all(test, feature = "testing-nixl"))] +mod tests { + use super::*; + use crate::block_manager::v2::memory::{MemoryDescriptor, StorageKind}; + use crate::block_manager::v2::physical::layout::{ + BlockFormat, FullyContiguousDetails, LayoutConfig, LayoutDescriptor, LayoutTypeDetails, + NixlMetadata, + }; + + fn make_test_serialized_layout() -> LayoutDescriptor { + let config = LayoutConfig::builder() + .num_blocks(2) + .num_layers(2) + .outer_dim(2) + .page_size(4) + .inner_dim(8) + .dtype_width_bytes(2) + .build() + .unwrap(); + + LayoutDescriptor { + version: 1, + layout_config: config, + location: StorageKind::System, + nixl_metadata: NixlMetadata::new("test".to_string(), nixl_sys::MemType::Dram, 0), + memory_descriptors: vec![MemoryDescriptor::new(0x1000, 4096)], + layout_type_details: LayoutTypeDetails::FullyContiguous(FullyContiguousDetails { + block_format: BlockFormat::Operational, + }), + } + } + + #[test] + fn test_worker_address() { + let addr = WorkerAddress::new(42, "test_agent".to_string()); + assert_eq!(addr.worker_id, 42); + assert_eq!(addr.nixl_agent_name, "test_agent"); + } + + #[test] + fn test_serialized_layout_with_handle() { + let handle = LayoutHandle::new(1, 2); + let layout = make_test_serialized_layout(); + let with_handle = LocalLayoutDescriptor::new(handle, layout); + + assert_eq!(with_handle.handle, handle); + } + + #[test] + fn test_metadata_pack_unpack() { + let worker_address = WorkerAddress::new(100, "worker_100".to_string()); + let nixl_metadata = vec![1, 2, 3, 4, 5]; + let layouts = vec![LocalLayoutDescriptor::new( + LayoutHandle::new(100, 1), + make_test_serialized_layout(), + )]; + + let packed = + SerializedLayout::pack(worker_address.clone(), nixl_metadata.clone(), layouts).unwrap(); + + assert!(!packed.is_empty()); + assert!(!packed.is_empty()); + + let unpacked = packed.unpack().unwrap(); + + assert_eq!(unpacked.worker_address, worker_address); + assert_eq!(unpacked.nixl_metadata, nixl_metadata); + assert_eq!(unpacked.layouts.len(), 1); + assert_eq!(unpacked.layouts[0].handle.worker_id(), 100); + assert_eq!(unpacked.layouts[0].handle.layout_id(), 1); + } + + #[test] + fn test_metadata_multiple_layouts() { + let worker_address = WorkerAddress::new(200, "worker_200".to_string()); + let nixl_metadata = vec![10, 20, 30]; + let layouts = vec![ + LocalLayoutDescriptor::new(LayoutHandle::new(200, 1), make_test_serialized_layout()), + LocalLayoutDescriptor::new(LayoutHandle::new(200, 2), make_test_serialized_layout()), + LocalLayoutDescriptor::new(LayoutHandle::new(200, 3), make_test_serialized_layout()), + ]; + + let packed = + SerializedLayout::pack(worker_address, nixl_metadata, layouts.clone()).unwrap(); + let unpacked = packed.unpack().unwrap(); + + assert_eq!(unpacked.layouts.len(), 3); + for (i, layout) in unpacked.layouts.iter().enumerate() { + assert_eq!(layout.handle.worker_id(), 200); + assert_eq!(layout.handle.layout_id(), (i + 1) as u16); + } + } + + #[test] + fn test_metadata_from_bytes() { + let worker_address = WorkerAddress::new(42, "test".to_string()); + let nixl_metadata = vec![1, 2, 3]; + let layouts = vec![]; + + let packed = SerializedLayout::pack(worker_address, nixl_metadata, layouts).unwrap(); + let bytes = packed.as_bytes().clone(); + + let restored = 
SerializedLayout::from_bytes(bytes); + let unpacked = restored.unpack().unwrap(); + + assert_eq!(unpacked.worker_address.worker_id, 42); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/manager/mod.rs b/lib/llm/src/block_manager/v2/physical/manager/mod.rs new file mode 100644 index 0000000000..b5e8cdd6c5 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/manager/mod.rs @@ -0,0 +1,627 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Transport manager for local and remote physical layouts with transfer execution. + +mod handle; +mod local; +mod metadata; +mod remote; + +pub use handle::LayoutHandle; +pub use metadata::{SerializedLayout, WorkerAddress}; + +pub(crate) use local::LocalLayout; +pub(crate) use metadata::LocalLayoutDescriptor; +pub(crate) use remote::RemoteLayout; + +use crate::block_manager::v2::memory::StorageKind; +use crate::block_manager::v2::physical::layout::PhysicalLayout; +use crate::block_manager::v2::physical::transfer::TransferContext; +use crate::block_manager::v2::physical::transfer::context::TransferCompleteNotification; +use crate::block_manager::v2::physical::transfer::nixl_agent::NixlAgent; +use crate::block_manager::v2::physical::transfer::options::TransferOptions; +use anyhow::{Result, anyhow, bail}; +use std::collections::{HashMap, HashSet}; +use std::sync::atomic::{AtomicU16, Ordering}; +use std::sync::{Arc, RwLock}; + +/// Public entry point for layout and transfer management. +/// +/// TransportManager combines layout registration/metadata management with +/// transfer execution capabilities, providing a unified API for: +/// - Registering local layouts and obtaining handles +/// - Exporting/importing layout metadata for remote workers +/// - Executing transfers between layouts using handles +/// - Managing CUDA, NIXL, and other execution resources +#[derive(Clone)] +pub struct TransportManager { + registry: Arc>, + context: Arc, +} + +impl TransportManager { + /// Create a new TransportManager builder. + /// + /// The builder configures the worker ID, NIXL agent, CUDA device, + /// and other execution parameters before creating the manager. + /// + /// # Example + /// ```ignore + /// let manager = TransportManager::builder() + /// .worker_id(0) // NIXL agent name defaults to "worker-0" + /// .nixl_backend("ucx") // Optional: defaults to UCX from env + /// .cuda_device_id(0) + /// .build()?; + /// + /// // Or with custom agent name: + /// let manager = TransportManager::builder() + /// .worker_id(0) + /// .nixl_agent_name("custom-agent") + /// .build()?; + /// ``` + pub fn builder() -> crate::block_manager::v2::physical::transfer::context::TransferConfigBuilder + { + TransferContext::builder() + } + + /// Create a TransportManager from a built TransferContext. + /// + /// This is used internally by the builder to wrap the context + /// and create the associated registry. + pub(crate) fn from_context(context: TransferContext) -> Self { + let worker_id = context.worker_id(); + let nixl_agent = context.nixl_agent().clone(); + let registry = Arc::new(RwLock::new(LayoutRegistry::new(nixl_agent, worker_id))); + + Self { + registry, + context: Arc::new(context), + } + } + + // ===== Layout Registration and Metadata Management ===== + + /// Register a local physical layout and return a unique handle. + /// + /// This registers the layout with the embedded memory manager, assigning + /// it a unique handle that can be used for handle-based transfers. 
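The `TransportManager` described above is designed to be cheap to clone: the registry sits behind `Arc<RwLock<..>>` and the execution context behind a plain `Arc`, so clones share state. The sketch below reproduces just that shape with toy `Registry` and `Context` types; it illustrates the design choice, not the real API.

```rust
use std::collections::HashMap;
use std::sync::{Arc, RwLock};

// Toy stand-ins: shared mutable registry plus an immutable, shared execution context.
#[derive(Default)]
struct Registry {
    layouts: HashMap<u64, String>,
}

struct Context {
    worker_id: u64,
}

#[derive(Clone)]
struct Manager {
    registry: Arc<RwLock<Registry>>,
    context: Arc<Context>,
}

impl Manager {
    fn new(worker_id: u64) -> Self {
        Self {
            registry: Arc::new(RwLock::new(Registry::default())),
            context: Arc::new(Context { worker_id }),
        }
    }

    fn register(&self, id: u64, name: &str) {
        self.registry.write().unwrap().layouts.insert(id, name.to_string());
    }

    fn worker_id(&self) -> u64 {
        self.context.worker_id
    }
}

fn main() {
    let manager = Manager::new(0);
    let clone = manager.clone(); // clones share the same registry
    clone.register(1, "kv-cache");
    assert_eq!(manager.registry.read().unwrap().layouts.len(), 1);
    assert_eq!(manager.worker_id(), 0);
}
```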
+ /// + /// # Arguments + /// * `layout` - Physical layout to register + /// + /// # Returns + /// Unique handle for the registered layout + /// + /// # Errors + /// Returns an error if layout IDs are exhausted (u16::MAX reached) + pub fn register_layout(&self, layout: PhysicalLayout) -> Result { + self.registry.write().unwrap().register_local(layout) + } + + /// Export layout metadata for transmission to remote workers. + /// + /// This exports all registered local layouts along with NIXL metadata + /// needed for remote memory registration. + /// + /// # Returns + /// Packed metadata ready for transmission to remote workers + pub fn export_metadata(&self) -> Result { + self.registry.read().unwrap().export_metadata() + } + + /// Import remote layout metadata. + /// + /// This loads NIXL metadata and reconstructs physical layouts from a remote + /// worker's exported metadata. + /// + /// # Arguments + /// * `metadata` - Packed metadata from remote worker + /// + /// # Returns + /// Vector of handles for the imported remote layouts + /// + /// # Errors + /// Returns an error if the remote worker was already loaded or if metadata + /// loading/reconstruction fails + pub fn import_metadata(&self, metadata: SerializedLayout) -> Result> { + self.registry.write().unwrap().import_metadata(metadata) + } + + // ===== Handle-Based Transfer API ===== + + /// Transfer complete blocks between layouts using handles. + /// + /// This function copies entire blocks (all layers and outer dimensions) between + /// the source and destination layouts identified by their handles. The transfer + /// strategy (memcpy, CUDA, NIXL) is automatically selected based on storage locations. + /// + /// The lock on the registry is held only briefly during layout lookup, + /// then released before executing the actual transfer. + /// + /// # Arguments + /// * `src_handle` - Handle to source layout + /// * `src_blocks` - Source block IDs to transfer + /// * `dst_handle` - Handle to destination layout + /// * `dst_blocks` - Destination block IDs to transfer + /// + /// # Returns + /// A notification handle that can be awaited for transfer completion + /// + /// # Errors + /// Returns an error if: + /// - Either handle is invalid + /// - Block IDs are out of bounds + /// - Transfer execution fails + pub fn execute_transfer( + &self, + src_handle: LayoutHandle, + src_blocks: &[usize], + dst_handle: LayoutHandle, + dst_blocks: &[usize], + options: TransferOptions, + ) -> Result { + // Clone layouts inside the lock, then drop lock before transfer + let (src_layout, dst_layout) = { + let registry = self.registry.read().unwrap(); + let src = registry + .get_layout(src_handle) + .ok_or_else(|| anyhow!("invalid source handle: {}", src_handle))? + .clone(); // Cheap: just Arc refcount bump + let dst = registry + .get_layout(dst_handle) + .ok_or_else(|| anyhow!("invalid destination handle: {}", dst_handle))? + .clone(); + (src, dst) + }; // Lock released here + + // Execute transfer with no lock held + super::transfer::executor::execute_transfer( + &src_layout, + &dst_layout, + src_blocks, + dst_blocks, + options, + &self.context, + ) + } + + // ===== Query Methods ===== + + /// Get the worker ID for this manager. + pub fn worker_id(&self) -> u64 { + self.context.worker_id() + } + + /// Get handles for all locally registered layouts. + pub fn get_local_handles(&self) -> Vec { + self.registry.read().unwrap().local_handles() + } + + /// Get handles for all imported remote layouts. 
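The transfer path below deliberately scopes its registry lock: it clones the Arc-backed layouts inside a short read-lock block and releases the lock before any transfer work runs, so slow copies never block new registrations. A stand-alone sketch of that pattern, with a `Vec<u8>`-backed stand-in for the layout type:

```rust
use std::collections::HashMap;
use std::sync::{Arc, RwLock};

// Stand-in for the Arc-backed layout type; clones only bump a refcount.
type Layout = Arc<Vec<u8>>;

fn lookup_pair(
    registry: &RwLock<HashMap<u64, Layout>>,
    src: u64,
    dst: u64,
) -> Result<(Layout, Layout), String> {
    let guard = registry.read().unwrap();
    let src = guard
        .get(&src)
        .cloned()
        .ok_or_else(|| "invalid source handle".to_string())?;
    let dst = guard
        .get(&dst)
        .cloned()
        .ok_or_else(|| "invalid destination handle".to_string())?;
    Ok((src, dst))
} // read lock released here, before any transfer work starts

fn main() {
    let registry = RwLock::new(HashMap::from([
        (1u64, Arc::new(vec![0u8; 4])),
        (2u64, Arc::new(vec![0u8; 4])),
    ]));
    let (src, dst) = lookup_pair(&registry, 1, 2).unwrap();
    // ... a long-running copy between src and dst would happen here, lock-free ...
    assert_eq!(src.len(), dst.len());
}
```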
+    pub fn get_remote_handles(&self) -> Vec<LayoutHandle> {
+        self.registry.read().unwrap().remote_handles()
+    }
+
+    // ===== Internal Methods for Testing =====
+
+    /// Get the internal transfer context (for testing only).
+    pub fn context(&self) -> &Arc<TransferContext> {
+        &self.context
+    }
+
+    /// Get the H2D stream (for testing only).
+    #[cfg(all(test, feature = "testing-cuda"))]
+    pub(crate) fn h2d_stream(&self) -> &std::sync::Arc<cudarc::driver::CudaStream> {
+        self.context.h2d_stream()
+    }
+
+    /// Get the D2H stream (for testing only).
+    #[cfg(all(test, feature = "testing-cuda"))]
+    #[allow(dead_code)]
+    pub(crate) fn d2h_stream(&self) -> &std::sync::Arc<cudarc::driver::CudaStream> {
+        self.context.d2h_stream()
+    }
+
+    /// Get the CUDA context (for testing only).
+    #[cfg(all(test, feature = "testing-cuda"))]
+    pub(crate) fn cuda_context(&self) -> &std::sync::Arc<cudarc::driver::CudaContext> {
+        self.context.cuda_context()
+    }
+
+    /// Register a CUDA event for completion (for testing only).
+    #[cfg(all(test, feature = "testing-cuda"))]
+    pub(crate) fn register_cuda_event(
+        &self,
+        event: cudarc::driver::CudaEvent,
+    ) -> TransferCompleteNotification {
+        self.context.register_cuda_event(event)
+    }
+}
+
+/// Internal registry for local and remote physical layouts with NIXL integration.
+///
+/// The LayoutRegistry handles:
+/// - Registering local layouts with unique handles
+/// - Exporting local layout metadata for remote access
+/// - Importing remote layout metadata and reconstructing layouts
+/// - Managing NIXL metadata for RDMA operations
+#[derive(Debug)]
+pub(crate) struct LayoutRegistry {
+    /// NIXL agent for memory registration
+    nixl_agent: NixlAgent,
+    /// Worker ID for this manager
+    worker_id: u64,
+    /// Next layout ID to assign (monotonically increasing)
+    next_layout_id: AtomicU16,
+    /// Local layouts registered on this worker
+    local_layouts: HashMap<LayoutHandle, LocalLayout>,
+    /// Remote layouts imported from other workers
+    remote_layouts: HashMap<LayoutHandle, RemoteLayout>,
+    /// Set of loaded remote workers (agent_name, worker_id) to prevent duplicates
+    loaded_remotes: HashSet<(String, u64)>,
+}
+
+#[expect(dead_code)]
+impl LayoutRegistry {
+    /// Create a new layout registry.
+    ///
+    /// # Arguments
+    /// * `nixl_agent` - NIXL agent for memory registration
+    /// * `worker_id` - Unique identifier for this worker
+    pub(crate) fn new(nixl_agent: NixlAgent, worker_id: u64) -> Self {
+        Self {
+            nixl_agent,
+            worker_id,
+            next_layout_id: AtomicU16::new(0),
+            local_layouts: HashMap::new(),
+            remote_layouts: HashMap::new(),
+            loaded_remotes: HashSet::new(),
+        }
+    }
+
+    /// Register a local physical layout.
+    ///
+    /// # Arguments
+    /// * `layout` - Physical layout to register
+    ///
+    /// # Returns
+    /// Unique handle for the registered layout
+    ///
+    /// # Errors
+    /// Returns an error if layout IDs are exhausted (u16::MAX reached)
+    pub(crate) fn register_local(&mut self, layout: PhysicalLayout) -> Result<LayoutHandle> {
+        // Get next layout ID
+        let layout_id = self.next_layout_id.fetch_add(1, Ordering::SeqCst);
+        if layout_id == u16::MAX {
+            bail!("Layout ID overflow: maximum number of layouts (65535) reached");
+        }
+
+        // Create handle
+        let handle = LayoutHandle::new(self.worker_id, layout_id);
+
+        // Wrap in LocalLayout
+        let local_layout = LocalLayout::new(handle, layout);
+
+        // Store
+        self.local_layouts.insert(handle, local_layout);
+
+        Ok(handle)
+    }
+
+    /// Export local layout metadata for transmission to remote workers.
+ /// + /// This exports: + /// - NIXL agent metadata for remote memory registration + /// - All host and device layouts (disk layouts are excluded) + /// - Worker address information + /// + /// # Returns + /// Packed metadata ready for transmission + pub(crate) fn export_metadata(&self) -> Result { + // Get NIXL metadata from agent + let nixl_metadata = self + .nixl_agent + .get_local_md() + .map_err(|e| anyhow!("failed to get NIXL local metadata: {:?}", e))?; + + // Create worker address + let worker_address = WorkerAddress::new(self.worker_id, self.nixl_agent.name().to_string()); + + // Filter and serialize layouts (only host and device, skip disk) + let mut serialized_layouts = Vec::new(); + for (handle, local_layout) in &self.local_layouts { + let location = local_layout.layout().location(); + + // Only export host and device layouts + if matches!( + location, + StorageKind::System | StorageKind::Device(_) | StorageKind::Pinned + ) { + let serialized = local_layout + .layout() + .to_descriptor() + .map_err(|e| anyhow!("failed to serialize layout {}: {}", handle, e))?; + + serialized_layouts.push(LocalLayoutDescriptor::new(*handle, serialized)); + } + } + + // Pack into managed metadata + SerializedLayout::pack(worker_address, nixl_metadata, serialized_layouts) + } + + /// Import remote layout metadata. + /// + /// This: + /// - Validates the remote worker hasn't been loaded already + /// - Loads NIXL metadata into the agent + /// - Reconstructs physical layouts from serialized data + /// - Stores them as remote layouts + /// + /// # Arguments + /// * `metadata` - Packed metadata from remote worker + /// + /// # Returns + /// Vector of handles for the imported layouts + /// + /// # Errors + /// Returns an error if: + /// - The remote worker was already loaded + /// - NIXL metadata loading fails + /// - Agent name mismatch after loading + /// - Layout reconstruction fails + pub(crate) fn import_metadata( + &mut self, + metadata: SerializedLayout, + ) -> Result> { + // Unpack metadata + let inner = metadata.unpack()?; + + // Validate not already loaded + let remote_key = ( + inner.worker_address.nixl_agent_name.clone(), + inner.worker_address.worker_id, + ); + if self.loaded_remotes.contains(&remote_key) { + bail!( + "Remote worker already loaded: {} (worker_id={})", + remote_key.0, + remote_key.1 + ); + } + + // Load NIXL metadata + let returned_agent_name = self + .nixl_agent + .load_remote_md(&inner.nixl_metadata) + .map_err(|e| anyhow!("failed to load remote NIXL metadata: {:?}", e))?; + + // Verify agent name matches + if returned_agent_name != inner.worker_address.nixl_agent_name { + bail!( + "Agent name mismatch: expected '{}', got '{}'", + inner.worker_address.nixl_agent_name, + returned_agent_name + ); + } + + // Reconstruct layouts + let mut imported_handles = Vec::new(); + for serialized_with_handle in inner.layouts { + let handle = serialized_with_handle.handle; + let layout = PhysicalLayout::from_descriptor(serialized_with_handle.layout) + .map_err(|e| anyhow!("failed to reconstruct layout {}: {}", handle, e))?; + + let remote_layout = RemoteLayout::new(handle, layout); + self.remote_layouts.insert(handle, remote_layout); + imported_handles.push(handle); + } + + // Mark remote as loaded + self.loaded_remotes.insert(remote_key); + + Ok(imported_handles) + } + + /// Get a local layout by handle. + pub(crate) fn get_local(&self, handle: LayoutHandle) -> Option<&LocalLayout> { + self.local_layouts.get(&handle) + } + + /// Get a remote layout by handle. 
+ pub(crate) fn get_remote(&self, handle: LayoutHandle) -> Option<&RemoteLayout> { + self.remote_layouts.get(&handle) + } + + /// Get a layout by handle (either local or remote). + /// + /// # Returns + /// Returns a reference to the PhysicalLayout if found + pub(crate) fn get_layout(&self, handle: LayoutHandle) -> Option<&PhysicalLayout> { + self.local_layouts + .get(&handle) + .map(|l| l.layout()) + .or_else(|| self.remote_layouts.get(&handle).map(|r| r.layout())) + } + + /// Check if a handle refers to a local layout. + pub(crate) fn is_local(&self, handle: LayoutHandle) -> bool { + self.local_layouts.contains_key(&handle) + } + + /// Check if a handle refers to a remote layout. + pub(crate) fn is_remote(&self, handle: LayoutHandle) -> bool { + self.remote_layouts.contains_key(&handle) + } + + /// Get the number of local layouts. + pub(crate) fn local_count(&self) -> usize { + self.local_layouts.len() + } + + /// Get the number of remote layouts. + pub(crate) fn remote_count(&self) -> usize { + self.remote_layouts.len() + } + + /// Get the worker ID for this manager. + pub(crate) fn worker_id(&self) -> u64 { + self.worker_id + } + + /// Get all local layout handles. + pub(crate) fn local_handles(&self) -> Vec { + self.local_layouts.keys().copied().collect() + } + + /// Get all remote layout handles. + pub(crate) fn remote_handles(&self) -> Vec { + self.remote_layouts.keys().copied().collect() + } +} + +#[cfg(all(test, feature = "testing-nixl"))] +mod tests { + use super::*; + use crate::block_manager::v2::physical::layout::LayoutConfig; + use crate::block_manager::v2::physical::transfer::nixl_agent::NixlAgent; + + fn make_test_agent(name: &str) -> NixlAgent { + NixlAgent::require_backends(name, &[]).expect("failed to create wrapped agent") + } + + fn make_test_layout(agent: &NixlAgent) -> PhysicalLayout { + let config = LayoutConfig::builder() + .num_blocks(2) + .num_layers(2) + .outer_dim(2) + .page_size(4) + .inner_dim(8) + .dtype_width_bytes(2) + .build() + .unwrap(); + + PhysicalLayout::builder(agent.clone()) + .with_config(config) + .fully_contiguous() + .allocate_system() + .build() + .unwrap() + } + + #[test] + fn test_manager_creation() { + let agent = make_test_agent("test-manager"); + let manager = LayoutRegistry::new(agent, 42); + + assert_eq!(manager.worker_id(), 42); + assert_eq!(manager.local_count(), 0); + assert_eq!(manager.remote_count(), 0); + } + + #[test] + fn test_register_local() { + let agent = make_test_agent("test-register"); + let mut manager = LayoutRegistry::new(agent.clone(), 100); + + let layout = make_test_layout(&agent); + let handle = manager.register_local(layout).unwrap(); + + assert_eq!(handle.worker_id(), 100); + assert_eq!(handle.layout_id(), 0); + assert_eq!(manager.local_count(), 1); + assert!(manager.is_local(handle)); + assert!(!manager.is_remote(handle)); + } + + #[test] + fn test_register_multiple_locals() { + let agent = make_test_agent("test-multiple"); + let mut manager = LayoutRegistry::new(agent.clone(), 1); + + let handle1 = manager.register_local(make_test_layout(&agent)).unwrap(); + let handle2 = manager.register_local(make_test_layout(&agent)).unwrap(); + let handle3 = manager.register_local(make_test_layout(&agent)).unwrap(); + + assert_eq!(handle1.layout_id(), 0); + assert_eq!(handle2.layout_id(), 1); + assert_eq!(handle3.layout_id(), 2); + assert_eq!(manager.local_count(), 3); + } + + #[test] + #[ignore] // Requires actual NIXL memory registration + fn test_export_import_roundtrip() { + // Create source manager and register layouts 
+ let source_agent = make_test_agent("source"); + let mut source_manager = LayoutRegistry::new(source_agent.clone(), 1); + + let handle1 = source_manager + .register_local(make_test_layout(&source_agent)) + .unwrap(); + let handle2 = source_manager + .register_local(make_test_layout(&source_agent)) + .unwrap(); + + // Export metadata + let metadata = source_manager.export_metadata().unwrap(); + assert!(!metadata.is_empty()); + + // Create destination manager and import + let dest_agent = make_test_agent("dest"); + let mut dest_manager = LayoutRegistry::new(dest_agent, 2); + + let imported_handles = dest_manager.import_metadata(metadata).unwrap(); + + // Verify + assert_eq!(imported_handles.len(), 2); + assert_eq!(dest_manager.remote_count(), 2); + assert!(dest_manager.is_remote(handle1)); + assert!(dest_manager.is_remote(handle2)); + + // Can get layouts + assert!(dest_manager.get_remote(handle1).is_some()); + assert!(dest_manager.get_remote(handle2).is_some()); + assert!(dest_manager.get_layout(handle1).is_some()); + } + + #[test] + #[ignore] // Requires actual NIXL memory registration + fn test_import_duplicate_remote_fails() { + let source_agent = make_test_agent("source2"); + let mut source_manager = LayoutRegistry::new(source_agent.clone(), 10); + + source_manager + .register_local(make_test_layout(&source_agent)) + .unwrap(); + + let metadata = source_manager.export_metadata().unwrap(); + + let dest_agent = make_test_agent("dest2"); + let mut dest_manager = LayoutRegistry::new(dest_agent, 20); + + // First import succeeds + let metadata_clone = SerializedLayout::from_bytes(metadata.as_bytes().clone()); + dest_manager.import_metadata(metadata).unwrap(); + + // Second import should fail + let result = dest_manager.import_metadata(metadata_clone); + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("already loaded")); + } + + #[test] + fn test_get_layout_handles() { + let agent = make_test_agent("test-handles"); + let mut manager = LayoutRegistry::new(agent.clone(), 5); + + let h1 = manager.register_local(make_test_layout(&agent)).unwrap(); + let h2 = manager.register_local(make_test_layout(&agent)).unwrap(); + + let handles = manager.local_handles(); + assert_eq!(handles.len(), 2); + assert!(handles.contains(&h1)); + assert!(handles.contains(&h2)); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/manager/remote.rs b/lib/llm/src/block_manager/v2/physical/manager/remote.rs new file mode 100644 index 0000000000..f80defcbce --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/manager/remote.rs @@ -0,0 +1,127 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Remote layout wrapper reconstructed from imported metadata. + +use super::handle::LayoutHandle; +use crate::block_manager::v2::physical::layout::PhysicalLayout; + +/// A remote physical layout reconstructed from imported metadata. +/// +/// This wraps a `PhysicalLayout` that was deserialized from another worker's +/// exported metadata. The layout's memory regions point to addresses on the +/// remote worker and are used for building NIXL RDMA transfer descriptors. +/// +/// This type is cheap to clone as `PhysicalLayout` contains `Arc` internally. +#[derive(Debug, Clone)] +pub struct RemoteLayout { + handle: LayoutHandle, + layout: PhysicalLayout, +} + +#[allow(dead_code)] +impl RemoteLayout { + /// Create a new remote layout. 
+ /// + /// # Arguments + /// * `handle` - Unique handle for this layout (from remote worker) + /// * `layout` - The reconstructed physical layout + pub fn new(handle: LayoutHandle, layout: PhysicalLayout) -> Self { + Self { handle, layout } + } + + /// Get the handle for this layout. + pub fn handle(&self) -> LayoutHandle { + self.handle + } + + /// Get a reference to the physical layout. + pub fn layout(&self) -> &PhysicalLayout { + &self.layout + } + + /// Get the worker_id from the handle (identifies the remote worker). + pub fn worker_id(&self) -> u64 { + self.handle.worker_id() + } + + /// Get the layout_id from the handle. + pub fn layout_id(&self) -> u16 { + self.handle.layout_id() + } + + /// Consume this remote layout and return the physical layout. + pub fn into_layout(self) -> PhysicalLayout { + self.layout + } +} + +#[cfg(all(test, feature = "testing-nixl"))] +mod tests { + use super::*; + use crate::block_manager::v2::physical::layout::{ + LayoutConfig, LayoutDescriptor, PhysicalLayout, + }; + + fn make_serialized_layout() -> LayoutDescriptor { + use crate::block_manager::v2::memory::{MemoryDescriptor, StorageKind}; + use crate::block_manager::v2::physical::layout::{ + BlockFormat, FullyContiguousDetails, LayoutTypeDetails, NixlMetadata, + }; + + let config = LayoutConfig::builder() + .num_blocks(2) + .num_layers(2) + .outer_dim(2) + .page_size(4) + .inner_dim(8) + .dtype_width_bytes(2) + .build() + .unwrap(); + + let required_size = config.num_blocks + * config.num_layers + * config.outer_dim + * config.page_size + * config.inner_dim + * config.dtype_width_bytes; + + LayoutDescriptor { + version: 1, + layout_config: config, + location: StorageKind::System, + nixl_metadata: NixlMetadata::new( + "remote_agent".to_string(), + nixl_sys::MemType::Dram, + 0, + ), + memory_descriptors: vec![MemoryDescriptor::new(0x1000, required_size)], + layout_type_details: LayoutTypeDetails::FullyContiguous(FullyContiguousDetails { + block_format: BlockFormat::Operational, + }), + } + } + + #[test] + fn test_remote_layout_creation() { + let handle = LayoutHandle::new(999, 42); + let serialized = make_serialized_layout(); + let layout = PhysicalLayout::from_descriptor(serialized).unwrap(); + let remote = RemoteLayout::new(handle, layout); + + assert_eq!(remote.handle(), handle); + assert_eq!(remote.worker_id(), 999); + assert_eq!(remote.layout_id(), 42); + } + + #[test] + fn test_remote_layout_into_layout() { + let handle = LayoutHandle::new(100, 200); + let serialized = make_serialized_layout(); + let layout = PhysicalLayout::from_descriptor(serialized).unwrap(); + let remote = RemoteLayout::new(handle, layout); + + let _recovered = remote.into_layout(); + // Successfully consumed and returned the layout + } +} diff --git a/lib/llm/src/block_manager/v2/physical/mod.rs b/lib/llm/src/block_manager/v2/physical/mod.rs new file mode 100644 index 0000000000..38be109612 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/mod.rs @@ -0,0 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +pub mod layout; +pub mod manager; +pub mod transfer; diff --git a/lib/llm/src/block_manager/v2/physical/transfer/capabilities.rs b/lib/llm/src/block_manager/v2/physical/transfer/capabilities.rs new file mode 100644 index 0000000000..065b38e092 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/capabilities.rs @@ -0,0 +1,209 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Transfer capability flags for controlling direct path enablement. +//! +//! By default, the transfer system uses a conservative staging policy where: +//! - Device can only transfer to/from Host +//! - Disk can only transfer to/from Host +//! - Host can transfer to Device, Disk, or Remote +//! - Device ↔ Device is allowed (native CUDA) +//! +//! These capability flags enable optional direct paths that bypass host staging. + +use serde::{Deserialize, Serialize}; +use std::sync::OnceLock; + +use crate::block_manager::v2::physical::{ + layout::LayoutConfig, + transfer::{ + PhysicalLayout, TransferOptions, TransportManager, executor::execute_transfer, + nixl_agent::NixlAgent, + }, +}; + +/// Transfer capability flags controlling which direct paths are enabled. +/// +/// # Default Policy (Conservative) +/// +/// With all flags disabled (default), the system uses host staging: +/// - **Device → Remote**: Device → Host → Remote (2 hops) +/// - **Disk → Remote**: Disk → Host → Remote (2 hops) +/// - **Device ↔ Disk**: Device → Host → Disk (2 hops) +/// +/// # Optional Direct Paths +/// +/// - `allow_gds`: Enables GPU Direct Storage (Disk ↔ Device without host) +/// - `allow_gpu_rdma`: Enables GPU RDMA (Device → Remote without host) +/// +/// # Example +/// +/// ``` +/// # use dynamo_kvbm::v2::physical::transfer::TransferCapabilities; +/// // Default conservative policy +/// let caps = TransferCapabilities::default(); +/// assert!(!caps.allow_gds); +/// assert!(!caps.allow_gpu_rdma); +/// +/// // Enable GDS for high-performance disk I/O +/// let caps = TransferCapabilities::default().with_gds(true); +/// ``` +static GDS_SUPPORTED: OnceLock = OnceLock::new(); + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] +pub struct TransferCapabilities { + /// Enable GPU Direct Storage (Disk ↔ Device without host staging). + /// + /// When enabled: + /// - Disk → Device: Direct transfer (requires GDS support) + /// - Device → Disk: Direct transfer (requires GDS support) + /// + /// When disabled (default): + /// - Disk → Device: Disk → Host → Device (2 hops) + /// - Device → Disk: Device → Host → Disk (2 hops) + pub allow_gds: bool, + + /// Enable GPU RDMA (Device → Remote without host staging). + /// + /// When enabled: + /// - Device → Remote: Direct NIXL transfer + /// + /// When disabled (default): + /// - Device → Remote: Device → Host → Remote (2 hops) + /// + /// Note: This only affects Device → Remote. Host → Remote is always direct. + pub allow_gpu_rdma: bool, +} + +impl TransferCapabilities { + /// Create capabilities with default conservative policy (all direct paths disabled). + pub fn new() -> Self { + Self::default() + } + + /// Create capabilities with all direct paths enabled (high performance mode). + pub fn all_enabled() -> Self { + Self { + allow_gds: true, + allow_gpu_rdma: true, + } + } + + /// Set the GDS (GPU Direct Storage) capability. 
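+    ///
+    /// For example, enabling only GDS (this mirrors `test_selective_enablement`
+    /// at the bottom of this file):
+    /// ```ignore
+    /// let caps = TransferCapabilities::new().with_gds(true);
+    /// assert!(caps.allows_device_disk_direct());
+    /// assert!(!caps.allows_device_remote_direct());
+    /// ```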
+ pub fn with_gds(mut self, enabled: bool) -> Self { + self.allow_gds = enabled; + self + } + + fn test_gds_transfer(&self) -> anyhow::Result<()> { + let agent = NixlAgent::require_backends("agent", &["GDS_MT"])?; + + // Try a little test transfer and see if it works. + let config = LayoutConfig::builder() + .num_blocks(1) + .num_layers(1) + .outer_dim(1) + .page_size(1) + .inner_dim(4096) + .build()?; + + let src = PhysicalLayout::builder(agent.clone()) + .with_config(config.clone()) + .fully_contiguous() + .allocate_device(0) + .build()?; + let dst = PhysicalLayout::builder(agent.clone()) + .with_config(config) + .fully_contiguous() + .allocate_disk(None) + .build()?; + + let src_blocks = vec![0]; + let dst_blocks = vec![0]; + + let ctx = TransportManager::builder() + .worker_id(0) + .nixl_agent(agent) + .cuda_device_id(0) + .build()?; + + execute_transfer( + &src, + &dst, + &src_blocks, + &dst_blocks, + TransferOptions::default(), + ctx.context(), + )?; + + Ok(()) + } + + pub fn with_gds_if_supported(mut self) -> Self { + self.allow_gds = *GDS_SUPPORTED.get_or_init(|| self.test_gds_transfer().is_ok()); + + self + } + + /// Set the GPU RDMA capability. + pub fn with_gpu_rdma(mut self, enabled: bool) -> Self { + self.allow_gpu_rdma = enabled; + self + } + + /// Check if a direct path from Device to Disk is allowed. + pub fn allows_device_disk_direct(&self) -> bool { + self.allow_gds + } + + /// Check if a direct path from Device to Remote is allowed. + pub fn allows_device_remote_direct(&self) -> bool { + self.allow_gpu_rdma + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_default_capabilities() { + let caps = TransferCapabilities::default(); + assert!(!caps.allow_gds); + assert!(!caps.allow_gpu_rdma); + assert!(!caps.allows_device_disk_direct()); + assert!(!caps.allows_device_remote_direct()); + } + + #[test] + fn test_all_enabled() { + let caps = TransferCapabilities::all_enabled(); + assert!(caps.allow_gds); + assert!(caps.allow_gpu_rdma); + assert!(caps.allows_device_disk_direct()); + assert!(caps.allows_device_remote_direct()); + } + + #[test] + fn test_builder_pattern() { + let caps = TransferCapabilities::new() + .with_gds(true) + .with_gpu_rdma(false); + + assert!(caps.allow_gds); + assert!(!caps.allow_gpu_rdma); + } + + #[test] + fn test_selective_enablement() { + // Enable only GDS + let caps = TransferCapabilities::new().with_gds(true); + assert!(caps.allows_device_disk_direct()); + assert!(!caps.allows_device_remote_direct()); + + // Enable only GPU RDMA + let caps = TransferCapabilities::new().with_gpu_rdma(true); + assert!(!caps.allows_device_disk_direct()); + assert!(caps.allows_device_remote_direct()); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/checksum.rs b/lib/llm/src/block_manager/v2/physical/transfer/checksum.rs new file mode 100644 index 0000000000..85afadd3f5 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/checksum.rs @@ -0,0 +1,264 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Block checksum computation for verification. +//! +//! This module provides utilities to compute checksums of blocks for +//! round-trip test verification. 
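+//!
+//! A rough usage sketch (illustrative; assumes `layout` is a host-resident
+//! `PhysicalLayout` built elsewhere and already filled with data):
+//!
+//! ```ignore
+//! let checksums = compute_block_checksums(&layout, &[0, 1])?;
+//! assert_eq!(checksums.len(), 2);
+//! ```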
+
+use crate::block_manager::v2::memory::StorageKind;
+
+use super::PhysicalLayout;
+
+use aligned_vec::{AVec, avec};
+use anyhow::{Result, anyhow};
+use blake3::Hasher;
+
+use std::{
+    collections::HashMap,
+    fs::File,
+    io::{Read, Seek},
+    mem::ManuallyDrop,
+    ops::Range,
+    os::fd::FromRawFd,
+};
+
+use cudarc::runtime::sys::{cudaMemcpy, cudaMemcpyKind};
+
+pub type BlockChecksum = String;
+
+/// Compute checksums for a list of blocks.
+///
+/// # Arguments
+/// * `layout` - The physical layout containing the blocks
+/// * `block_ids` - List of block IDs to checksum
+///
+/// # Returns
+/// A map from block ID to its checksum
+///
+/// # Errors
+/// Returns an error if:
+/// - Layout is remote (cannot checksum remote memory directly)
+/// - Block IDs are out of range
+pub fn compute_block_checksums(
+    layout: &PhysicalLayout,
+    block_ids: &[usize],
+) -> Result<HashMap<usize, BlockChecksum>> {
+    let mut checksums = HashMap::new();
+
+    for &block_id in block_ids {
+        let checksum = compute_single_block_checksum(layout, block_id, None)?;
+        checksums.insert(block_id, checksum);
+    }
+
+    Ok(checksums)
+}
+
+/// Compute checksums for specific layers in blocks.
+///
+/// # Arguments
+/// * `layout` - The physical layout containing the blocks
+/// * `block_ids` - List of block IDs to checksum
+/// * `layer_range` - Range of layers to include in checksum
+///
+/// # Returns
+/// A map from block ID to its checksum (for the specified layers only)
+pub fn compute_layer_checksums(
+    layout: &PhysicalLayout,
+    block_ids: &[usize],
+    layer_range: Range<usize>,
+) -> Result<HashMap<usize, BlockChecksum>> {
+    let config = layout.layout().config();
+    if layer_range.end > config.num_layers {
+        return Err(anyhow!(
+            "Layer range {:?} exceeds num_layers {}",
+            layer_range,
+            config.num_layers
+        ));
+    }
+
+    let mut checksums = HashMap::new();
+
+    for &block_id in block_ids {
+        let checksum = compute_single_block_checksum(layout, block_id, Some(layer_range.clone()))?;
+        checksums.insert(block_id, checksum);
+    }
+
+    Ok(checksums)
+}
+
+/// Compute checksum for a single block.
+fn compute_single_block_checksum( + layout: &PhysicalLayout, + block_id: usize, + layer_range: Option>, +) -> Result { + let config = layout.layout().config(); + + if block_id >= config.num_blocks { + return Err(anyhow!("Block ID {} out of range", block_id)); + } + + let num_layers = config.num_layers; + let outer_dim = config.outer_dim; + + let layers = layer_range.unwrap_or(0..num_layers); + + // validate layer range + if layers.end > config.num_layers { + return Err(anyhow!( + "Layer range {:?} exceeds num_layers {}", + layers, + config.num_layers + )); + } + + let mut hasher = Hasher::new(); + + // Iterate over all layers and outer dimensions + for layer_id in layers { + for outer_id in 0..outer_dim { + let region = layout.memory_region(block_id, layer_id, outer_id)?; + + match layout.location() { + StorageKind::System | StorageKind::Pinned => { + let slice = unsafe { + std::slice::from_raw_parts(region.addr() as *const u8, region.size()) + }; + hasher.update(slice); + } + StorageKind::Device(_) => { + let system_region: Vec = vec![0; region.size()]; + unsafe { + cudaMemcpy( + system_region.as_ptr() as *mut std::ffi::c_void, + region.addr() as *const std::ffi::c_void, + region.size(), + cudaMemcpyKind::cudaMemcpyDeviceToHost, + ); + } + hasher.update(system_region.as_slice()); + } + StorageKind::Disk(fd) => { + let mut system_region: AVec = avec![[4096]| 0; region.size()]; + + let mut file = ManuallyDrop::new(unsafe { File::from_raw_fd(fd as i32) }); + file.seek(std::io::SeekFrom::Start(region.addr() as u64))?; + file.read_exact(&mut system_region)?; + hasher.update(system_region.as_slice()); + } + } + } + } + + Ok(hasher.finalize().to_string()) +} + +#[cfg(test)] +mod tests { + use super::super::tests::*; + use super::*; + use crate::block_manager::v2::physical::transfer::{FillPattern, fill_blocks}; + + #[test] + fn test_checksum_constant_pattern() { + let physical = builder(2) + .fully_contiguous() + .allocate_system() + .build() + .unwrap(); + + fill_blocks(&physical, &[0, 1], FillPattern::Constant(42)).unwrap(); + + let checksums = compute_block_checksums(&physical, &[0, 1]).unwrap(); + + // Both blocks should have the same checksum values (same pattern) + assert_eq!(checksums[&0], checksums[&1]); + + let memory_region = physical.memory_region(0, 0, 0).unwrap(); + let slice = unsafe { + std::slice::from_raw_parts(memory_region.addr() as *const u8, memory_region.size()) + }; + assert!(slice.iter().all(|&b| b == 42)); + + let mut hasher = Hasher::new(); + hasher.update(slice); + let checksum_mr_slice = hasher.finalize().to_string(); + + let vec = vec![42; memory_region.size()]; + let mut hasher = Hasher::new(); + hasher.update(&vec); + let checksum_vec = hasher.finalize().to_string(); + + assert_eq!(checksum_mr_slice, checksum_vec); + } + + // #[test] + // fn test_checksum_different_patterns() { + // let (layout, _memory) = create_test_layout(2); + // let physical = PhysicalLayout::new_local(layout, StorageLocation::System); + + // // Fill blocks with different patterns + // fill_blocks(&physical, &[0], FillPattern::Constant(42)).unwrap(); + // fill_blocks(&physical, &[1], FillPattern::Constant(100)).unwrap(); + + // let checksums = compute_block_checksums(&physical, &[0, 1]).unwrap(); + + // // Blocks should have different checksums + // assert_ne!(checksums[&0], checksums[&1]); + // } + + // #[test] + // fn test_checksum_matches() { + // let (layout1, _memory1) = create_test_layout(1); + // let (layout2, _memory2) = create_test_layout(1); + + // let physical1 = 
PhysicalLayout::new_local(layout1, StorageLocation::System); + // let physical2 = PhysicalLayout::new_local(layout2, StorageLocation::System); + + // // Fill both with same pattern + // fill_blocks(&physical1, &[0], FillPattern::Sequential).unwrap(); + // fill_blocks(&physical2, &[0], FillPattern::Sequential).unwrap(); + + // let checksum1 = compute_block_checksums(&physical1, &[0]).unwrap(); + // let checksum2 = compute_block_checksums(&physical2, &[0]).unwrap(); + + // // Checksums should match (ignoring block_id) + // assert!(checksum1[&0].matches(&checksum2[&0])); + // } + + // #[test] + // fn test_layer_checksums() { + // let (layout, _memory) = create_test_layout(1); + // let physical = PhysicalLayout::new_local(layout, StorageLocation::System); + + // // Fill entire block + // fill_blocks(&physical, &[0], FillPattern::Sequential).unwrap(); + + // // Compute checksums for different layer ranges + // let full_checksum = compute_block_checksums(&physical, &[0]).unwrap(); + // let layer0_checksum = compute_layer_checksums(&physical, &[0], 0..1).unwrap(); + // let layer1_checksum = compute_layer_checksums(&physical, &[0], 1..2).unwrap(); + + // // Layer checksums should be different from full checksum + // assert_ne!(full_checksum[&0].byte_count, layer0_checksum[&0].byte_count); + // assert_ne!(full_checksum[&0].byte_count, layer1_checksum[&0].byte_count); + + // // Layer 0 and Layer 1 should have same byte count (same size) + // assert_eq!( + // layer0_checksum[&0].byte_count, + // layer1_checksum[&0].byte_count + // ); + // } + + // #[test] + // fn test_checksum_remote_layout_fails() { + // let (layout, _memory) = create_test_layout(1); + // let physical = + // PhysicalLayout::new_remote(layout, StorageLocation::System, "remote".to_string()); + + // let result = compute_block_checksums(&physical, &[0]); + // assert!(result.is_err()); + // assert!(result.unwrap_err().to_string().contains("remote")); + // } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/context.rs b/lib/llm/src/block_manager/v2/physical/transfer/context.rs new file mode 100644 index 0000000000..9da1963af2 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/context.rs @@ -0,0 +1,372 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Transfer context. + +use std::sync::Arc; + +use crate::block_manager::v2::kernels::OperationalCopyBackend; +use anyhow::Result; +use cudarc::driver::{CudaContext, CudaEvent, CudaStream}; +use derive_builder::Builder; +use nixl_sys::XferRequest; +use tokio::sync::{mpsc, oneshot}; +use uuid::Uuid; + +use super::nixl_agent::{NixlAgent, NixlBackendConfig}; + +use crate::block_manager::v2::physical::manager::TransportManager; + +// Notifications module is declared in ../mod.rs +// Re-export for convenience +use super::TransferCapabilities; +pub use super::notifications; +pub use super::notifications::TransferCompleteNotification; + +#[derive(Debug, Clone, Builder)] +#[builder(pattern = "owned", build_fn(private, name = "build_internal"), public)] +#[allow(dead_code)] // Fields are used in build() but derive macros confuse dead code analysis +pub(crate) struct TransferConfig { + worker_id: u64, + + /// Optional custom name for the NIXL agent. 
If not provided, defaults to "worker-{worker_id}" + #[builder(default = "None", setter(strip_option))] + nixl_agent_name: Option, + + /// Backend configuration for NIXL backends to enable + #[builder(default = "NixlBackendConfig::new()")] + nixl_backend_config: NixlBackendConfig, + + #[builder(default = "0")] + cuda_device_id: usize, + + #[builder(default = "get_tokio_runtime()")] + tokio_runtime: TokioRuntime, + + #[builder(default = "TransferCapabilities::default()")] + capabilities: TransferCapabilities, + + #[builder(default = "OperationalCopyBackend::Auto")] + operational_backend: OperationalCopyBackend, +} + +impl TransferConfigBuilder { + /// Directly provide a pre-configured wrapped NIXL agent (mainly for testing). + /// + /// This bypasses the agent creation and backend initialization logic, + /// using the provided agent directly. Useful for tests that need full + /// control over agent configuration. + pub fn nixl_agent(self, agent: NixlAgent) -> TransferConfigBuilderWithAgent { + TransferConfigBuilderWithAgent { + builder: self, + agent, + } + } + + /// Add a NIXL backend to enable (uses default plugin parameters). + pub fn nixl_backend(mut self, backend: impl Into) -> Self { + let config = self + .nixl_backend_config + .get_or_insert_with(NixlBackendConfig::new); + *config = config.clone().with_backend(backend); + self + } + + /// Load NIXL backend configuration from environment variables. + /// + /// This merges environment-based configuration with any backends already + /// configured via the builder. + pub fn with_env_backends(mut self) -> Result { + let env_config = NixlBackendConfig::from_env()?; + let config = self + .nixl_backend_config + .get_or_insert_with(NixlBackendConfig::new); + *config = config.clone().merge(env_config); + Ok(self) + } + + pub fn build(self) -> Result { + let mut config = self.build_internal()?; + + // Merge environment backends if not explicitly configured + if config.nixl_backend_config.backends().is_empty() { + config.nixl_backend_config = NixlBackendConfig::from_env()?; + } + + // Derive agent name from worker_id if not provided + let agent_name = config + .nixl_agent_name + .unwrap_or_else(|| format!("worker-{}", config.worker_id)); + + // Create wrapped NIXL agent with configured backends + let backend_names: Vec<&str> = config + .nixl_backend_config + .backends() + .iter() + .map(|s| s.as_str()) + .collect(); + + let nixl_agent = if backend_names.is_empty() { + // No backends configured - create agent without backends + NixlAgent::new_with_backends(&agent_name, &[])? + } else { + // Create agent with requested backends + NixlAgent::new_with_backends(&agent_name, &backend_names)? + }; + + let cuda_context = CudaContext::new(config.cuda_device_id)?; + let context = TransferContext::new( + config.worker_id, + nixl_agent, + cuda_context, + config.tokio_runtime, + config.capabilities, + config.operational_backend, + )?; + Ok(TransportManager::from_context(context)) + } +} + +/// Builder that already has a pre-configured NIXL agent. +/// +/// This is generally used for testing when you want to pass in an agent directly +/// rather than having it created by the builder. +pub struct TransferConfigBuilderWithAgent { + builder: TransferConfigBuilder, + agent: NixlAgent, +} + +impl TransferConfigBuilderWithAgent { + /// Build the TransportManager using the pre-configured agent. 
+ pub fn build(self) -> Result { + let config = self.builder.build_internal()?; + let cuda_context = CudaContext::new(config.cuda_device_id)?; + let context = TransferContext::new( + config.worker_id, + self.agent, + cuda_context, + config.tokio_runtime, + config.capabilities, + config.operational_backend, + )?; + Ok(TransportManager::from_context(context)) + } + + // Proxy methods to allow configuring other builder fields + pub fn worker_id(mut self, worker_id: u64) -> Self { + self.builder = self.builder.worker_id(worker_id); + self + } + + pub fn cuda_device_id(mut self, cuda_device_id: usize) -> Self { + self.builder = self.builder.cuda_device_id(cuda_device_id); + self + } +} + +fn get_tokio_runtime() -> TokioRuntime { + match tokio::runtime::Handle::try_current() { + Ok(handle) => TokioRuntime::Handle(handle), + Err(_) => { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .max_blocking_threads(4) + .worker_threads(2) + .build() + .expect("failed to build tokio runtime"); + + TokioRuntime::Shared(Arc::new(rt)) + } + } +} + +#[derive(Debug, Clone)] +pub(crate) enum TokioRuntime { + Handle(tokio::runtime::Handle), + Shared(Arc), +} + +impl TokioRuntime { + pub fn handle(&self) -> &tokio::runtime::Handle { + match self { + TokioRuntime::Handle(handle) => handle, + TokioRuntime::Shared(runtime) => runtime.handle(), + } + } +} + +#[derive(Debug, Clone)] +pub struct TransferContext { + worker_id: u64, + nixl_agent: NixlAgent, + #[allow(dead_code)] + cuda_context: Arc, + d2h_stream: Arc, + h2d_stream: Arc, + #[allow(dead_code)] + tokio_runtime: TokioRuntime, + capabilities: TransferCapabilities, + operational_backend: OperationalCopyBackend, + // Channels for background notification handlers + tx_nixl_status: + mpsc::Sender>, + tx_cuda_event: + mpsc::Sender>, + #[allow(dead_code)] + tx_nixl_events: mpsc::Sender, +} + +impl TransferContext { + pub fn builder() -> TransferConfigBuilder { + TransferConfigBuilder::default() + } + + pub(crate) fn new( + worker_id: u64, + nixl_agent: NixlAgent, + cuda_context: Arc, + tokio_runtime: TokioRuntime, + capabilities: TransferCapabilities, + operational_backend: OperationalCopyBackend, + ) -> Result { + unsafe { cuda_context.disable_event_tracking() }; + + // Create channels for background notification handlers + let (tx_nixl_status, rx_nixl_status) = mpsc::channel(64); + let (tx_cuda_event, rx_cuda_event) = mpsc::channel(64); + let (tx_nixl_events, rx_nixl_events) = mpsc::channel(64); + + // Spawn background handlers + let handle = tokio_runtime.handle(); + + // Spawn NIXL status polling handler + handle.spawn(notifications::process_polling_notifications(rx_nixl_status)); + + // Spawn CUDA event polling handler + handle.spawn(notifications::process_polling_notifications(rx_cuda_event)); + + // Spawn NIXL notification events handler + handle.spawn(notifications::process_nixl_notification_events( + nixl_agent.raw_agent().clone(), + rx_nixl_events, + )); + + Ok(Self { + worker_id, + nixl_agent, + cuda_context: cuda_context.clone(), + d2h_stream: cuda_context.new_stream()?, + h2d_stream: cuda_context.new_stream()?, + tokio_runtime, + capabilities, + operational_backend, + tx_nixl_status, + tx_cuda_event, + tx_nixl_events, + }) + } + + pub(crate) fn nixl_agent(&self) -> &NixlAgent { + &self.nixl_agent + } + + #[allow(dead_code)] + pub(crate) fn cuda_context(&self) -> &Arc { + &self.cuda_context + } + + pub(crate) fn d2h_stream(&self) -> &Arc { + &self.d2h_stream + } + + pub(crate) fn h2d_stream(&self) -> &Arc { + &self.h2d_stream + } 
+ + #[allow(dead_code)] + pub(crate) fn tokio(&self) -> &tokio::runtime::Handle { + self.tokio_runtime.handle() + } + + pub(crate) fn capabilities(&self) -> &TransferCapabilities { + &self.capabilities + } + + pub(crate) fn operational_backend(&self) -> OperationalCopyBackend { + self.operational_backend + } + + /// Register a NIXL transfer request for status polling completion. + /// + /// This method enqueues the transfer request to be polled for completion + /// using `agent.get_xfer_status()`. Returns a notification object that + /// can be awaited for completion. + pub(crate) fn register_nixl_status( + &self, + xfer_req: XferRequest, + ) -> TransferCompleteNotification { + let (done_tx, done_rx) = oneshot::channel(); + + let notification = notifications::RegisterPollingNotification { + uuid: Uuid::new_v4(), + checker: notifications::NixlStatusChecker::new( + self.nixl_agent.raw_agent().clone(), + xfer_req, + ), + done: done_tx, + }; + + // Send to background handler (ignore error if receiver dropped) + let _ = self.tx_nixl_status.try_send(notification); + + TransferCompleteNotification { status: done_rx } + } + + /// Register a CUDA event for polling completion. + /// + /// This method enqueues the CUDA event to be polled for completion. + /// Returns a notification object that can be awaited for completion. + pub(crate) fn register_cuda_event(&self, event: CudaEvent) -> TransferCompleteNotification { + let (done_tx, done_rx) = oneshot::channel(); + + let notification = notifications::RegisterPollingNotification { + uuid: Uuid::new_v4(), + checker: notifications::CudaEventChecker::new(event), + done: done_tx, + }; + + // Send to background handler (ignore error if receiver dropped) + let _ = self.tx_cuda_event.try_send(notification); + + TransferCompleteNotification { status: done_rx } + } + + /// Register a NIXL transfer request for notification-based completion. + /// + /// This method enqueues the transfer request to be completed via NIXL + /// notification events. Returns a notification object that can be awaited + /// for completion. + #[allow(dead_code)] + pub(crate) fn register_nixl_event( + &self, + xfer_req: XferRequest, + ) -> TransferCompleteNotification { + let (done_tx, done_rx) = oneshot::channel(); + + let notification = notifications::RegisterNixlNotification { + uuid: Uuid::new_v4(), + xfer_req, + done: done_tx, + }; + + // Send to background handler (ignore error if receiver dropped) + let _ = self.tx_nixl_events.try_send(notification); + + TransferCompleteNotification { status: done_rx } + } + + /// Get the worker ID for this context. + pub(crate) fn worker_id(&self) -> u64 { + self.worker_id + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/executor/cuda.rs b/lib/llm/src/block_manager/v2/physical/transfer/executor/cuda.rs new file mode 100644 index 0000000000..5f6ef5764d --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/executor/cuda.rs @@ -0,0 +1,318 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! CUDA executor for GPU memory transfers. 
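+//!
+//! This module is normally reached through `executor::execute_transfer`, which picks a
+//! CUDA strategy from the source/destination storage kinds. A simplified sketch of that
+//! dispatch (illustrative; condensed from `executor/mod.rs` in this change):
+//!
+//! ```ignore
+//! match strategy {
+//!     TransferStrategy::CudaAsyncH2D
+//!     | TransferStrategy::CudaAsyncD2H
+//!     | TransferStrategy::CudaAsyncD2D
+//!     | TransferStrategy::CudaBlockingH2D
+//!     | TransferStrategy::CudaBlockingD2H => {
+//!         cuda::execute_cuda_transfer(src, dst, src_ids, dst_ids, layer_range, strategy, ctx)?
+//!     }
+//!     _ => { /* memcpy or NIXL paths */ }
+//! }
+//! ```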
+ +use super::TransferContext; +use super::{PhysicalLayout, TransferStrategy}; +use crate::block_manager::v2::kernels::OperationalCopyBackend; +use crate::block_manager::v2::physical::transfer::context::TransferCompleteNotification; +use anyhow::{Result, anyhow}; +use cudarc::driver::result as cuda_result; +use std::ops::Range; + +// #[cfg(test)] +// mod cuda_kernel_tests; + +/// Execute a CUDA transfer between host and device memory. +/// +/// This executor handles transfers involving GPU memory using CUDA APIs. +/// Supports async and blocking transfers depending on the strategy. +/// +/// # Arguments +/// * `src` - Source physical layout +/// * `dst` - Destination physical layout +/// * `src_block_ids` - Source block IDs to transfer +/// * `dst_block_ids` - Destination block IDs to transfer +/// * `layer_range` - Optional range of layers to transfer (None = all layers) +/// * `strategy` - CUDA transfer strategy (H2D, D2H, D2D, async or blocking) +/// * `ctx` - Transfer context with CUDA stream +pub fn execute_cuda_transfer( + src: &PhysicalLayout, + dst: &PhysicalLayout, + src_block_ids: &[usize], + dst_block_ids: &[usize], + layer_range: Option>, + strategy: TransferStrategy, + ctx: &TransferContext, +) -> Result { + // Validate layouts + let src_layout = src.layout(); + let dst_layout = dst.layout(); + + if src_layout.num_layers() != dst_layout.num_layers() { + return Err(anyhow!( + "Layouts have incompatible layer counts: src={}, dst={}", + src_layout.num_layers(), + dst_layout.num_layers() + )); + } + + if src_layout.outer_dim() != dst_layout.outer_dim() { + return Err(anyhow!( + "Layouts have incompatible outer dimensions: src={}, dst={}", + src_layout.outer_dim(), + dst_layout.outer_dim() + )); + } + + // Determine layer range + let layers = layer_range.unwrap_or(0..src_layout.num_layers()); + + // Get appropriate CUDA stream based on transfer direction + let stream = match strategy { + TransferStrategy::CudaAsyncD2H | TransferStrategy::CudaBlockingD2H => ctx.d2h_stream(), + _ => ctx.h2d_stream(), // H2D and D2D use h2d_stream + }; + + // Perform CUDA transfers based on strategy + match strategy { + TransferStrategy::CudaAsyncH2D => { + let backend = ctx.operational_backend(); + if let Err(e) = try_execute_operational_kernel( + src, + dst, + src_block_ids, + dst_block_ids, + layers.clone(), + stream.as_ref(), + backend, + ) { + // Fallback to memcpy-based path + tracing::debug!("Kernel-based H2D failed ({}), falling back to memcpy", e); + execute_h2d( + src, + dst, + src_block_ids, + dst_block_ids, + layers, + stream.as_ref(), + )?; + } + } + TransferStrategy::CudaAsyncD2H => { + let backend = ctx.operational_backend(); + if let Err(e) = try_execute_operational_kernel( + src, + dst, + src_block_ids, + dst_block_ids, + layers.clone(), + stream.as_ref(), + backend, + ) { + // Fallback to memcpy-based path + tracing::debug!("Kernel-based D2H failed ({}), falling back to memcpy", e); + execute_d2h( + src, + dst, + src_block_ids, + dst_block_ids, + layers, + stream.as_ref(), + )?; + } + } + TransferStrategy::CudaAsyncD2D => { + // Try kernel-based path first, fall back to memcpy on error + let backend = ctx.operational_backend(); + if let Err(e) = try_execute_operational_kernel( + src, + dst, + src_block_ids, + dst_block_ids, + layers.clone(), + stream.as_ref(), + backend, + ) { + // Fallback to memcpy-based path + tracing::debug!("Kernel-based D2D failed ({}), falling back to memcpy", e); + execute_d2d( + src, + dst, + src_block_ids, + dst_block_ids, + layers, + stream.as_ref(), + )?; 
+ } + } + TransferStrategy::CudaBlockingH2D => { + execute_h2d( + src, + dst, + src_block_ids, + dst_block_ids, + layers, + stream.as_ref(), + )?; + // Synchronize immediately for blocking transfer + stream.synchronize()?; + } + TransferStrategy::CudaBlockingD2H => { + execute_d2h( + src, + dst, + src_block_ids, + dst_block_ids, + layers, + stream.as_ref(), + )?; + // Synchronize immediately for blocking transfer + stream.synchronize()?; + } + _ => { + return Err(anyhow!("Invalid CUDA transfer strategy: {:?}", strategy)); + } + } + + // For async transfers, record an event and register it for completion tracking + if matches!( + strategy, + TransferStrategy::CudaAsyncH2D + | TransferStrategy::CudaAsyncD2H + | TransferStrategy::CudaAsyncD2D + ) { + let event = stream.record_event(None)?; + Ok(ctx.register_cuda_event(event)) + } else { + // Blocking transfers are already synchronized + Ok(TransferCompleteNotification::completed()) + } +} + +/// Execute host-to-device transfer. +fn execute_h2d( + src: &PhysicalLayout, + dst: &PhysicalLayout, + src_block_ids: &[usize], + dst_block_ids: &[usize], + layers: Range, + stream: &cudarc::driver::CudaStream, +) -> Result<()> { + for (&src_block_id, &dst_block_id) in src_block_ids.iter().zip(dst_block_ids.iter()) { + for layer_id in layers.clone() { + for outer_id in 0..src.layout().outer_dim() { + let src_region = src.memory_region(src_block_id, layer_id, outer_id)?; + let dst_region = dst.memory_region(dst_block_id, layer_id, outer_id)?; + + if src_region.size() != dst_region.size() { + return Err(anyhow!( + "Size mismatch at block=({},{}), layer={}, outer={}: src={}, dst={}", + src_block_id, + dst_block_id, + layer_id, + outer_id, + src_region.size(), + dst_region.size() + )); + } + + unsafe { + let src_ptr = src_region.addr() as *const u8; + let dst_ptr = dst_region.addr() as u64; + let src_slice = std::slice::from_raw_parts(src_ptr, src_region.size()); + cuda_result::memcpy_htod_async(dst_ptr, src_slice, stream.cu_stream())?; + } + } + } + } + Ok(()) +} + +/// Execute device-to-host transfer. +fn execute_d2h( + src: &PhysicalLayout, + dst: &PhysicalLayout, + src_block_ids: &[usize], + dst_block_ids: &[usize], + layers: Range, + stream: &cudarc::driver::CudaStream, +) -> Result<()> { + for (&src_block_id, &dst_block_id) in src_block_ids.iter().zip(dst_block_ids.iter()) { + for layer_id in layers.clone() { + for outer_id in 0..src.layout().outer_dim() { + let src_region = src.memory_region(src_block_id, layer_id, outer_id)?; + let dst_region = dst.memory_region(dst_block_id, layer_id, outer_id)?; + + if src_region.size() != dst_region.size() { + return Err(anyhow!( + "Size mismatch at block=({},{}), layer={}, outer={}: src={}, dst={}", + src_block_id, + dst_block_id, + layer_id, + outer_id, + src_region.size(), + dst_region.size() + )); + } + + unsafe { + let src_ptr = src_region.addr() as u64; + let dst_ptr = dst_region.addr() as *mut u8; + let dst_slice = std::slice::from_raw_parts_mut(dst_ptr, dst_region.size()); + cuda_result::memcpy_dtoh_async(dst_slice, src_ptr, stream.cu_stream())?; + } + } + } + } + Ok(()) +} + +/// Execute device-to-device transfer. 
+fn execute_d2d( + src: &PhysicalLayout, + dst: &PhysicalLayout, + src_block_ids: &[usize], + dst_block_ids: &[usize], + layers: Range, + stream: &cudarc::driver::CudaStream, +) -> Result<()> { + for (&src_block_id, &dst_block_id) in src_block_ids.iter().zip(dst_block_ids.iter()) { + for layer_id in layers.clone() { + for outer_id in 0..src.layout().outer_dim() { + let src_region = src.memory_region(src_block_id, layer_id, outer_id)?; + let dst_region = dst.memory_region(dst_block_id, layer_id, outer_id)?; + + if src_region.size() != dst_region.size() { + return Err(anyhow!( + "Size mismatch at block=({},{}), layer={}, outer={}: src={}, dst={}", + src_block_id, + dst_block_id, + layer_id, + outer_id, + src_region.size(), + dst_region.size() + )); + } + + unsafe { + let src_ptr = src_region.addr() as u64; + let dst_ptr = dst_region.addr() as u64; + cuda_result::memcpy_dtod_async( + dst_ptr, + src_ptr, + src_region.size(), + stream.cu_stream(), + )?; + } + } + } + } + Ok(()) +} + +/// TODO: For now, we've stubbed this out just so we can merge. +/// For now, we'll always just fall back to memcpy. +#[cfg_attr(test, allow(dead_code))] +pub(crate) fn try_execute_operational_kernel( + _src: &PhysicalLayout, + _dst: &PhysicalLayout, + _src_block_ids: &[usize], + _dst_block_ids: &[usize], + _layers: Range, + _stream: &cudarc::driver::CudaStream, + _backend: OperationalCopyBackend, +) -> Result<()> { + anyhow::bail!("Not implemented."); +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/executor/memcpy.rs b/lib/llm/src/block_manager/v2/physical/transfer/executor/memcpy.rs new file mode 100644 index 0000000000..52d27cc476 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/executor/memcpy.rs @@ -0,0 +1,84 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Memcpy executor for host-to-host transfers. + +use crate::block_manager::v2::physical::transfer::PhysicalLayout; +use crate::block_manager::v2::physical::transfer::context::TransferCompleteNotification; +use anyhow::Result; +use std::ops::Range; + +/// Execute a memcpy transfer between host memory locations. +/// +/// This executor handles transfers between System and Pinned memory using +/// standard CPU memcpy operations. The transfer is synchronous and blocking. 
+///
+/// # Arguments
+/// * `src` - Source physical layout
+/// * `dst` - Destination physical layout
+/// * `src_block_ids` - Source block IDs to transfer
+/// * `dst_block_ids` - Destination block IDs to transfer (paired with `src_block_ids` by index)
+/// * `layer_range` - Optional range of layers to transfer (None = all layers)
+pub fn execute_memcpy_transfer(
+    src: &PhysicalLayout,
+    dst: &PhysicalLayout,
+    src_block_ids: &[usize],
+    dst_block_ids: &[usize],
+    layer_range: Option<Range<usize>>,
+) -> Result<TransferCompleteNotification> {
+    // Validate layouts have compatible structure
+    let src_layout = src.layout();
+    let dst_layout = dst.layout();
+
+    if src_layout.num_layers() != dst_layout.num_layers() {
+        return Err(anyhow::anyhow!(
+            "Layouts have incompatible layer counts: src={}, dst={}",
+            src_layout.num_layers(),
+            dst_layout.num_layers()
+        ));
+    }
+
+    if src_layout.outer_dim() != dst_layout.outer_dim() {
+        return Err(anyhow::anyhow!(
+            "Layouts have incompatible outer dimensions: src={}, dst={}",
+            src_layout.outer_dim(),
+            dst_layout.outer_dim()
+        ));
+    }
+
+    // Determine layer range
+    let layers = layer_range.unwrap_or(0..src_layout.num_layers());
+
+    // Perform synchronous copies
+    for (&src_block_id, &dst_block_id) in src_block_ids.iter().zip(dst_block_ids.iter()) {
+        for layer_id in layers.clone() {
+            for outer_id in 0..src_layout.outer_dim() {
+                // Get source and destination memory regions
+                let src_region = src.memory_region(src_block_id, layer_id, outer_id)?;
+                let dst_region = dst.memory_region(dst_block_id, layer_id, outer_id)?;
+
+                // Validate sizes match
+                if src_region.size() != dst_region.size() {
+                    return Err(anyhow::anyhow!(
+                        "Memory region size mismatch at block=({},{}), layer={}, outer={}: src={}, dst={}",
+                        src_block_id,
+                        dst_block_id,
+                        layer_id,
+                        outer_id,
+                        src_region.size(),
+                        dst_region.size()
+                    ));
+                }
+
+                // Perform memcpy
+                unsafe {
+                    let src_ptr = src_region.addr() as *const u8;
+                    let dst_ptr = dst_region.addr() as *mut u8;
+                    std::ptr::copy_nonoverlapping(src_ptr, dst_ptr, src_region.size());
+                }
+            }
+        }
+    }
+
+    // Memcpy is synchronous, so return already-completed notification
+    Ok(TransferCompleteNotification::completed())
+}
diff --git a/lib/llm/src/block_manager/v2/physical/transfer/executor/mod.rs b/lib/llm/src/block_manager/v2/physical/transfer/executor/mod.rs
new file mode 100644
index 0000000000..a3eeb36379
--- /dev/null
+++ b/lib/llm/src/block_manager/v2/physical/transfer/executor/mod.rs
@@ -0,0 +1,303 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Transfer executors for different copy strategies.
+
+pub(super) mod cuda;
+mod memcpy;
+mod nixl;
+
+use super::strategy::select_strategy;
+use super::validation::validate_block_transfer;
+use super::{PhysicalLayout, TransferContext, TransferOptions, TransferPlan, TransferStrategy};
+use crate::block_manager::v2::physical::transfer::{
+    StorageKind, context::TransferCompleteNotification,
+};
+use anyhow::Result;
+use std::ops::Range;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, Ordering};
+
+// Re-export the NIXL transfer builder for public use
+pub use nixl::NixlTransferBuilder;
+
+/// Execute a transfer between two physical layouts.
+///
+/// This is an internal entry point for all transfer operations called by TransportManager.
+/// It selects the appropriate strategy and dispatches to the corresponding executor.
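+///
+/// Callers normally go through `TransportManager::execute_transfer`, which resolves
+/// handles to layouts and forwards here. A hedged sketch of that outer call (an async
+/// context and valid handles are assumed):
+///
+/// ```ignore
+/// let notification = manager.execute_transfer(
+///     src_handle,
+///     &[0, 1],
+///     dst_handle,
+///     &[0, 1],
+///     TransferOptions::default(),
+/// )?;
+/// notification.await?;
+/// ```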
+/// +/// # Arguments +/// * `src` - Source physical layout +/// * `dst` - Destination physical layout +/// * `src_block_ids` - Source block IDs to transfer +/// * `dst_block_ids` - Destination block IDs to transfer +/// * `layer_range` - Optional range of layers to transfer (None = all layers) +/// * `ctx` - Transfer context with CUDA stream and NIXL agent +pub fn execute_transfer( + src: &PhysicalLayout, + dst: &PhysicalLayout, + src_block_ids: &[usize], + dst_block_ids: &[usize], + options: TransferOptions, + ctx: &TransferContext, +) -> Result { + // Validate block IDs + validate_block_transfer(src_block_ids, dst_block_ids, None, src, dst, None)?; + + // Select transfer plan based on locations and capabilities + let plan = select_strategy(src, dst, ctx)?; + + // Dispatch based on plan type + match plan { + TransferPlan::Direct(strategy) => execute_direct_transfer( + src, + dst, + src_block_ids, + dst_block_ids, + options.layer_range, + strategy, + ctx, + ), + TransferPlan::TwoHop { + first, + bounce_location, + second, + } => execute_two_hop_transfer(TwoHopTransferParams { + src, + dst, + src_block_ids, + dst_block_ids, + first_strategy: first, + bounce_location, + second_strategy: second, + options, + ctx, + }), + } +} + +/// Execute a direct single-hop transfer. +fn execute_direct_transfer( + src: &PhysicalLayout, + dst: &PhysicalLayout, + src_block_ids: &[usize], + dst_block_ids: &[usize], + layer_range: Option>, + strategy: TransferStrategy, + ctx: &TransferContext, +) -> Result { + match strategy { + TransferStrategy::Memcpy => { + memcpy::execute_memcpy_transfer(src, dst, src_block_ids, dst_block_ids, layer_range) + } + TransferStrategy::CudaAsyncH2D + | TransferStrategy::CudaAsyncD2H + | TransferStrategy::CudaAsyncD2D + | TransferStrategy::CudaBlockingH2D + | TransferStrategy::CudaBlockingD2H => Ok(cuda::execute_cuda_transfer( + src, + dst, + src_block_ids, + dst_block_ids, + layer_range, + strategy, + ctx, + )?), + TransferStrategy::NixlRead + | TransferStrategy::NixlWrite + | TransferStrategy::NixlReadFlipped + | TransferStrategy::NixlWriteFlipped => { + let mut builder = NixlTransferBuilder::new() + .src(src) + .dst(dst) + .src_blocks(src_block_ids) + .dst_blocks(dst_block_ids) + .strategy(strategy); + + if let Some(range) = layer_range { + builder = builder.layer_range(range); + } + + builder.execute(ctx) + } + TransferStrategy::Invalid => Err(anyhow::anyhow!( + "Invalid transfer strategy for src={:?}, dst={:?}", + src.location(), + dst.location() + )), + } +} + +#[allow(clippy::too_many_arguments)] +async fn execute_two_hop_transfer_chunk( + src: &PhysicalLayout, + bounce_layout: &PhysicalLayout, + dst: &PhysicalLayout, + src_block_ids: &[usize], + bounce_block_ids: &[usize], + dst_block_ids: &[usize], + first_strategy: TransferStrategy, + second_strategy: TransferStrategy, + layer_range: &Option>, + ctx: &TransferContext, +) -> Result<()> { + let bounce_ids_to_use = &bounce_block_ids[..src_block_ids.len()]; + + execute_direct_transfer( + src, + bounce_layout, + src_block_ids, + bounce_ids_to_use, + layer_range.clone(), + first_strategy, + ctx, + )? + .await?; + + execute_direct_transfer( + bounce_layout, + dst, + bounce_ids_to_use, + dst_block_ids, + layer_range.clone(), + second_strategy, + ctx, + )? 
+ .await?; + + Ok(()) +} + +/// Parameters for two-hop transfer execution +struct TwoHopTransferParams<'a> { + src: &'a PhysicalLayout, + dst: &'a PhysicalLayout, + src_block_ids: &'a [usize], + dst_block_ids: &'a [usize], + first_strategy: TransferStrategy, + bounce_location: StorageKind, + second_strategy: TransferStrategy, + options: TransferOptions, + ctx: &'a TransferContext, +} + +fn execute_two_hop_transfer(params: TwoHopTransferParams) -> Result { + let TwoHopTransferParams { + src, + dst, + src_block_ids, + dst_block_ids, + first_strategy, + bounce_location, + second_strategy, + options, + ctx, + } = params; + let (tx, rx) = tokio::sync::oneshot::channel(); + + // TODO: Cloning all this stuff is not ideal. + let src_clone = src.clone(); + let dst_clone = dst.clone(); + + let src_block_ids = src_block_ids.to_vec(); + let dst_block_ids = dst_block_ids.to_vec(); + + let options_clone = options.clone(); + + let handle = ctx.tokio(); + let ctx_clone = ctx.clone(); + handle.spawn(async move { + let Some(ref bounce_buffer_spec) = options_clone.bounce_buffer else { + tx.send(Err(anyhow::anyhow!( + "Two-hop transfers require a bounce buffer." + ))) + .unwrap(); + return; + }; + + if bounce_buffer_spec.layout().location() != bounce_location { + tx.send(Err(anyhow::anyhow!( + "Bounce buffer layout does not match bounce location." + ))) + .unwrap(); + return; + } + + let num_bounce_blocks = bounce_buffer_spec.block_ids().len(); + + if num_bounce_blocks < src_block_ids.len() { + for (src_block_ids, dst_block_ids) in src_block_ids + .chunks(num_bounce_blocks) + .zip(dst_block_ids.chunks(num_bounce_blocks)) + { + let bounce_block_ids_to_use = + &bounce_buffer_spec.block_ids()[..src_block_ids.len()]; + if let Err(e) = execute_two_hop_transfer_chunk( + &src_clone, + bounce_buffer_spec.layout(), + &dst_clone, + src_block_ids, + bounce_block_ids_to_use, + dst_block_ids, + first_strategy, + second_strategy, + &options_clone.layer_range, + &ctx_clone, + ) + .await + { + tx.send(Err(e)).unwrap(); + return; + } + } + tx.send(Ok(())).unwrap(); + } else { + let bounce_block_ids_to_use = &bounce_buffer_spec.block_ids()[..src_block_ids.len()]; + let result = execute_two_hop_transfer_chunk( + &src_clone, + bounce_buffer_spec.layout(), + &dst_clone, + src_block_ids.as_slice(), + bounce_block_ids_to_use, + dst_block_ids.as_slice(), + first_strategy, + second_strategy, + &options_clone.layer_range, + &ctx_clone, + ) + .await; + + tx.send(result).unwrap(); + } + }); + + Ok(TransferCompleteNotification { status: rx }) +} + +pub struct TransferNotification { + status: Arc, +} + +impl Default for TransferNotification { + fn default() -> Self { + Self::new() + } +} + +impl TransferNotification { + pub fn new() -> Self { + Self { + status: Arc::new(AtomicBool::new(false)), + } + } + + pub fn done() -> Self { + Self { + status: Arc::new(AtomicBool::new(true)), + } + } + + pub fn is_complete(&self) -> bool { + self.status.load(Ordering::Relaxed) + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/executor/nixl.rs b/lib/llm/src/block_manager/v2/physical/transfer/executor/nixl.rs new file mode 100644 index 0000000000..2fa37f4b38 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/executor/nixl.rs @@ -0,0 +1,320 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Typestate builder for NIXL transfers. +//! +//! 
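`TransferNotification` above tracks completion with a shared `AtomicBool`, in contrast to the oneshot-backed `TransferCompleteNotification` used elsewhere. A small standalone sketch (not part of the patch) of how such a flag is shared between the worker that finishes the copy and the caller that polls it, using the same `Relaxed` ordering as the patch:

```rust
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::thread;
use std::time::Duration;

/// Simplified stand-in for `TransferNotification`: a cloneable handle whose
/// `is_complete()` flips to true once the worker sets the shared flag.
#[derive(Clone)]
struct Notification {
    status: Arc<AtomicBool>,
}

impl Notification {
    fn new() -> Self {
        Self { status: Arc::new(AtomicBool::new(false)) }
    }

    fn mark_done(&self) {
        self.status.store(true, Ordering::Relaxed);
    }

    fn is_complete(&self) -> bool {
        self.status.load(Ordering::Relaxed)
    }
}

fn main() {
    let notif = Notification::new();
    let worker_handle = notif.clone();

    // The "transfer" completes on another thread.
    thread::spawn(move || {
        thread::sleep(Duration::from_millis(10));
        worker_handle.mark_done();
    });

    // Poll until the flag flips (a real caller would await a future instead).
    while !notif.is_complete() {
        thread::yield_now();
    }
}
```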
This module provides a compile-time safe builder for NIXL transfers that ensures +//! all required parameters are set before execution. + +use super::{PhysicalLayout, TransferContext, TransferStrategy}; +use crate::block_manager::v2::physical::transfer::context::TransferCompleteNotification; +use anyhow::{Result, anyhow}; +use nixl_sys::{XferDescList, XferOp}; +use std::marker::PhantomData; +use std::ops::Range; + +/// Marker type for unset builder fields. +pub struct Unset; + +/// Marker type for set builder fields. +pub struct Set; + +/// Typestate builder for NIXL transfers. +/// +/// This builder uses the typestate pattern to ensure all required parameters are set +/// at compile time. The type parameters track which fields have been set: +/// - `TSrc`: Source layout state +/// - `TDst`: Destination layout state +/// - `TSrcBlocks`: Source block IDs state +/// - `TDstBlocks`: Destination block IDs state +/// - `TStrategy`: Transfer strategy state +pub struct NixlTransferBuilder<'a, TSrc, TDst, TSrcBlocks, TDstBlocks, TStrategy> { + src: Option<&'a PhysicalLayout>, + dst: Option<&'a PhysicalLayout>, + src_block_ids: Option<&'a [usize]>, + dst_block_ids: Option<&'a [usize]>, + strategy: Option, + layer_range: Option>, + write_notif: Option, + _phantom: PhantomData<(TSrc, TDst, TSrcBlocks, TDstBlocks, TStrategy)>, +} + +impl<'a> NixlTransferBuilder<'a, Unset, Unset, Unset, Unset, Unset> { + /// Creates a new NIXL transfer builder with all fields unset. + pub fn new() -> Self { + Self { + src: None, + dst: None, + src_block_ids: None, + dst_block_ids: None, + strategy: None, + layer_range: None, + write_notif: None, + _phantom: PhantomData, + } + } +} + +impl<'a> Default for NixlTransferBuilder<'a, Unset, Unset, Unset, Unset, Unset> { + fn default() -> Self { + Self::new() + } +} + +// Required field setters - these consume self and return a new builder with the field marked as Set + +impl<'a, TDst, TSrcBlocks, TDstBlocks, TStrategy> + NixlTransferBuilder<'a, Unset, TDst, TSrcBlocks, TDstBlocks, TStrategy> +{ + /// Sets the source physical layout. + pub fn src( + self, + src: &'a PhysicalLayout, + ) -> NixlTransferBuilder<'a, Set, TDst, TSrcBlocks, TDstBlocks, TStrategy> { + NixlTransferBuilder { + src: Some(src), + dst: self.dst, + src_block_ids: self.src_block_ids, + dst_block_ids: self.dst_block_ids, + strategy: self.strategy, + layer_range: self.layer_range, + write_notif: self.write_notif, + _phantom: PhantomData, + } + } +} + +impl<'a, TSrc, TSrcBlocks, TDstBlocks, TStrategy> + NixlTransferBuilder<'a, TSrc, Unset, TSrcBlocks, TDstBlocks, TStrategy> +{ + /// Sets the destination physical layout. + pub fn dst( + self, + dst: &'a PhysicalLayout, + ) -> NixlTransferBuilder<'a, TSrc, Set, TSrcBlocks, TDstBlocks, TStrategy> { + NixlTransferBuilder { + src: self.src, + dst: Some(dst), + src_block_ids: self.src_block_ids, + dst_block_ids: self.dst_block_ids, + strategy: self.strategy, + layer_range: self.layer_range, + write_notif: self.write_notif, + _phantom: PhantomData, + } + } +} + +impl<'a, TSrc, TDst, TDstBlocks, TStrategy> + NixlTransferBuilder<'a, TSrc, TDst, Unset, TDstBlocks, TStrategy> +{ + /// Sets the source block IDs to transfer. 
+ pub fn src_blocks( + self, + src_block_ids: &'a [usize], + ) -> NixlTransferBuilder<'a, TSrc, TDst, Set, TDstBlocks, TStrategy> { + NixlTransferBuilder { + src: self.src, + dst: self.dst, + src_block_ids: Some(src_block_ids), + dst_block_ids: self.dst_block_ids, + strategy: self.strategy, + layer_range: self.layer_range, + write_notif: self.write_notif, + _phantom: PhantomData, + } + } +} + +impl<'a, TSrc, TDst, TSrcBlocks, TStrategy> + NixlTransferBuilder<'a, TSrc, TDst, TSrcBlocks, Unset, TStrategy> +{ + /// Sets the destination block IDs to transfer. + pub fn dst_blocks( + self, + dst_block_ids: &'a [usize], + ) -> NixlTransferBuilder<'a, TSrc, TDst, TSrcBlocks, Set, TStrategy> { + NixlTransferBuilder { + src: self.src, + dst: self.dst, + src_block_ids: self.src_block_ids, + dst_block_ids: Some(dst_block_ids), + strategy: self.strategy, + layer_range: self.layer_range, + write_notif: self.write_notif, + _phantom: PhantomData, + } + } +} + +impl<'a, TSrc, TDst, TSrcBlocks, TDstBlocks> + NixlTransferBuilder<'a, TSrc, TDst, TSrcBlocks, TDstBlocks, Unset> +{ + /// Sets the NIXL transfer strategy (Read or Write). + pub fn strategy( + self, + strategy: TransferStrategy, + ) -> NixlTransferBuilder<'a, TSrc, TDst, TSrcBlocks, TDstBlocks, Set> { + NixlTransferBuilder { + src: self.src, + dst: self.dst, + src_block_ids: self.src_block_ids, + dst_block_ids: self.dst_block_ids, + strategy: Some(strategy), + layer_range: self.layer_range, + write_notif: self.write_notif, + _phantom: PhantomData, + } + } +} + +// Optional field setters - these can be called at any point in the builder chain + +impl<'a, TSrc, TDst, TSrcBlocks, TDstBlocks, TStrategy> + NixlTransferBuilder<'a, TSrc, TDst, TSrcBlocks, TDstBlocks, TStrategy> +{ + /// Sets an optional range of layers to transfer. + /// If not called, all layers will be transferred. + pub fn layer_range(mut self, layer_range: Range) -> Self { + self.layer_range = Some(layer_range); + self + } + + /// Sets an optional write notification UUID. + pub fn write_notif(mut self, write_notif: uuid::Uuid) -> Self { + self.write_notif = Some(write_notif); + self + } +} + +// Execute method - only available when all required fields are Set + +impl<'a> NixlTransferBuilder<'a, Set, Set, Set, Set, Set> { + /// Executes the NIXL transfer with the configured parameters. + /// + /// This method is only available when all required fields have been set, + /// enforced at compile time by the typestate pattern. 
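Each of the required setters above flips one marker from `Unset` to `Set`. A toy builder (not the patch's `NixlTransferBuilder`, just the pattern it uses) showing how `PhantomData` marker types make `execute()` available only after the required setter has been called:

```rust
use std::marker::PhantomData;

/// Marker types, mirroring the `Unset`/`Set` markers used by the NIXL builder.
struct Unset;
struct Set;

/// Toy builder with one required field (`value`) tracked in the type parameter.
struct Builder<TValue> {
    value: Option<u32>,
    _state: PhantomData<TValue>,
}

impl Builder<Unset> {
    fn new() -> Self {
        Self { value: None, _state: PhantomData }
    }

    /// Setting the required field moves the builder into the `Set` state.
    fn value(self, value: u32) -> Builder<Set> {
        Builder { value: Some(value), _state: PhantomData }
    }
}

impl Builder<Set> {
    /// Only callable once the required field is set; the unwrap is safe
    /// because the `Set` state guarantees `value` is `Some`.
    fn execute(self) -> u32 {
        self.value.unwrap()
    }
}

fn main() {
    let result = Builder::new().value(7).execute();
    assert_eq!(result, 7);
    // Builder::new().execute(); // does not compile: `execute` requires the Set state
}
```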
+ pub(crate) fn execute(self, ctx: &TransferContext) -> Result { + // Unwrap all required fields (safe because typestate guarantees they're set) + let src = self.src.unwrap(); + let dst = self.dst.unwrap(); + let src_block_ids = self.src_block_ids.unwrap(); + let dst_block_ids = self.dst_block_ids.unwrap(); + let strategy = self.strategy.unwrap(); + let layer_range = self.layer_range; + let _write_notif = self.write_notif; + + // Validate layouts + let src_layout = src.layout(); + let dst_layout = dst.layout(); + + if src_layout.num_layers() != dst_layout.num_layers() { + return Err(anyhow!( + "Layouts have incompatible layer counts: src={}, dst={}", + src_layout.num_layers(), + dst_layout.num_layers() + )); + } + + if src_layout.outer_dim() != dst_layout.outer_dim() { + return Err(anyhow!( + "Layouts have incompatible outer dimensions: src={}, dst={}", + src_layout.outer_dim(), + dst_layout.outer_dim() + )); + } + + // Get NIXL agent + let nixl_agent = ctx.nixl_agent(); + + // Determine layer range + let layers = layer_range.unwrap_or(0..src_layout.num_layers()); + + // Determine NIXL operation type + let xfer_op = match strategy { + TransferStrategy::NixlRead | TransferStrategy::NixlReadFlipped => XferOp::Read, + TransferStrategy::NixlWrite | TransferStrategy::NixlWriteFlipped => XferOp::Write, + _ => { + return Err(anyhow!("Invalid NIXL transfer strategy: {:?}", strategy)); + } + }; + + assert!( + nixl_agent.name() == src.nixl_metadata().agent_name(), + "the source must be local" + ); + + // Capture NIXL metadata for both layouts + let src_metadata = src.nixl_metadata(); + let dst_metadata = dst.nixl_metadata(); + + let src_mem_type = src_metadata.mem_type(); + let dst_mem_type = dst_metadata.mem_type(); + + let src_device_id = src_metadata.device_id(); + let dst_device_id = dst_metadata.device_id(); + + // Build XferDescLists for source and destination + let mut src_dl = XferDescList::new(src_mem_type)?; + let mut dst_dl = XferDescList::new(dst_mem_type)?; + + // Add memory regions to descriptor lists + for (&src_block_id, &dst_block_id) in src_block_ids.iter().zip(dst_block_ids.iter()) { + for layer_id in layers.clone() { + for outer_id in 0..src_layout.outer_dim() { + let src_region = src.memory_region(src_block_id, layer_id, outer_id)?; + let dst_region = dst.memory_region(dst_block_id, layer_id, outer_id)?; + + if src_region.size() != dst_region.size() { + return Err(anyhow!( + "Size mismatch at block=({},{}), layer={}, outer={}: src={}, dst={}", + src_block_id, + dst_block_id, + layer_id, + outer_id, + src_region.size(), + dst_region.size() + )); + } + + // Add to source descriptor list + src_dl.add_desc(src_region.addr(), src_region.size(), src_device_id)?; + + // Add to destination descriptor list + dst_dl.add_desc(dst_region.addr(), dst_region.size(), dst_device_id)?; + } + } + } + + // Note: Overlap detection was removed from nixl-sys 0.6.1 + // The NIXL library now handles overlap detection internally + + if matches!( + strategy, + TransferStrategy::NixlReadFlipped | TransferStrategy::NixlWriteFlipped + ) { + std::mem::swap(&mut src_dl, &mut dst_dl); + } + + // Create transfer request + let xfer_req = nixl_agent.create_xfer_req( + xfer_op, + &src_dl, + &dst_dl, + dst_metadata.agent_name(), + None, // opt_args + )?; + + // Post transfer request + // Note: Notification handling via OptArgs can be added later if needed + let still_pending = nixl_agent.post_xfer_req(&xfer_req, None)?; + + if still_pending { + // Register for async completion via status polling + 
Ok(ctx.register_nixl_status(xfer_req)) + } else { + // Transfer completed synchronously + Ok(TransferCompleteNotification::completed()) + } + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/fill.rs b/lib/llm/src/block_manager/v2/physical/transfer/fill.rs new file mode 100644 index 0000000000..d24b6824ea --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/fill.rs @@ -0,0 +1,273 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Block filling operations for testing. +//! +//! This module provides utilities to populate blocks with specific patterns +//! for verification in round-trip tests. + +use super::PhysicalLayout; + +use crate::block_manager::v2::memory::StorageKind; +use aligned_vec::{AVec, avec}; +use anyhow::{Result, anyhow}; +use cudarc::runtime::sys::{cudaMemcpy, cudaMemcpyKind}; + +use std::{ + fs::File, + io::{Seek, Write}, + mem::ManuallyDrop, + ops::Range, + os::fd::FromRawFd, +}; + +/// Fill strategy for block memory. +#[derive(Debug, Clone, Copy)] +pub enum FillPattern { + /// Fill with a constant byte value + Constant(u8), + + /// Fill with a sequential pattern: block_id + layer_id + offset % 256 + Sequential, +} + +/// Fill blocks in a physical layout with a specific pattern. +/// +/// This operation directly writes to memory and should only be used on +/// local layouts. Remote layouts cannot be filled directly. +/// +/// # Arguments +/// * `layout` - The physical layout containing the blocks +/// * `block_ids` - List of block IDs to fill +/// * `pattern` - Fill pattern to use +/// +/// # Errors +/// Returns an error if: +/// - Layout is remote (cannot fill remote memory directly) +/// - Block IDs are out of range +/// - Memory access fails +pub fn fill_blocks( + layout: &PhysicalLayout, + block_ids: &[usize], + pattern: FillPattern, +) -> Result<()> { + // Can only fill local layouts + let config = layout.layout().config(); + let num_layers = config.num_layers; + let outer_dim = config.outer_dim; + + for &block_id in block_ids { + if block_id >= config.num_blocks { + return Err(anyhow!("Block ID {} out of range", block_id)); + } + + // Fill all layers and outer dimensions for this block + for layer_id in 0..num_layers { + for outer_id in 0..outer_dim { + let region = layout.memory_region(block_id, layer_id, outer_id)?; + + match layout.location() { + StorageKind::System | StorageKind::Pinned => { + fill_memory_region( + region.addr(), + region.size(), + block_id, + layer_id, + pattern, + )?; + } + StorageKind::Device(_) => { + let system_region: Vec = vec![0; region.size()]; + fill_memory_region( + system_region.as_ptr() as usize, + system_region.len(), + block_id, + layer_id, + pattern, + )?; + unsafe { + cudaMemcpy( + region.addr() as *mut std::ffi::c_void, + system_region.as_ptr() as *const std::ffi::c_void, + region.size(), + cudaMemcpyKind::cudaMemcpyHostToDevice, + ); + } + } + StorageKind::Disk(fd) => { + let system_region: AVec = avec![[4096]| 0; region.size()]; + fill_memory_region( + system_region.as_ptr() as usize, + system_region.len(), + block_id, + layer_id, + pattern, + )?; + + let mut file = ManuallyDrop::new(unsafe { File::from_raw_fd(fd as i32) }); + + file.seek(std::io::SeekFrom::Start(region.addr() as u64))?; + file.write_all(&system_region)?; + file.sync_all()?; + file.flush()?; + } + } + } + } + } + + Ok(()) +} + +/// Fill a subset of layers in blocks with a specific pattern. 
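`FillPattern::Sequential` writes the byte `((block_id + layer_id + offset) % 256) as u8` at each offset, which is what the fill unit tests further below rely on. A standalone sketch of the formula (not part of the patch):

```rust
/// Byte value written by `FillPattern::Sequential` at a given offset.
fn sequential_byte(block_id: usize, layer_id: usize, offset: usize) -> u8 {
    ((block_id + layer_id + offset) % 256) as u8
}

fn main() {
    // Block 0, layer 0: bytes 0, 1, 2, ...
    assert_eq!(sequential_byte(0, 0, 0), 0);
    assert_eq!(sequential_byte(0, 0, 1), 1);

    // Block 1, layer 1: bytes 2, 3, 4, ... (matches the fill.rs unit test)
    assert_eq!(sequential_byte(1, 1, 0), 2);
    assert_eq!(sequential_byte(1, 1, 1), 3);

    // The pattern wraps at 256 so every value fits in a u8.
    assert_eq!(sequential_byte(0, 0, 256), 0);
}
```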
+/// +/// # Arguments +/// * `layout` - The physical layout containing the blocks +/// * `block_ids` - List of block IDs to fill +/// * `layer_range` - Range of layers to fill +/// * `pattern` - Fill pattern to use +pub fn fill_layers( + layout: &PhysicalLayout, + block_ids: &[usize], + layer_range: Range, + pattern: FillPattern, +) -> Result<()> { + let config = layout.layout().config(); + let num_layers = config.num_layers; + let outer_dim = config.outer_dim; + + if layer_range.end > num_layers { + return Err(anyhow!( + "Layer range {:?} exceeds num_layers {}", + layer_range, + num_layers + )); + } + + for &block_id in block_ids { + if block_id >= config.num_blocks { + return Err(anyhow!("Block ID {} out of range", block_id)); + } + + // Fill specified layers and all outer dimensions + for layer_id in layer_range.clone() { + for outer_id in 0..outer_dim { + let region = layout.memory_region(block_id, layer_id, outer_id)?; + fill_memory_region(region.addr(), region.size(), block_id, layer_id, pattern)?; + } + } + } + + Ok(()) +} + +/// Fill a memory region with the specified pattern. +/// +/// # Safety +/// This function performs unsafe memory writes. The caller must ensure: +/// - The memory region is valid and accessible +/// - No other references exist to this memory +fn fill_memory_region( + addr: usize, + size: usize, + block_id: usize, + layer_id: usize, + pattern: FillPattern, +) -> Result<()> { + unsafe { + let ptr = addr as *mut u8; + match pattern { + FillPattern::Constant(value) => { + std::ptr::write_bytes(ptr, value, size); + } + FillPattern::Sequential => { + for offset in 0..size { + let value = ((block_id + layer_id + offset) % 256) as u8; + ptr.add(offset).write(value); + } + } + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::super::tests::*; + use super::*; + use crate::block_manager::v2::memory::actions::Slice; + + #[test] + fn test_fill_blocks_constant() { + let physical = builder(2) + .fully_contiguous() + .allocate_system() + .build() + .unwrap(); + + fill_blocks(&physical, &[0, 1], FillPattern::Constant(42)).unwrap(); + + // Verify all bytes are set to 42 + assert!( + physical + .memory_region(0, 0, 0) + .unwrap() + .as_slice() + .unwrap() + .iter() + .all(|&b| b == 42) + ); + } + + #[test] + fn test_fill_blocks_sequential() { + let physical = builder(2) + .fully_contiguous() + .allocate_system() + .build() + .unwrap(); + + fill_blocks(&physical, &[0, 1], FillPattern::Sequential).unwrap(); + + let mr = physical.memory_region(0, 0, 0).unwrap(); + let mr_slice = mr.as_slice().unwrap(); + + // Verify pattern is applied (spot check a few bytes) + let first_byte = mr_slice[0]; + let second_byte = mr_slice[1]; + assert_eq!(first_byte, 0); + assert_eq!(second_byte, first_byte.wrapping_add(1)); + + let mr = physical.memory_region(1, 1, 0).unwrap(); + let mr_slice = mr.as_slice().unwrap(); + + let first_byte = mr_slice[0]; + let second_byte = mr_slice[1]; + assert_eq!(first_byte, 2); + assert_eq!(second_byte, first_byte.wrapping_add(1)); + } + + #[test] + fn test_fill_layers() { + let physical = builder(2) + .fully_contiguous() + .allocate_system() + .build() + .unwrap(); + + // Fill only layer 0 + fill_layers(&physical, &[0], 0..1, FillPattern::Constant(0)).unwrap(); + fill_layers(&physical, &[0], 1..2, FillPattern::Constant(1)).unwrap(); + fill_layers(&physical, &[1], 0..1, FillPattern::Constant(100)).unwrap(); + fill_layers(&physical, &[1], 1..2, FillPattern::Constant(101)).unwrap(); + + let mr_00 = physical.memory_region(0, 0, 
0).unwrap().as_slice().unwrap()[0]; + let mr_01 = physical.memory_region(0, 1, 0).unwrap().as_slice().unwrap()[0]; + let mr_10 = physical.memory_region(1, 0, 0).unwrap().as_slice().unwrap()[0]; + let mr_11 = physical.memory_region(1, 1, 0).unwrap().as_slice().unwrap()[0]; + assert_eq!(mr_00, 0); + assert_eq!(mr_01, 1); + assert_eq!(mr_10, 100); + assert_eq!(mr_11, 101); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/mod.rs b/lib/llm/src/block_manager/v2/physical/transfer/mod.rs new file mode 100644 index 0000000000..18935809f2 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/mod.rs @@ -0,0 +1,120 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Transfer module for copying blocks between layouts with different storage locations. +//! +//! This module provides functionality for transferring KV cache blocks between layouts +//! that may be backed by different storage types (GPU memory, pinned host memory, disk, etc.) +//! and potentially across NIXL-connected remote nodes. +//! +//! # Core Concepts +//! +//! - [`PhysicalLayout`]: Wraps a layout with its physical storage location and NIXL metadata +//! - [`LayoutDescriptor`]: Serializable representation for cross-node communication +//! - Transfer strategies: memcpy, CUDA, NIXL based on source/destination locations +//! - Block-wise and layer-wise transfer operations +//! +//! # Usage +//! +//! ```rust,ignore +//! use dynamo_kvbm::v2::transfer::{PhysicalLayout, transfer_blocks}; +//! +//! // Create local physical layout with NIXL registration +//! let src = PhysicalLayout::new_local(src_layout, StorageKind::Device(0)) +//! .with_nixl_registration("local_agent".to_string())?; +//! +//! // Create remote physical layout +//! let dst = PhysicalLayout::new_remote( +//! dst_layout, +//! StorageKind::Pinned, +//! "remote_agent".to_string() +//! ); +//! +//! // Transfer blocks from local to remote +//! let src_block_ids = [0, 1, 2]; +//! let dst_block_ids = [0, 1, 2]; +//! let future = transfer_blocks(&src, &dst, &src_block_ids, &dst_block_ids, &ctx)?; +//! future.await?; +//! ``` + +pub mod capabilities; +pub mod checksum; +pub mod context; +pub mod executor; +pub mod fill; +pub mod nixl_agent; +pub mod notifications; +pub mod options; +pub mod preferences; +pub mod strategy; +pub mod validation; + +#[cfg(test)] +mod tests; + +// Re-export StorageKind +pub use crate::block_manager::v2::memory::StorageKind; + +pub use capabilities::TransferCapabilities; +pub use checksum::{BlockChecksum, compute_block_checksums, compute_layer_checksums}; +pub use fill::{FillPattern, fill_blocks, fill_layers}; +pub use nixl_agent::{NixlAgent, NixlBackendConfig}; +pub use options::{TransferOptions, TransferOptionsBuilder}; +pub use preferences::{NativeVsNixlPolicy, TransferPreferences}; +pub use strategy::{TransferPlan, TransferStrategy}; +pub use validation::BlockValidationError; + +// Internal - TransferContext is now managed by TransportManager +pub(crate) use context::TransferContext; + +pub use super::layout::PhysicalLayout; + +// Re-export manager types - TransportManager is the primary public API +pub use super::manager::{LayoutHandle, SerializedLayout, TransportManager, WorkerAddress}; + +// #[cfg(test)] +// pub use testing::{RoundTripTest, RoundTripTestResult}; + +use anyhow::Result; + +/// Future representing an in-progress transfer operation. +/// +/// The transfer completes when this future resolves. 
+pub type TransferFuture = std::pin::Pin> + Send>>; + +/// Specification for bounce buffer in multi-hop transfers. +/// +/// This structure provides the layout and block IDs to use as an intermediate +/// staging area when direct transfers are not allowed. +pub trait BounceBufferSpec: Send + Sync { + fn layout(&self) -> &PhysicalLayout; + fn block_ids(&self) -> &[usize]; +} + +// #[cfg(all(test, feature = "testing-cuda"))] +// mod cuda_integration_tests { +// use super::*; +// use crate::block_manager::v2::layout::{ +// FullyContiguousLayout, Layout, LayoutConfig, MemoryRegion, OwnedMemoryRegion, +// }; +// use cudarc::driver::CudaContext; +// use std::sync::Arc; + +// // TODO: Add CUDA-specific integration tests +// // These would test: +// // - H2D transfers +// // - D2H transfers +// // - D2D transfers +// // - Async completion via event synchronization +// } + +// #[cfg(all(test, feature = "testing-nixl"))] +// mod nixl_integration_tests { +// use super::*; + +// // TODO: Add NIXL-specific integration tests +// // These would test: +// // - Remote memory access via NIXL Read +// // - Disk-backed transfers via NIXL Write +// // - Cross-node serialization with LayoutDescriptor +// } diff --git a/lib/llm/src/block_manager/v2/physical/transfer/nixl_agent/config.rs b/lib/llm/src/block_manager/v2/physical/transfer/nixl_agent/config.rs new file mode 100644 index 0000000000..b25680fe3b --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/nixl_agent/config.rs @@ -0,0 +1,170 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! NIXL backend configuration with Figment support. +//! +//! This module provides configuration extraction for NIXL backends from +//! environment variables with the pattern: `DYN_KVBM_NIXL_BACKEND__=` + +use anyhow::{Result, bail}; +use dynamo_runtime::config::parse_bool; +use std::collections::HashSet; + +/// Configuration for NIXL backends. +/// +/// Supports extracting backend configurations from environment variables: +/// - `DYN_KVBM_NIXL_BACKEND_UCX=true` - Enable UCX backend with default params +/// - `DYN_KVBM_NIXL_BACKEND_GDS=false` - Explicitly disable GDS backend +/// - Valid values: true/false, 1/0, on/off, yes/no (case-insensitive) +/// - Invalid values (e.g., "maybe", "random") will cause an error +/// - Custom params (e.g., `DYN_KVBM_NIXL_BACKEND_UCX_PARAM1=value`) will cause an error +/// +/// # Examples +/// +/// ```rust,ignore +/// // Extract from environment +/// let config = NixlBackendConfig::from_env()?; +/// +/// // Or combine with builder overrides +/// let config = NixlBackendConfig::from_env()? +/// .with_backend("ucx") +/// .with_backend("gds"); +/// ``` +#[derive(Debug, Clone, Default)] +pub struct NixlBackendConfig { + /// Set of enabled backends (just backend names, no custom params yet) + backends: HashSet, +} + +impl NixlBackendConfig { + /// Create a new empty configuration. + pub fn new() -> Self { + Self::default() + } + + /// Create configuration from environment variables. + /// + /// Extracts backends from `DYN_KVBM_NIXL_BACKEND_=` variables. 
+ /// + /// # Errors + /// Returns an error if: + /// - Custom parameters are detected (not yet supported) + /// - Invalid boolean values are provided (must be truthy or falsey) + pub fn from_env() -> Result { + let mut backends = HashSet::new(); + + // Extract all environment variables that match our pattern + for (key, value) in std::env::vars() { + if let Some(remainder) = key.strip_prefix("DYN_KVBM_NIXL_BACKEND_") { + // Check if there's an underscore (indicating custom params) + if remainder.contains('_') { + bail!( + "Custom NIXL backend parameters are not yet supported. \ + Found: {}. Please use only DYN_KVBM_NIXL_BACKEND_=true \ + to enable backends with default parameters.", + key + ); + } + + // Simple backend enablement (e.g., DYN_KVBM_NIXL_BACKEND_UCX=true) + let backend_name = remainder.to_uppercase(); + match parse_bool(&value) { + Ok(true) => { + backends.insert(backend_name); + } + Ok(false) => { + // Explicitly disabled, don't add to backends + continue; + } + Err(e) => bail!("Invalid value for {}: {}", key, e), + } + } + } + + // Default to UCX if no backends specified + if backends.is_empty() { + backends.insert("UCX".to_string()); + } + + Ok(Self { backends }) + } + + /// Add a backend to the configuration. + /// + /// Backend names will be converted to uppercase for consistency. + pub fn with_backend(mut self, backend: impl Into) -> Self { + self.backends.insert(backend.into().to_uppercase()); + self + } + + /// Get the set of enabled backends. + pub fn backends(&self) -> &HashSet { + &self.backends + } + + /// Check if a specific backend is enabled. + pub fn has_backend(&self, backend: &str) -> bool { + self.backends.contains(&backend.to_uppercase()) + } + + /// Merge another configuration into this one. + /// + /// Backends from the other configuration will be added to this one. + pub fn merge(mut self, other: NixlBackendConfig) -> Self { + self.backends.extend(other.backends); + self + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_new_config_is_empty() { + let config = NixlBackendConfig::new(); + assert!(config.backends().is_empty()); + } + + #[test] + fn test_with_backend() { + let config = NixlBackendConfig::new() + .with_backend("ucx") + .with_backend("gds_mt"); + + assert!(config.has_backend("ucx")); + assert!(config.has_backend("UCX")); + assert!(config.has_backend("gds_mt")); + assert!(config.has_backend("GDS_MT")); + assert!(!config.has_backend("other")); + } + + #[test] + fn test_merge_configs() { + let config1 = NixlBackendConfig::new().with_backend("ucx"); + let config2 = NixlBackendConfig::new().with_backend("gds"); + + let merged = config1.merge(config2); + + assert!(merged.has_backend("ucx")); + assert!(merged.has_backend("gds")); + } + + #[test] + fn test_backend_name_case_insensitive() { + let config = NixlBackendConfig::new() + .with_backend("ucx") + .with_backend("Gds_mt") + .with_backend("OTHER"); + + assert!(config.has_backend("UCX")); + assert!(config.has_backend("ucx")); + assert!(config.has_backend("GDS_MT")); + assert!(config.has_backend("gds_mt")); + assert!(config.has_backend("OTHER")); + assert!(config.has_backend("other")); + } + + // Note: Testing from_env() would require setting environment variables, + // which is challenging in unit tests. This is better tested with integration tests. 
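The extraction rules in `from_env` are: strip the `DYN_KVBM_NIXL_BACKEND_` prefix, reject keys whose remainder still contains an underscore (per-backend parameters are unsupported, which also means names such as `GDS_MT` cannot currently be enabled through this path), uppercase the backend name, parse the value as a boolean, and default to UCX when nothing is enabled. A standalone sketch of those rules over an explicit list of key/value pairs, with a local stand-in for `dynamo_runtime::config::parse_bool` (not the patch's implementation):

```rust
use std::collections::HashSet;

/// Local stand-in for `dynamo_runtime::config::parse_bool`; accepts the
/// documented truthy/falsey spellings, case-insensitively.
fn parse_bool(value: &str) -> Result<bool, String> {
    match value.to_ascii_lowercase().as_str() {
        "true" | "1" | "on" | "yes" => Ok(true),
        "false" | "0" | "off" | "no" => Ok(false),
        other => Err(format!("not a boolean: {other}")),
    }
}

/// Extract enabled backends from (key, value) pairs, mirroring the rules
/// in `NixlBackendConfig::from_env`.
fn backends_from_vars<I>(vars: I) -> Result<HashSet<String>, String>
where
    I: IntoIterator<Item = (String, String)>,
{
    let mut backends = HashSet::new();
    for (key, value) in vars {
        if let Some(remainder) = key.strip_prefix("DYN_KVBM_NIXL_BACKEND_") {
            // An extra underscore would indicate per-backend parameters,
            // which are rejected for now.
            if remainder.contains('_') {
                return Err(format!("custom backend parameters not supported: {key}"));
            }
            if parse_bool(&value)? {
                backends.insert(remainder.to_uppercase());
            }
        }
    }
    // Default to UCX when nothing is enabled explicitly.
    if backends.is_empty() {
        backends.insert("UCX".to_string());
    }
    Ok(backends)
}

fn main() {
    let vars = vec![
        ("DYN_KVBM_NIXL_BACKEND_ucx".to_string(), "on".to_string()),
        ("DYN_KVBM_NIXL_BACKEND_GDS".to_string(), "false".to_string()),
        ("UNRELATED".to_string(), "1".to_string()),
    ];
    let backends = backends_from_vars(vars).unwrap();
    assert!(backends.contains("UCX"));
    assert!(!backends.contains("GDS"));
}
```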
+} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/nixl_agent/mod.rs b/lib/llm/src/block_manager/v2/physical/transfer/nixl_agent/mod.rs new file mode 100644 index 0000000000..99280ba8e8 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/nixl_agent/mod.rs @@ -0,0 +1,258 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! NIXL agent wrapper and configuration. +//! +//! This module provides: +//! - `NixlAgent`: Wrapper around nixl_sys::Agent that tracks initialized backends +//! - `NixlBackendConfig`: Configuration for NIXL backends from environment variables + +mod config; + +pub use config::NixlBackendConfig; + +use anyhow::Result; +use nixl_sys::Agent as RawNixlAgent; +use std::collections::HashSet; + +/// A NIXL agent wrapper that tracks which backends were successfully initialized. +/// +/// This wrapper provides: +/// - Runtime validation of backend availability +/// - Clear error messages when operations need unavailable backends +/// - Single source of truth for backend state in tests and production +/// +/// # Backend Tracking +/// +/// Since `nixl_sys::Agent` doesn't provide a method to query active backends, +/// we track them during initialization. The `available_backends` set is populated +/// based on successful `create_backend()` calls. +#[derive(Clone, Debug)] +pub struct NixlAgent { + agent: RawNixlAgent, + available_backends: HashSet, +} + +impl NixlAgent { + /// Create a new NIXL agent with the specified backends. + /// + /// Attempts to initialize all requested backends. If a backend fails, it logs + /// a warning but continues with remaining backends. At least one backend must + /// succeed or this returns an error. + /// + /// # Arguments + /// * `name` - Agent name + /// * `backends` - List of backend names to try (e.g., `&["UCX", "GDS_MT, "POSIX"]`) + /// + /// # Returns + /// A `NixlAgent` that tracks which backends were successfully initialized. + /// + /// # Errors + /// Returns an error if: + /// - Agent creation fails + /// - All backend initialization attempts fail + pub fn new_with_backends(name: &str, backends: &[&str]) -> Result { + let agent = RawNixlAgent::new(name)?; + let mut available_backends = HashSet::new(); + + for backend in backends { + let backend_upper = backend.to_uppercase(); + match agent.get_plugin_params(&backend_upper) { + Ok((_, params)) => match agent.create_backend(&backend_upper, ¶ms) { + Ok(_) => { + available_backends.insert(backend_upper); + } + Err(e) => { + eprintln!( + "✗ Failed to create {} backend: {}. Operations requiring this backend will fail.", + backend_upper, e + ); + } + }, + Err(_) => { + eprintln!( + "✗ No {} plugin found. Operations requiring this backend will fail.", + backend_upper + ); + } + } + } + + if available_backends.is_empty() { + anyhow::bail!("Failed to initialize any NIXL backends from {:?}", backends); + } + + Ok(Self { + agent, + available_backends, + }) + } + + /// Create a NIXL agent requiring ALL specified backends to be available. + /// + /// Unlike `new_with_backends()` which continues if some backends fail, this method + /// will return an error if ANY backend fails to initialize. Use this in production + /// when specific backends are mandatory. + /// + /// # Arguments + /// * `name` - Agent name + /// * `backends` - List of backend names that MUST be available + /// + /// # Returns + /// A `NixlAgent` with all requested backends initialized. 
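A short usage sketch of the tolerant constructor plus the per-operation guard described above (imports elided; marked `ignore` because it needs NIXL plugins at runtime, matching the crate's other `ignore` doc examples):

```rust,ignore
// Imports elided; `NixlAgent` is re-exported from the transfer module.
fn open_storage_agent() -> anyhow::Result<()> {
    // Tolerant construction: failed backends are logged and skipped,
    // but at least one must initialize successfully.
    let agent = NixlAgent::new_with_backends("worker-0", &["UCX", "GDS_MT", "POSIX"])?;

    if agent.has_backend("GDS_MT") {
        // GDS-specific fast path.
    }

    // Make a specific backend mandatory for a given operation.
    agent.require_backend("UCX")?;
    Ok(())
}
```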
+ /// + /// # Errors + /// Returns an error if: + /// - Agent creation fails + /// - Any backend fails to initialize + /// + /// # Example + /// ```ignore + /// // In production: require both UCX and GDS, fail if either is missing + /// let agent = NixlAgent::require_backends("worker-0", &["UCX", "GDS_MT])?; + /// ``` + pub fn require_backends(name: &str, backends: &[&str]) -> Result { + let agent = RawNixlAgent::new(name)?; + let mut available_backends = HashSet::new(); + let mut failed_backends = Vec::new(); + + for backend in backends { + let backend_upper = backend.to_uppercase(); + match agent.get_plugin_params(&backend_upper) { + Ok((_, params)) => match agent.create_backend(&backend_upper, ¶ms) { + Ok(_) => { + available_backends.insert(backend_upper); + } + Err(e) => { + eprintln!("✗ Failed to create {} backend: {}", backend_upper, e); + failed_backends + .push((backend_upper.clone(), format!("create failed: {}", e))); + } + }, + Err(e) => { + eprintln!("✗ No {} plugin found", backend_upper); + failed_backends + .push((backend_upper.clone(), format!("plugin not found: {}", e))); + } + } + } + + if !failed_backends.is_empty() { + let error_details: Vec = failed_backends + .iter() + .map(|(name, reason)| format!("{}: {}", name, reason)) + .collect(); + anyhow::bail!( + "Failed to initialize required backends: [{}]", + error_details.join(", ") + ); + } + + Ok(Self { + agent, + available_backends, + }) + } + + /// Create a NIXL agent with default backends for testing/development. + /// + /// Attempts to initialize UCX, GDS, and POSIX backends. If some are unavailable, + /// continues with whatever succeeds. This ensures code works in various environments. + pub fn new_default(name: &str) -> Result { + Self::new_with_backends(name, &["UCX", "GDS_MT", "POSIX"]) + } + + /// Get a reference to the underlying raw NIXL agent. + pub fn raw_agent(&self) -> &RawNixlAgent { + &self.agent + } + + /// Consume and return the underlying raw NIXL agent. + /// + /// **Warning**: Once consumed, backend tracking is lost. Use this only when + /// interfacing with code that requires `nixl_sys::Agent` directly. + pub fn into_raw_agent(self) -> RawNixlAgent { + self.agent + } + + /// Check if a specific backend is available. + pub fn has_backend(&self, backend: &str) -> bool { + self.available_backends.contains(&backend.to_uppercase()) + } + + /// Get all available backends. + pub fn backends(&self) -> &HashSet { + &self.available_backends + } + + /// Require a specific backend, returning an error if unavailable. + /// + /// Use this at the start of operations that need specific backends. + /// + /// # Example + /// ```ignore + /// agent.require_backend("GDS_MT)?; + /// // Proceed with GDS-specific operations + /// ``` + pub fn require_backend(&self, backend: &str) -> Result<()> { + let backend_upper = backend.to_uppercase(); + if self.has_backend(&backend_upper) { + Ok(()) + } else { + anyhow::bail!( + "Operation requires {} backend, but it was not initialized. 
Available backends: {:?}", + backend_upper, + self.available_backends + ) + } + } +} + +// Delegate common methods to the underlying agent +impl std::ops::Deref for NixlAgent { + type Target = RawNixlAgent; + + fn deref(&self) -> &Self::Target { + &self.agent + } +} + +#[cfg(all(test, feature = "testing-nixl"))] +mod tests { + use super::*; + + #[test] + fn test_agent_backend_tracking() { + // Try to create agent with UCX + let agent = NixlAgent::new_with_backends("test", &["UCX"]); + + // Should succeed if UCX is available + if let Ok(agent) = agent { + assert!(agent.has_backend("UCX")); + assert!(agent.has_backend("ucx")); // Case insensitive + } + } + + #[test] + fn test_require_backend() { + let agent = NixlAgent::new_with_backends("test", &["UCX"]).expect("Need UCX for test"); + + // Should succeed for available backend + assert!(agent.require_backend("UCX").is_ok()); + + // Should fail for unavailable backend + assert!(agent.require_backend("GDS_MT").is_err()); + } + + #[test] + fn test_require_backends_strict() { + // Should succeed if UCX is available + let agent = NixlAgent::require_backends("test_strict", &["UCX"]) + .expect("Failed to require backends"); + assert!(agent.has_backend("UCX")); + + // Should fail if any backend is missing (GDS likely not available) + let result = NixlAgent::require_backends("test_strict_fail", &["UCX", "DUDE"]); + assert!(result.is_err()); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/notifications/cuda_event.rs b/lib/llm/src/block_manager/v2/physical/transfer/notifications/cuda_event.rs new file mode 100644 index 0000000000..dd4f30e38c --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/notifications/cuda_event.rs @@ -0,0 +1,88 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! CUDA event polling-based completion checker. + +use anyhow::Result; +use cudarc::driver::{CudaEvent, DriverError, result as cuda_result, sys::CUresult}; + +use super::CompletionChecker; + +/// Completion checker that polls CUDA event status. 
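The `Deref` implementation above lets callers invoke the raw agent's methods directly on the wrapper while the wrapper keeps its own bookkeeping. A standalone sketch of that delegation pattern with toy types (not the real agent):

```rust
use std::ops::Deref;

/// Stand-in for the underlying `nixl_sys::Agent`.
struct RawAgent {
    name: String,
}

impl RawAgent {
    fn name(&self) -> &str {
        &self.name
    }
}

/// Wrapper that adds backend tracking but still exposes the inner API,
/// mirroring how `NixlAgent` derefs to the raw agent.
struct Agent {
    inner: RawAgent,
    initialized_backends: Vec<String>,
}

impl Deref for Agent {
    type Target = RawAgent;

    fn deref(&self) -> &Self::Target {
        &self.inner
    }
}

fn main() {
    let agent = Agent {
        inner: RawAgent { name: "worker-0".into() },
        initialized_backends: vec!["UCX".into()],
    };
    // `name()` is resolved on the inner RawAgent through Deref.
    assert_eq!(agent.name(), "worker-0");
    assert_eq!(agent.initialized_backends.len(), 1);
}
```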
+pub struct CudaEventChecker { + event: CudaEvent, +} + +impl CudaEventChecker { + pub fn new(event: CudaEvent) -> Self { + Self { event } + } +} + +impl CompletionChecker for CudaEventChecker { + fn is_complete(&self) -> Result { + // Query the CUDA event to check if it's complete + // cudaEventQuery returns cudaSuccess if complete, cudaErrorNotReady if still pending + unsafe { + match cuda_result::event::query(self.event.cu_event()) { + Ok(()) => Ok(true), // Event is complete + Err(DriverError(CUresult::CUDA_ERROR_NOT_READY)) => Ok(false), + Err(e) => Err(anyhow::anyhow!("CUDA event query failed: {:?}", e)), + } + } + } +} + +#[cfg(all(test, feature = "testing-cuda"))] +mod tests { + use crate::block_manager::v2::physical::manager::TransportManager; + use crate::block_manager::v2::physical::transfer::nixl_agent::NixlAgent; + use crate::block_manager::v2::physical::transfer::tests::cuda::CudaSleep; + use std::time::{Duration, Instant}; + + #[tokio::test] + async fn test_cuda_event_delayed_notification() { + let agent = NixlAgent::require_backends("test_agent", &[]).unwrap(); + let manager = TransportManager::builder() + .worker_id(0) + .cuda_device_id(0) + .nixl_agent(agent) + .build() + .unwrap(); + + let stream = manager.h2d_stream(); + let cuda_ctx = manager.cuda_context(); + + // Get or create the CudaSleep utility (compiles kernel and calibrates on first use) + let cuda_sleep = CudaSleep::for_context(cuda_ctx).unwrap(); + + // Test 1: Launch sleep and wait via async notification + let t0_queue_start = Instant::now(); + cuda_sleep + .launch(Duration::from_millis(600), stream) + .unwrap(); + let queue_time = t0_queue_start.elapsed(); + + let event = stream.record_event(None).unwrap(); + let notification = manager.register_cuda_event(event); + notification.await.unwrap(); + let wait_time = t0_queue_start.elapsed() - queue_time; + + println!( + "GPU sleep test: queue {:?}, wait {:?}", + queue_time, wait_time + ); + + assert!( + queue_time < Duration::from_millis(10), + "launching the sleep kernel should be fast: {:?}", + queue_time + ); + + assert!( + wait_time >= Duration::from_millis(500), + "wait time should reflect >=500ms of GPU work: {:?}", + wait_time + ); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/notifications/mod.rs b/lib/llm/src/block_manager/v2/physical/transfer/notifications/mod.rs new file mode 100644 index 0000000000..d23ee4309e --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/notifications/mod.rs @@ -0,0 +1,176 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Transfer completion notification system. +//! +//! This module provides abstractions for waiting on transfer completions using different +//! mechanisms: polling-based (NIXL status, CUDA events) and event-based (NIXL notifications). + +use std::collections::HashMap; +use std::time::{Duration, Instant}; + +use anyhow::Result; +use tokio::sync::{mpsc, oneshot}; +use tokio::time::interval; +use tracing::warn; +use uuid::Uuid; + +pub mod cuda_event; +pub mod nixl_events; +pub mod nixl_status; +pub mod notification; + +pub use cuda_event::CudaEventChecker; +pub use nixl_events::{RegisterNixlNotification, process_nixl_notification_events}; +pub use nixl_status::NixlStatusChecker; +pub use notification::TransferCompleteNotification; + +/// Trait for checking if a transfer operation has completed. +/// Supports polling-based completion checks (NIXL status, CUDA events). 
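The polling half of this module follows one shape: register a checker plus a oneshot sender, poll the checker on a short interval, and resolve the sender when it reports completion. A compact self-contained model of that shape (not the patch's `process_polling_notifications`; assumes tokio with the rt, time, sync, and macros features):

```rust
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;
use tokio::sync::oneshot;

/// Simplified version of the `CompletionChecker` idea: anything that can be
/// polled for completion.
trait Checker: Send + 'static {
    fn is_complete(&self) -> bool;
}

struct FlagChecker(Arc<AtomicBool>);

impl Checker for FlagChecker {
    fn is_complete(&self) -> bool {
        self.0.load(Ordering::Relaxed)
    }
}

/// Poll the checker on an interval and resolve the oneshot once it finishes,
/// mirroring the overall shape of the polling handler.
async fn poll_until_done<C: Checker>(checker: C, done: oneshot::Sender<()>) {
    let mut interval = tokio::time::interval(Duration::from_millis(1));
    loop {
        interval.tick().await;
        if checker.is_complete() {
            let _ = done.send(());
            return;
        }
    }
}

#[tokio::main]
async fn main() {
    let flag = Arc::new(AtomicBool::new(false));
    let (tx, rx) = oneshot::channel();

    tokio::spawn(poll_until_done(FlagChecker(flag.clone()), tx));

    // Simulate the "transfer" finishing a little later.
    tokio::time::sleep(Duration::from_millis(10)).await;
    flag.store(true, Ordering::Relaxed);

    rx.await.expect("poller resolved the notification");
}
```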
+pub trait CompletionChecker: Send { + /// Returns true if the transfer is complete, false if still pending. + fn is_complete(&self) -> Result; +} + +/// Registration message for polling-based transfer completion. +pub struct RegisterPollingNotification { + pub uuid: Uuid, + pub checker: C, + pub done: oneshot::Sender>, +} + +/// Tracking struct for outstanding polling-based transfers. +struct OutstandingPollingTransfer { + checker: C, + done: oneshot::Sender>, + arrived_at: Instant, + last_warned_at: Option, +} + +/// Helper function to check if a transfer should be warned about and log the warning. +/// Returns the new last_warned_at time if a warning was issued. +fn check_and_warn_slow_transfer( + uuid: &Uuid, + arrived_at: Instant, + last_warned_at: Option, +) -> Option { + let elapsed = arrived_at.elapsed(); + if elapsed > Duration::from_secs(60) { + let should_warn = last_warned_at + .map(|last| last.elapsed() > Duration::from_secs(30)) + .unwrap_or(true); + + if should_warn { + warn!( + uuid = %uuid, + elapsed_secs = elapsed.as_secs(), + "Transfer has been pending for over 1 minute" + ); + return Some(Instant::now()); + } + } + last_warned_at +} + +/// Generic polling-based transfer completion handler. +/// Works with any CompletionChecker implementation (NIXL status, CUDA events, etc.) +pub async fn process_polling_notifications( + mut rx: mpsc::Receiver>, +) { + let mut outstanding: HashMap> = HashMap::new(); + let mut check_interval = interval(Duration::from_millis(1)); + + loop { + tokio::select! { + // Handle new transfer requests + notification = rx.recv() => { + match notification { + Some(notif) => { + outstanding.insert(notif.uuid, OutstandingPollingTransfer { + checker: notif.checker, + done: notif.done, + arrived_at: Instant::now(), + last_warned_at: None, + }); + } + None => { + // Channel closed, finish processing outstanding transfers then exit + break; + } + } + } + + // Periodically check status of outstanding transfers + _ = check_interval.tick(), if !outstanding.is_empty() => { + let mut completed = Vec::new(); + + for (uuid, transfer) in outstanding.iter_mut() { + // Check transfer status + match transfer.checker.is_complete() { + Ok(true) => { + // Transfer complete - mark for removal + completed.push((*uuid, Ok(()))); + } + Ok(false) => { + // Transfer still in progress - check if we should warn + transfer.last_warned_at = check_and_warn_slow_transfer( + uuid, + transfer.arrived_at, + transfer.last_warned_at, + ); + } + Err(e) => { + warn!( + uuid = %uuid, + error = %e, + "Transfer status check failed" + ); + completed.push((*uuid, Err(e))); + } + } + } + + // Remove completed transfers and signal completion + for (uuid, result) in completed { + if let Some(transfer) = outstanding.remove(&uuid) { + // Signal completion (ignore if receiver dropped) + let _ = transfer.done.send(result); + } + } + } + } + } + + // Channel closed, but we may still have outstanding transfers + // Continue processing them until all are complete + while !outstanding.is_empty() { + check_interval.tick().await; + + let mut completed = Vec::new(); + + for (uuid, transfer) in outstanding.iter() { + match transfer.checker.is_complete() { + Ok(true) => { + completed.push((*uuid, Ok(()))); + } + Ok(false) => { + // Still pending + } + Err(e) => { + warn!( + uuid = %uuid, + error = %e, + "Transfer status check failed during shutdown" + ); + completed.push((*uuid, Err(e))); + } + } + } + + for (uuid, result) in completed { + if let Some(transfer) = outstanding.remove(&uuid) { + let _ = 
transfer.done.send(result); + } + } + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/notifications/nixl_events.rs b/lib/llm/src/block_manager/v2/physical/transfer/notifications/nixl_events.rs new file mode 100644 index 0000000000..65a02936d6 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/notifications/nixl_events.rs @@ -0,0 +1,188 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! NIXL notification-based completion handler. + +use std::collections::HashMap; +use std::time::{Duration, Instant}; + +use anyhow::Result; +use nixl_sys::{Agent as NixlAgent, NotificationMap, XferRequest}; +use tokio::sync::{mpsc, oneshot}; +use tokio::time::interval; +use tracing::warn; +use uuid::Uuid; + +/// Registration message for NIXL notification-based transfer completion. +pub struct RegisterNixlNotification { + pub uuid: Uuid, + pub xfer_req: XferRequest, + pub done: oneshot::Sender>, +} + +/// Tracking struct for outstanding NIXL notification transfers. +struct OutstandingTransfer { + #[allow(dead_code)] // Kept for potential future cleanup or debugging + xfer_req: XferRequest, + done: oneshot::Sender>, + arrived_at: Instant, + last_warned_at: Option, +} + +/// Helper function to check if a transfer should be warned about and log the warning. +/// Returns the new last_warned_at time if a warning was issued. +fn check_and_warn_slow_transfer( + uuid: &Uuid, + arrived_at: Instant, + last_warned_at: Option, +) -> Option { + let elapsed = arrived_at.elapsed(); + if elapsed > Duration::from_secs(60) { + let should_warn = last_warned_at + .map(|last| last.elapsed() > Duration::from_secs(30)) + .unwrap_or(true); + + if should_warn { + warn!( + uuid = %uuid, + elapsed_secs = elapsed.as_secs(), + "Transfer has been pending for over 1 minute" + ); + return Some(Instant::now()); + } + } + last_warned_at +} + +/// NIXL notification-based transfer completion handler. +/// Fetches notifications in batches and matches them against outstanding transfers. +pub async fn process_nixl_notification_events( + agent: NixlAgent, + mut rx: mpsc::Receiver, +) { + let mut outstanding: HashMap = HashMap::new(); + let mut check_interval = interval(Duration::from_millis(1)); + + loop { + tokio::select! 
{ + // Handle new transfer requests + notification = rx.recv() => { + match notification { + Some(notif) => { + outstanding.insert(notif.uuid, OutstandingTransfer { + xfer_req: notif.xfer_req, + done: notif.done, + arrived_at: Instant::now(), + last_warned_at: None, + }); + } + None => { + // Channel closed, finish processing outstanding transfers then exit + break; + } + } + } + + // Periodically fetch and process notifications + _ = check_interval.tick(), if !outstanding.is_empty() => { + // Create notification map inside this branch to avoid Send issues + let mut notif_map = match NotificationMap::new() { + Ok(map) => map, + Err(e) => { + warn!(error = %e, "Failed to create notification map"); + continue; + } + }; + + // Fetch all pending notifications + if let Err(e) = agent.get_notifications(&mut notif_map, None) { + warn!(error = %e, "Failed to fetch NIXL notifications"); + continue; + } + + // Process notifications and match against outstanding transfers + let notifications = match notif_map.take_notifs() { + Ok(notifs) => notifs, + Err(e) => { + warn!(error = %e, "Failed to extract notifications from map"); + continue; + } + }; + + let mut completed = Vec::new(); + + // Iterate through all notifications + for (_agent_name, notif_strings) in notifications { + for notif_str in notif_strings { + // Try to parse notification as UUID + // NOTE: This assumes notifications contain UUIDs. + // The actual format may be different and may need adjustment. + if let Ok(notif_uuid) = Uuid::parse_str(¬if_str) { + if outstanding.contains_key(¬if_uuid) { + completed.push(notif_uuid); + } else { + // Notification arrived before we started waiting for it + // This is the race condition we need to handle + warn!( + uuid = %notif_uuid, + "Received notification for transfer not in outstanding map (early arrival)" + ); + } + } + } + } + + // Check for slow transfers and update warnings + for (uuid, transfer) in outstanding.iter_mut() { + if !completed.contains(uuid) { + transfer.last_warned_at = check_and_warn_slow_transfer( + uuid, + transfer.arrived_at, + transfer.last_warned_at, + ); + } + } + + // Remove completed transfers and signal completion + for uuid in completed { + if let Some(transfer) = outstanding.remove(&uuid) { + let _ = transfer.done.send(Ok(())); + } + } + } + } + } + + // Channel closed, but we may still have outstanding transfers + // Continue processing them until all are complete + while !outstanding.is_empty() { + check_interval.tick().await; + + let mut notif_map = match NotificationMap::new() { + Ok(map) => map, + Err(_) => continue, + }; + + if let Ok(()) = agent.get_notifications(&mut notif_map, None) + && let Ok(notifications) = notif_map.take_notifs() + { + let mut completed = Vec::new(); + + for (_agent_name, notif_strings) in notifications { + for notif_str in notif_strings { + if let Ok(notif_uuid) = Uuid::parse_str(¬if_str) + && outstanding.contains_key(¬if_uuid) + { + completed.push(notif_uuid); + } + } + } + + for uuid in completed { + if let Some(transfer) = outstanding.remove(&uuid) { + let _ = transfer.done.send(Ok(())); + } + } + } + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/notifications/nixl_status.rs b/lib/llm/src/block_manager/v2/physical/transfer/notifications/nixl_status.rs new file mode 100644 index 0000000000..b1b6027a1a --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/notifications/nixl_status.rs @@ -0,0 +1,33 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! NIXL status polling-based completion checker. + +use anyhow::{Result, anyhow}; +use nixl_sys::{Agent as NixlAgent, XferRequest}; + +use super::CompletionChecker; + +/// Completion checker that polls NIXL transfer status. +pub struct NixlStatusChecker { + agent: NixlAgent, + xfer_req: XferRequest, +} + +impl NixlStatusChecker { + pub fn new(agent: NixlAgent, xfer_req: XferRequest) -> Self { + Self { agent, xfer_req } + } +} + +impl CompletionChecker for NixlStatusChecker { + fn is_complete(&self) -> Result { + // get_xfer_status returns XferStatus enum: + // - XferStatus::Success means transfer is complete + // - XferStatus::InProgress means still pending + match self.agent.get_xfer_status(&self.xfer_req) { + Ok(status) => Ok(status.is_success()), + Err(e) => Err(anyhow!("NIXL transfer status check failed: {}", e)), + } + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/notifications/notification.rs b/lib/llm/src/block_manager/v2/physical/transfer/notifications/notification.rs new file mode 100644 index 0000000000..d95a1d0316 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/notifications/notification.rs @@ -0,0 +1,58 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Transfer completion notification handle. + +use anyhow::Result; +use tokio::sync::oneshot; + +/// Notification handle for an in-progress transfer. +/// +/// This object can be awaited to block until the transfer completes. +/// The transfer is tracked by a background handler that polls for completion +/// or processes notification events. +pub struct TransferCompleteNotification { + pub(crate) status: oneshot::Receiver>, +} + +impl TransferCompleteNotification { + /// Create a notification that is already completed (for synchronous transfers). + /// + /// This is useful for transfers that complete immediately without needing + /// background polling, such as memcpy operations. + pub fn completed() -> Self { + let (tx, rx) = oneshot::channel(); + // Signal completion immediately + let _ = tx.send(Ok(())); + Self { status: rx } + } + + /// Wait for the transfer to complete (blocking). + /// + /// This method blocks the current thread until the transfer completes. + /// Use `.await` for async contexts. + /// + /// Returns `Ok(())` when the transfer successfully completes, or an error + /// if the background handler was dropped before completion or if the transfer failed. + pub fn wait(self) -> Result<()> { + self.status + .blocking_recv() + .map_err(|_| anyhow::anyhow!("Transfer handler dropped before completion"))? + } +} + +impl std::future::Future for TransferCompleteNotification { + type Output = Result<()>; + + fn poll( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll { + use std::pin::Pin; + Pin::new(&mut self.status).poll(cx).map(|result| { + result + .map_err(|_| anyhow::anyhow!("Transfer handler dropped before completion")) + .and_then(|r| r) + }) + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/options.rs b/lib/llm/src/block_manager/v2/physical/transfer/options.rs new file mode 100644 index 0000000000..3eee954b4d --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/options.rs @@ -0,0 +1,117 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +//! Transfer options for configuring block and layer transfers. + +use super::BounceBufferSpec; +use derive_builder::Builder; +use std::{ops::Range, sync::Arc}; + +/// Options for configuring transfer operations. +/// +/// This structure provides configuration for block and layer transfers, +/// including layer ranges, NIXL write notifications, and bounce buffers. +/// +/// # Examples +/// +/// ```rust,ignore +/// let options = TransferOptions::builder() +/// .nixl_write_notification(42) +/// .layer_range(0..10) +/// .build(); +/// ``` +#[derive(Clone, Default, Builder)] +#[builder(pattern = "owned", default)] +pub struct TransferOptions { + /// Range of layers to transfer (None = all layers). + /// + /// When specified, only the layers in this range will be transferred. + /// This is useful for partial block transfers or layer-specific operations. + #[builder(default, setter(strip_option))] + pub layer_range: Option>, + + /// NIXL write notification value delivered after RDMA write completes. + /// + /// When specified, NIXL will deliver this notification value to the remote + /// node after the RDMA write operation completes. This enables efficient + /// notification of transfer completion without requiring polling. + #[builder(default, setter(strip_option))] + pub nixl_write_notification: Option, + + /// Bounce buffer specification for multi-hop transfers. + /// + /// When direct transfers are not allowed or efficient, this specifies + /// an intermediate staging area. The transfer will be split into two hops: + /// source → bounce buffer → destination. + #[builder(default, setter(strip_option, into))] + pub bounce_buffer: Option>, +} + +impl TransferOptions { + /// Create a new builder for transfer options. + pub fn builder() -> TransferOptionsBuilder { + TransferOptionsBuilder::default() + } + + /// Create transfer options from an optional layer range. + pub fn from_layer_range(layer_range: Option>) -> Self { + Self { + layer_range, + ..Self::default() + } + } + + /// Create default transfer options. + /// + /// This transfers all layers with no special configuration. 
+ pub fn new() -> Self { + Self::default() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_default_options() { + let options = TransferOptions::default(); + assert!(options.layer_range.is_none()); + assert!(options.nixl_write_notification.is_none()); + assert!(options.bounce_buffer.is_none()); + } + + #[test] + fn test_builder_with_notification() { + let options = TransferOptions::builder() + .nixl_write_notification(42) + .build() + .unwrap(); + + assert_eq!(options.nixl_write_notification, Some(42)); + assert!(options.layer_range.is_none()); + } + + #[test] + fn test_builder_with_layer_range() { + let options = TransferOptions::builder() + .layer_range(0..10) + .build() + .unwrap(); + + assert_eq!(options.layer_range, Some(0..10)); + assert!(options.nixl_write_notification.is_none()); + } + + #[test] + fn test_builder_with_all_options() { + let options = TransferOptions::builder() + .nixl_write_notification(100) + .layer_range(5..15) + .build() + .unwrap(); + + assert_eq!(options.nixl_write_notification, Some(100)); + assert_eq!(options.layer_range, Some(5..15)); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/preferences.rs b/lib/llm/src/block_manager/v2/physical/transfer/preferences.rs new file mode 100644 index 0000000000..1f14db205e --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/preferences.rs @@ -0,0 +1,120 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Transfer preferences for resolving redundant strategy choices. +//! +//! Some source/destination combinations can use multiple transfer strategies. +//! For example: +//! - System ↔ Pinned: memcpy or NIXL +//! - Pinned ↔ Device: CUDA or NIXL +//! +//! This module provides preferences to control which strategy to prefer. + +use serde::{Deserialize, Serialize}; + +/// Policy for choosing between native transports (memcpy/CUDA) and NIXL. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] +pub enum NativeVsNixlPolicy { + /// Always prefer native transports (memcpy/CUDA) when available + PreferNative, + + /// Always prefer NIXL when available + PreferNixl, + + /// Use native for local-to-local, NIXL for remote/disk + #[default] + Automatic, +} + +/// Transfer preferences for strategy selection. +/// +/// These preferences allow fine-grained control over transfer strategy selection +/// when multiple valid strategies exist for a source/destination pair. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TransferPreferences { + /// Policy for native vs NIXL transport selection + pub native_vs_nixl: NativeVsNixlPolicy, + + /// Whether to prefer async CUDA operations over blocking ones + pub prefer_async_cuda: bool, +} + +impl Default for TransferPreferences { + fn default() -> Self { + Self { + native_vs_nixl: NativeVsNixlPolicy::default(), + prefer_async_cuda: true, + } + } +} + +impl TransferPreferences { + /// Create preferences with all defaults. + pub fn new() -> Self { + Self::default() + } + + /// Create preferences that always prefer native transports. + pub fn prefer_native() -> Self { + Self { + native_vs_nixl: NativeVsNixlPolicy::PreferNative, + prefer_async_cuda: true, + } + } + + /// Create preferences that always prefer NIXL. + pub fn prefer_nixl() -> Self { + Self { + native_vs_nixl: NativeVsNixlPolicy::PreferNixl, + prefer_async_cuda: true, + } + } + + /// Set the native vs NIXL policy. 
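Where both a native path and NIXL are viable (e.g. Pinned ↔ Device), a strategy selector can resolve the policy roughly as sketched below; the helper function and its `is_local_copy` flag are illustrative, not part of this module:

```rust,ignore
fn prefers_native(prefs: &TransferPreferences, is_local_copy: bool) -> bool {
    match prefs.native_vs_nixl {
        NativeVsNixlPolicy::PreferNative => true,
        NativeVsNixlPolicy::PreferNixl => false,
        // Automatic: native transports for local copies, NIXL for remote/disk.
        NativeVsNixlPolicy::Automatic => is_local_copy,
    }
}
```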
+ pub fn with_native_vs_nixl(mut self, policy: NativeVsNixlPolicy) -> Self { + self.native_vs_nixl = policy; + self + } + + /// Set whether to prefer async CUDA operations. + pub fn with_async_cuda(mut self, prefer_async: bool) -> Self { + self.prefer_async_cuda = prefer_async; + self + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_default_preferences() { + let prefs = TransferPreferences::default(); + assert_eq!(prefs.native_vs_nixl, NativeVsNixlPolicy::Automatic); + assert!(prefs.prefer_async_cuda); + } + + #[test] + fn test_prefer_native() { + let prefs = TransferPreferences::prefer_native(); + assert_eq!(prefs.native_vs_nixl, NativeVsNixlPolicy::PreferNative); + assert!(prefs.prefer_async_cuda); + } + + #[test] + fn test_prefer_nixl() { + let prefs = TransferPreferences::prefer_nixl(); + assert_eq!(prefs.native_vs_nixl, NativeVsNixlPolicy::PreferNixl); + assert!(prefs.prefer_async_cuda); + } + + #[test] + fn test_builder_pattern() { + let prefs = TransferPreferences::new() + .with_native_vs_nixl(NativeVsNixlPolicy::PreferNixl) + .with_async_cuda(false); + + assert_eq!(prefs.native_vs_nixl, NativeVsNixlPolicy::PreferNixl); + assert!(!prefs.prefer_async_cuda); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/strategy.rs b/lib/llm/src/block_manager/v2/physical/transfer/strategy.rs new file mode 100644 index 0000000000..12eeeb67eb --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/strategy.rs @@ -0,0 +1,506 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Transfer strategy selection based on source and destination storage locations. + +use crate::block_manager::v2::memory::StorageKind; + +use super::TransferCapabilities; +use crate::block_manager::v2::physical::{layout::PhysicalLayout, transfer::TransferContext}; + +/// Transfer strategy to use for copying memory between locations. +/// +/// The strategy is determined by the source and destination storage locations. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TransferStrategy { + /// CPU memcpy (for host-to-host transfers) + Memcpy, + + /// CUDA async host-to-device transfer + CudaAsyncH2D, + + /// CUDA async device-to-host transfer + CudaAsyncD2H, + + /// CUDA async device-to-device transfer + CudaAsyncD2D, + + /// CUDA blocking host-to-device transfer + CudaBlockingH2D, + + /// CUDA blocking device-to-host transfer + CudaBlockingD2H, + + /// NIXL read operation (pull from remote) + NixlRead, + + /// NIXL write operation (push to remote) + NixlWrite, + + /// NIXL write (flipped local and remote order) + /// This is needed for some NIXL backends. + /// For example, the POSIX backend requires that host memory + /// always be the "local" descriptor list, regardless of whether + /// it's a read or write. + NixlWriteFlipped, + + /// NIXL read (flipped local and remote order) + NixlReadFlipped, + + /// Invalid/unsupported transfer + Invalid, +} + +/// Plan for executing a transfer, either direct or via bounce buffer. +/// +/// Some transfers require staging through host memory when direct paths +/// are not enabled via capabilities. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum TransferPlan { + /// Direct single-hop transfer using the specified strategy. + Direct(TransferStrategy), + + /// Two-hop transfer requiring a bounce buffer in host memory. 
+ /// + /// This is used when: + /// - Device → Remote (without GPU RDMA) + /// - Disk → Remote + /// - Device ↔ Disk (without GDS) + TwoHop { + /// First hop strategy (src → bounce) + first: TransferStrategy, + + /// Bounce buffer location (always Pinned for best performance) + bounce_location: StorageKind, + + /// Second hop strategy (bounce → dst) + second: TransferStrategy, + }, +} + +pub(crate) fn select_strategy( + src: &PhysicalLayout, + dst: &PhysicalLayout, + ctx: &TransferContext, +) -> anyhow::Result { + let is_src_local = src.nixl_metadata().agent_name() == ctx.nixl_agent().name(); + let is_dst_local = dst.nixl_metadata().agent_name() == ctx.nixl_agent().name(); + + if !is_src_local && !is_dst_local { + return Err(anyhow::anyhow!( + "Both src and dst are remote - this is not supported." + )); + } + + if is_src_local && is_dst_local { + return Ok(select_direct_strategy( + src.location(), + dst.location(), + false, + ctx.capabilities(), + )); + } + + select_remote_strategy_v2( + src.location(), + is_src_local, + dst.location(), + is_dst_local, + ctx.capabilities(), + ) +} + +/// Select the appropriate transfer plan based on source and destination locations. +/// +/// # Arguments +/// * `src` - Source storage location (always local) +/// * `dst` - Destination storage location (can be local or remote) +/// * `dst_is_remote` - Whether destination is on a remote node +/// * `capabilities` - Transfer capability flags +/// +/// # Returns +/// A transfer plan (direct or two-hop) +/// +/// # Conservative Default Policy +/// +/// With default capabilities (all disabled): +/// - Device can only transfer to/from Host +/// - Disk can only transfer to/from Host +/// - Host can transfer to Device, Disk, or Remote +/// - Device ↔ Device is allowed (native CUDA) +/// +/// Transfers that would violate this policy are staged through host: +/// - Device → Remote: Device → Host → Remote (2 hops) +/// - Disk → Remote: Disk → Host → Remote (2 hops) +/// - Device ↔ Disk: Device → Host → Disk (2 hops) +/// +/// # Optional Direct Paths +/// +/// - `allow_gds`: Enables Disk ↔ Device direct transfers +/// - `allow_gpu_rdma`: Enables Device → Remote direct transfers +fn select_direct_strategy( + src: StorageKind, + dst: StorageKind, + dst_is_remote: bool, + capabilities: &TransferCapabilities, +) -> TransferPlan { + use StorageKind::*; + use TransferStrategy::*; + + // Handle remote destination + if dst_is_remote { + return select_remote_strategy(src, capabilities); + } + + // Local-to-local transfers + match (src, dst) { + // Host ↔ Host - direct memcpy + (System, System) | (System, Pinned) | (Pinned, System) | (Pinned, Pinned) => { + TransferPlan::Direct(Memcpy) + } + + // Host → Device - direct CUDA + (System, Device(_)) => TransferPlan::Direct(CudaBlockingH2D), + (Pinned, Device(_)) => TransferPlan::Direct(CudaAsyncH2D), + + // Device → Host - direct CUDA + (Device(_), System) => TransferPlan::Direct(CudaBlockingD2H), + (Device(_), Pinned) => TransferPlan::Direct(CudaAsyncD2H), + + // Device ↔ Device - direct CUDA + (Device(_), Device(_)) => TransferPlan::Direct(CudaAsyncD2D), + + // Host ↔ Disk - direct NIXL + (System, Disk(_)) | (Pinned, Disk(_)) => TransferPlan::Direct(NixlWrite), + (Disk(_), System) | (Disk(_), Pinned) => TransferPlan::Direct(NixlReadFlipped), + + // Disk ↔ Disk - NIXL doesn't seem to support direct transfers here. + // Leaving this as two-hop for now. 
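+ // Concretely, the arm below stages the copy through a pinned host bounce
+ // buffer: Disk → Pinned (NixlReadFlipped), then Pinned → Disk (NixlWrite).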
+ (Disk(_), Disk(_)) => TransferPlan::TwoHop { + first: NixlReadFlipped, + bounce_location: Pinned, + second: NixlWrite, + }, + + // Device ↔ Disk - check GDS capability + (Device(_), Disk(_)) => { + if capabilities.allows_device_disk_direct() { + // Direct GDS transfer + TransferPlan::Direct(NixlWrite) + } else { + // Stage through host: Device → Pinned → Disk + TransferPlan::TwoHop { + first: CudaAsyncD2H, + bounce_location: Pinned, + second: NixlWrite, + } + } + } + (Disk(_), Device(_)) => { + if capabilities.allows_device_disk_direct() { + // Direct GDS transfer + TransferPlan::Direct(NixlRead) + } else { + // Stage through host: Disk → Pinned → Device + TransferPlan::TwoHop { + first: NixlReadFlipped, + bounce_location: Pinned, + second: CudaAsyncH2D, + } + } + } + } +} + +/// Select transfer strategy for remote destination. +fn select_remote_strategy(src: StorageKind, capabilities: &TransferCapabilities) -> TransferPlan { + use StorageKind::*; + use TransferStrategy::*; + + match src { + // Host → Remote - direct NIXL + System | Pinned => TransferPlan::Direct(NixlWrite), + + // Device → Remote - check GPU RDMA capability + Device(_) => { + if capabilities.allows_device_remote_direct() { + // Direct GPU RDMA transfer + TransferPlan::Direct(NixlWrite) + } else { + // Stage through host: Device → Pinned → Remote + TransferPlan::TwoHop { + first: CudaAsyncD2H, + bounce_location: Pinned, + second: NixlWrite, + } + } + } + + // Disk → Remote - always stage through host + Disk(_) => TransferPlan::TwoHop { + first: NixlWrite, + bounce_location: Pinned, + second: NixlWrite, + }, + } +} + +fn select_remote_strategy_v2( + src: StorageKind, + is_src_local: bool, + dst: StorageKind, + is_dst_local: bool, + capabilities: &TransferCapabilities, +) -> anyhow::Result { + // We only support System, Pinned and Device for remote transfers. + // Later we might support staged/bounce buffer transfers. + + if matches!(src, StorageKind::Disk(_)) | matches!(dst, StorageKind::Disk(_)) { + return Err(anyhow::anyhow!( + "Neither local nor remote disk transfers are supported over NIXL at this time." + )); + } + + if !capabilities.allow_gpu_rdma + && (matches!(src, StorageKind::Device(_)) || matches!(dst, StorageKind::Device(_))) + { + return Err(anyhow::anyhow!( + "GPU RDMA is disabled - this transfer requires GPU RDMA." 
+ )); + } + + if is_src_local && !is_dst_local { + return Ok(TransferPlan::Direct(TransferStrategy::NixlWrite)); + } + + if is_dst_local && !is_src_local { + return Ok(TransferPlan::Direct(TransferStrategy::NixlReadFlipped)); + } + + unreachable!("Both src and dst are remote - this is not supported."); +} + +#[cfg(test)] +mod tests { + use super::*; + + fn default_caps() -> TransferCapabilities { + TransferCapabilities::default() + } + + #[test] + fn test_host_to_host_transfers() { + let caps = default_caps(); + assert_eq!( + select_direct_strategy(StorageKind::System, StorageKind::System, false, &caps), + TransferPlan::Direct(TransferStrategy::Memcpy) + ); + assert_eq!( + select_direct_strategy(StorageKind::System, StorageKind::Pinned, false, &caps), + TransferPlan::Direct(TransferStrategy::Memcpy) + ); + assert_eq!( + select_direct_strategy(StorageKind::Pinned, StorageKind::System, false, &caps), + TransferPlan::Direct(TransferStrategy::Memcpy) + ); + assert_eq!( + select_direct_strategy(StorageKind::Pinned, StorageKind::Pinned, false, &caps), + TransferPlan::Direct(TransferStrategy::Memcpy) + ); + } + + #[test] + fn test_host_to_device_transfers() { + let caps = default_caps(); + // System (unpinned) to device should be blocking + assert_eq!( + select_direct_strategy(StorageKind::System, StorageKind::Device(0), false, &caps), + TransferPlan::Direct(TransferStrategy::CudaBlockingH2D) + ); + + // Pinned to device should be async + assert_eq!( + select_direct_strategy(StorageKind::Pinned, StorageKind::Device(0), false, &caps), + TransferPlan::Direct(TransferStrategy::CudaAsyncH2D) + ); + } + + #[test] + fn test_device_to_host_transfers() { + let caps = default_caps(); + // Device to system should be blocking + assert_eq!( + select_direct_strategy(StorageKind::Device(0), StorageKind::System, false, &caps), + TransferPlan::Direct(TransferStrategy::CudaBlockingD2H) + ); + + // Device to pinned should be async + assert_eq!( + select_direct_strategy(StorageKind::Device(0), StorageKind::Pinned, false, &caps), + TransferPlan::Direct(TransferStrategy::CudaAsyncD2H) + ); + } + + #[test] + fn test_device_to_device_transfers() { + let caps = default_caps(); + assert_eq!( + select_direct_strategy(StorageKind::Device(0), StorageKind::Device(1), false, &caps), + TransferPlan::Direct(TransferStrategy::CudaAsyncD2D) + ); + assert_eq!( + select_direct_strategy(StorageKind::Device(3), StorageKind::Device(3), false, &caps), + TransferPlan::Direct(TransferStrategy::CudaAsyncD2D) + ); + } + + #[test] + fn test_disk_to_host_transfers() { + let caps = default_caps(); + // Disk to host - direct NIXL + assert_eq!( + select_direct_strategy(StorageKind::Disk(42), StorageKind::System, false, &caps), + TransferPlan::Direct(TransferStrategy::NixlReadFlipped) + ); + assert_eq!( + select_direct_strategy(StorageKind::Disk(42), StorageKind::Pinned, false, &caps), + TransferPlan::Direct(TransferStrategy::NixlReadFlipped) + ); + } + + #[test] + fn test_host_to_disk_transfers() { + let caps = default_caps(); + // Host to disk - direct NIXL + assert_eq!( + select_direct_strategy(StorageKind::System, StorageKind::Disk(42), false, &caps), + TransferPlan::Direct(TransferStrategy::NixlWrite) + ); + assert_eq!( + select_direct_strategy(StorageKind::Pinned, StorageKind::Disk(42), false, &caps), + TransferPlan::Direct(TransferStrategy::NixlWrite) + ); + } + + #[test] + fn test_device_to_disk_without_gds() { + let caps = default_caps(); // GDS disabled + // Device → Disk should use bounce buffer + let plan = + 
select_direct_strategy(StorageKind::Device(0), StorageKind::Disk(42), false, &caps); + match plan { + TransferPlan::TwoHop { + first, + bounce_location, + second, + } => { + assert_eq!(first, TransferStrategy::CudaAsyncD2H); + assert_eq!(bounce_location, StorageKind::Pinned); + assert_eq!(second, TransferStrategy::NixlWrite); + } + _ => panic!("Expected TwoHop plan"), + } + } + + #[test] + fn test_disk_to_device_without_gds() { + let caps = default_caps(); // GDS disabled + // Disk → Device should use bounce buffer + let plan = + select_direct_strategy(StorageKind::Disk(42), StorageKind::Device(0), false, &caps); + match plan { + TransferPlan::TwoHop { + first, + bounce_location, + second, + } => { + assert_eq!(first, TransferStrategy::NixlReadFlipped); + assert_eq!(bounce_location, StorageKind::Pinned); + assert_eq!(second, TransferStrategy::CudaAsyncH2D); + } + _ => panic!("Expected TwoHop plan"), + } + } + + #[test] + fn test_device_to_disk_with_gds() { + let caps = TransferCapabilities::default().with_gds(true); + // Device → Disk should be direct with GDS + assert_eq!( + select_direct_strategy(StorageKind::Device(0), StorageKind::Disk(42), false, &caps), + TransferPlan::Direct(TransferStrategy::NixlWrite) + ); + } + + #[test] + fn test_disk_to_device_with_gds() { + let caps = TransferCapabilities::default().with_gds(true); + // Disk → Device should be direct with GDS + assert_eq!( + select_direct_strategy(StorageKind::Disk(42), StorageKind::Device(0), false, &caps), + TransferPlan::Direct(TransferStrategy::NixlRead) + ); + } + + #[test] + fn test_host_to_remote() { + let caps = default_caps(); + // Host → Remote - always direct + assert_eq!( + select_direct_strategy(StorageKind::System, StorageKind::System, true, &caps), + TransferPlan::Direct(TransferStrategy::NixlWrite) + ); + assert_eq!( + select_direct_strategy(StorageKind::Pinned, StorageKind::Pinned, true, &caps), + TransferPlan::Direct(TransferStrategy::NixlWrite) + ); + } + + #[test] + fn test_device_to_remote_without_rdma() { + let caps = default_caps(); // GPU RDMA disabled + // Device → Remote should use bounce buffer + let plan = select_direct_strategy(StorageKind::Device(0), StorageKind::System, true, &caps); + match plan { + TransferPlan::TwoHop { + first, + bounce_location, + second, + } => { + assert_eq!(first, TransferStrategy::CudaAsyncD2H); + assert_eq!(bounce_location, StorageKind::Pinned); + assert_eq!(second, TransferStrategy::NixlWrite); + } + _ => panic!("Expected TwoHop plan"), + } + } + + #[test] + fn test_device_to_remote_with_rdma() { + let caps = TransferCapabilities::default().with_gpu_rdma(true); + // Device → Remote should be direct with GPU RDMA + assert_eq!( + select_direct_strategy(StorageKind::Device(0), StorageKind::Device(0), true, &caps), + TransferPlan::Direct(TransferStrategy::NixlWrite) + ); + } + + #[test] + fn test_disk_to_remote() { + let caps = default_caps(); + // Disk → Remote always uses bounce buffer + let plan = select_direct_strategy(StorageKind::Disk(42), StorageKind::System, true, &caps); + match plan { + TransferPlan::TwoHop { + first, + bounce_location, + second, + } => { + assert_eq!(first, TransferStrategy::NixlWrite); + assert_eq!(bounce_location, StorageKind::Pinned); + assert_eq!(second, TransferStrategy::NixlWrite); + } + _ => panic!("Expected TwoHop plan"), + } + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/testing.rs b/lib/llm/src/block_manager/v2/physical/transfer/testing.rs new file mode 100644 index 0000000000..c675a8c5b6 --- /dev/null +++ 
b/lib/llm/src/block_manager/v2/physical/transfer/testing.rs @@ -0,0 +1,363 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Round-trip testing infrastructure for transfer verification. +//! +//! This module provides utilities for testing data integrity across transfers +//! by comparing checksums after round-trip operations: +//! 1. Source blocks (host) → Intermediate (device/disk/remote) +//! 2. Intermediate → Destination blocks (host, different IDs) +//! 3. Verify checksums match between source and destination + +use super::{ + BlockChecksum, FillPattern, PhysicalLayout, StorageKind, compute_block_checksums, + fill_blocks, transfer_blocks, +}; +use super::context::TransferContext; +use anyhow::{Result, anyhow}; +use std::collections::HashMap; + +/// Result of a round-trip test. +#[derive(Debug)] +pub struct RoundTripTestResult { + /// Source block checksums (keyed by source block ID) + pub source_checksums: HashMap, + + /// Destination block checksums (keyed by destination block ID) + pub dest_checksums: HashMap, + + /// Block ID mapping used (src_id, dst_id) + pub block_mapping: Vec<(usize, usize)>, + + /// Whether all checksums matched + pub success: bool, + + /// Mismatched blocks (if any) + pub mismatches: Vec<(usize, usize)>, // (src_id, dst_id) pairs that didn't match +} + +impl RoundTripTestResult { + /// Check if the round-trip test passed. + pub fn is_success(&self) -> bool { + self.success + } + + /// Get the number of blocks tested. + pub fn num_blocks(&self) -> usize { + self.block_mapping.len() + } + + /// Get a detailed report of the test results. + pub fn report(&self) -> String { + if self.success { + format!( + "Round-trip test PASSED: {}/{} blocks verified successfully", + self.num_blocks(), + self.num_blocks() + ) + } else { + format!( + "Round-trip test FAILED: {}/{} blocks mismatched\nMismatches: {:?}", + self.mismatches.len(), + self.num_blocks(), + self.mismatches + ) + } + } +} + +/// Builder for round-trip tests. +/// +/// This allows configuring a test that transfers data from source blocks +/// to intermediate storage and back to different destination blocks, +/// verifying data integrity via checksums. +pub struct RoundTripTest { + /// Source physical layout (must be local) + source: PhysicalLayout, + + /// Intermediate physical layout (can be remote/device/disk) + intermediate: PhysicalLayout, + + /// Destination physical layout (must be local) + destination: PhysicalLayout, + + /// Block mapping: (src_id, intermediate_id, dst_id) + block_mapping: Vec<(usize, usize, usize)>, + + /// Fill pattern for source blocks + fill_pattern: FillPattern, +} + +impl RoundTripTest { + /// Create a new round-trip test. + /// + /// # Arguments + /// * `source` - Source physical layout (must be local) + /// * `intermediate` - Intermediate physical layout + /// * `destination` - Destination physical layout (must be local) + pub fn new( + source: PhysicalLayout, + intermediate: PhysicalLayout, + destination: PhysicalLayout, + ) -> Result { + if source.is_remote() { + return Err(anyhow!("Source layout must be local")); + } + if destination.is_remote() { + return Err(anyhow!("Destination layout must be local")); + } + + Ok(Self { + source, + intermediate, + destination, + block_mapping: Vec::new(), + fill_pattern: FillPattern::Sequential, + }) + } + + /// Set the fill pattern for source blocks. 
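A minimal sketch of driving `RoundTripTest`, assuming the three layouts and a `TransferContext` have been constructed as in the unit tests at the bottom of this file:

```rust,ignore
let result = RoundTripTest::new(source, intermediate, destination)?
    .with_fill_pattern(FillPattern::Sequential)
    .add_block_mapping(0, 0, 1)
    .add_block_mapping(1, 1, 2)
    .run(&ctx)
    .await?;
assert!(result.is_success(), "{}", result.report());
```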
+ pub fn with_fill_pattern(mut self, pattern: FillPattern) -> Self { + self.fill_pattern = pattern; + self + } + + /// Add a block mapping for the round-trip test. + /// + /// # Arguments + /// * `src_id` - Source block ID + /// * `intermediate_id` - Intermediate block ID + /// * `dst_id` - Destination block ID + pub fn add_block_mapping( + mut self, + src_id: usize, + intermediate_id: usize, + dst_id: usize, + ) -> Self { + self.block_mapping.push((src_id, intermediate_id, dst_id)); + self + } + + /// Add multiple block mappings at once. + /// + /// This is a convenience method for adding several mappings. + pub fn with_block_mappings(mut self, mappings: &[(usize, usize, usize)]) -> Self { + self.block_mapping.extend_from_slice(mappings); + self + } + + /// Run the round-trip test. + /// + /// # Workflow + /// 1. Fill source blocks with the specified pattern + /// 2. Compute source checksums + /// 3. Transfer source → intermediate + /// 4. Transfer intermediate → destination + /// 5. Compute destination checksums + /// 6. Compare checksums + /// + /// # Arguments + /// * `ctx` - Transfer context with CUDA stream and NIXL agent + pub async fn run(self, ctx: &TransferContext) -> Result { + if self.block_mapping.is_empty() { + return Err(anyhow!("No block mappings specified")); + } + + // Step 1: Fill source blocks + let src_ids: Vec = self.block_mapping.iter().map(|(src, _, _)| *src).collect(); + fill_blocks(&self.source, &src_ids, self.fill_pattern)?; + + // Step 2: Compute source checksums + let source_checksums = compute_block_checksums(&self.source, &src_ids)?; + + // Step 3: Transfer source → intermediate + let src_ids_intermediate: Vec = + self.block_mapping.iter().map(|(src, _, _)| *src).collect(); + let inter_ids_from_src: Vec = self + .block_mapping + .iter() + .map(|(_, inter, _)| *inter) + .collect(); + let notification = transfer_blocks( + &self.source, + &self.intermediate, + &src_ids_intermediate, + &inter_ids_from_src, + ctx, + )?; + notification.await?; + + // Step 4: Transfer intermediate → destination + let inter_ids_to_dst: Vec = self + .block_mapping + .iter() + .map(|(_, inter, _)| *inter) + .collect(); + let dst_ids_from_inter: Vec = + self.block_mapping.iter().map(|(_, _, dst)| *dst).collect(); + let notification = transfer_blocks( + &self.intermediate, + &self.destination, + &inter_ids_to_dst, + &dst_ids_from_inter, + ctx, + )?; + notification.await?; + + // Step 5: Compute destination checksums + let dst_ids: Vec = self.block_mapping.iter().map(|(_, _, dst)| *dst).collect(); + let dest_checksums = compute_block_checksums(&self.destination, &dst_ids)?; + + // Step 6: Compare checksums + let mut mismatches = Vec::new(); + for (src_id, _, dst_id) in &self.block_mapping { + let src_checksum = &source_checksums[src_id]; + let dst_checksum = &dest_checksums[dst_id]; + + if src_checksum != dst_checksum { + mismatches.push((*src_id, *dst_id)); + } + } + + let success = mismatches.is_empty(); + let block_mapping: Vec<(usize, usize)> = self + .block_mapping + .iter() + .map(|(src, _, dst)| (*src, *dst)) + .collect(); + + Ok(RoundTripTestResult { + source_checksums, + dest_checksums, + block_mapping, + success, + mismatches, + }) + } +} + +#[cfg(test, features = "testing-cuda")] +mod tests { + use super::*; + use crate::block_manager::v2::layout::{ + FullyContiguousLayout, Layout, LayoutConfig, MemoryRegion, OwnedMemoryRegion, + }; + use std::sync::Arc; + + // Helper to create a minimal transfer context for testing + // In real tests with CUDA/NIXL, this would be properly 
constructed + fn create_test_context() -> TransferContext { + // For now, we'll skip these tests if CUDA is not available + // In the future, we can mock TransferContext or use conditional compilation + todo!("Create test context - requires CUDA/NIXL setup") + } + + #[tokio::test] + async fn test_round_trip_host_to_host() { + // Create three layouts: source, intermediate, destination + let (src_layout, _src_mem) = create_test_layout(4); + let (inter_layout, _inter_mem) = create_test_layout(4); + let (dst_layout, _dst_mem) = create_test_layout(4); + + let source = PhysicalLayout::new_local(src_layout, StorageKind::System); + let intermediate = PhysicalLayout::new_local(inter_layout, StorageKind::Pinned); + let destination = PhysicalLayout::new_local(dst_layout, StorageKind::System); + + // Build round-trip test with different block IDs + // Source: blocks [0, 1, 2, 3] + // Intermediate: blocks [0, 1, 2, 3] + // Destination: blocks [0, 1, 2, 3] (different memory than source) + let test = RoundTripTest::new(source, intermediate, destination) + .unwrap() + .with_fill_pattern(FillPattern::Sequential) + .add_block_mapping(0, 0, 0) + .add_block_mapping(1, 1, 1) + .add_block_mapping(2, 2, 2) + .add_block_mapping(3, 3, 3); + + // Create a transfer context (requires actual CUDA/NIXL setup) + let ctx = create_test_context(); + + // Run the test + let result = test.run(&ctx).await.unwrap(); + + assert!(result.is_success(), "{}", result.report()); + assert_eq!(result.num_blocks(), 4); + } + + #[tokio::test] + async fn test_round_trip_different_block_ids() { + // Create layouts with enough blocks + let (src_layout, _src_mem) = create_test_layout(8); + let (inter_layout, _inter_mem) = create_test_layout(8); + let (dst_layout, _dst_mem) = create_test_layout(8); + + let source = PhysicalLayout::new_local(src_layout, StorageKind::System); + let intermediate = PhysicalLayout::new_local(inter_layout, StorageKind::Pinned); + let destination = PhysicalLayout::new_local(dst_layout, StorageKind::System); + + // Test with non-overlapping block IDs + // Source: blocks [0, 1, 2, 3] + // Intermediate: blocks [2, 3, 4, 5] + // Destination: blocks [4, 5, 6, 7] + let test = RoundTripTest::new(source, intermediate, destination) + .unwrap() + .with_fill_pattern(FillPattern::BlockBased) + .with_block_mappings(&[(0, 2, 4), (1, 3, 5), (2, 4, 6), (3, 5, 7)]); + + let ctx = create_test_context(); + let result = test.run(&ctx).await.unwrap(); + + assert!(result.is_success(), "{}", result.report()); + assert_eq!(result.num_blocks(), 4); + } + + #[test] + fn test_round_trip_builder() { + let (src_layout, _) = create_test_layout(4); + let (inter_layout, _) = create_test_layout(4); + let (dst_layout, _) = create_test_layout(4); + + let source = PhysicalLayout::new_local(src_layout, StorageKind::System); + let intermediate = PhysicalLayout::new_local(inter_layout, StorageKind::Pinned); + let destination = PhysicalLayout::new_local(dst_layout, StorageKind::System); + + let test = RoundTripTest::new(source, intermediate, destination) + .unwrap() + .with_fill_pattern(FillPattern::Constant(42)) + .add_block_mapping(0, 0, 1) + .add_block_mapping(1, 1, 2); + + assert_eq!(test.block_mapping.len(), 2); + } + + #[test] + fn test_round_trip_requires_local_source() { + let (src_layout, _) = create_test_layout(1); + let (inter_layout, _) = create_test_layout(1); + let (dst_layout, _) = create_test_layout(1); + + let source = + PhysicalLayout::new_remote(src_layout, StorageKind::System, "remote".to_string()); + let intermediate = 
PhysicalLayout::new_local(inter_layout, StorageKind::Pinned); + let destination = PhysicalLayout::new_local(dst_layout, StorageKind::System); + + let result = RoundTripTest::new(source, intermediate, destination); + assert!(result.is_err()); + } + + #[test] + fn test_round_trip_requires_local_destination() { + let (src_layout, _) = create_test_layout(1); + let (inter_layout, _) = create_test_layout(1); + let (dst_layout, _) = create_test_layout(1); + + let source = PhysicalLayout::new_local(src_layout, StorageKind::System); + let intermediate = PhysicalLayout::new_local(inter_layout, StorageKind::Pinned); + let destination = + PhysicalLayout::new_remote(dst_layout, StorageKind::System, "remote".to_string()); + + let result = RoundTripTest::new(source, intermediate, destination); + assert!(result.is_err()); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/tests/local_transfers.rs b/lib/llm/src/block_manager/v2/physical/transfer/tests/local_transfers.rs new file mode 100644 index 0000000000..846a97206f --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/tests/local_transfers.rs @@ -0,0 +1,976 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Local transfer tests where source and destination use the same NIXL agent. +//! +//! These tests verify data integrity across: +//! - Different storage types (System, Pinned, Device) +//! - Different layout types (Fully Contiguous, Layer-wise) +//! - Different transfer strategies (Memcpy, CUDA H2D/D2H) + +use super::*; +use crate::block_manager::v2::physical::layout::BlockDimension; +use crate::block_manager::v2::physical::transfer::executor::execute_transfer; +use crate::block_manager::v2::physical::transfer::{ + BlockChecksum, BounceBufferSpec, FillPattern, StorageKind, TransferCapabilities, + TransferOptions, compute_block_checksums, compute_layer_checksums, fill_blocks, fill_layers, +}; +use anyhow::Result; +use rstest::rstest; +use std::collections::HashMap; +use std::ops::Range; +use std::sync::Arc; + +// ============================================================================ +// System <=> System Tests (Memcpy) +// ============================================================================ + +#[derive(Clone)] +enum LayoutType { + FC, + LW, +} + +fn build_layout( + agent: NixlAgent, + layout_type: LayoutType, + storage_kind: StorageKind, + num_blocks: usize, +) -> PhysicalLayout { + match layout_type { + LayoutType::FC => create_fc_layout(agent, storage_kind, num_blocks), + LayoutType::LW => create_lw_layout(agent, storage_kind, num_blocks), + } +} + +/// Layout kind for parameterized testing. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum LayoutKind { + /// Fully contiguous layout + FC, + /// Layer-wise (layer-separate) layout + LW, +} + +/// Storage and layout specification for creating test layouts. +#[derive(Debug, Clone, Copy)] +pub struct LayoutSpec { + pub kind: LayoutKind, + pub storage: StorageKind, +} + +impl LayoutSpec { + pub fn new(kind: LayoutKind, storage: StorageKind) -> Self { + Self { kind, storage } + } +} + +/// Transfer mode for parameterized testing. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TransferMode { + /// Transfer entire blocks (all layers) + FullBlocks, + /// Transfer only the first layer + FirstLayerOnly, + /// Transfer only the second layer + SecondLayerOnly, +} + +impl TransferMode { + /// Convert to optional layer range for execute_transfer. 
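As the parameterized tests below do, the mode's range can be fed directly into the transfer options; a tiny sketch:

```rust,ignore
let mode = TransferMode::FirstLayerOnly;
// Restrict the transfer to layer 0 of each selected block.
let options = TransferOptions::from_layer_range(mode.layer_range());
```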
+ pub fn layer_range(&self) -> Option> { + match self { + TransferMode::FullBlocks => None, + TransferMode::FirstLayerOnly => Some(0..1), + TransferMode::SecondLayerOnly => Some(1..2), + } + } + + /// Get a descriptive suffix for test names. + pub fn suffix(&self) -> &'static str { + match self { + TransferMode::FullBlocks => "full", + TransferMode::FirstLayerOnly => "layer0", + TransferMode::SecondLayerOnly => "layer1", + } + } +} + +/// Create a fully contiguous physical layout with the specified storage type. +pub fn create_fc_layout( + agent: NixlAgent, + storage_kind: StorageKind, + num_blocks: usize, +) -> PhysicalLayout { + let config = standard_config(num_blocks); + let builder = PhysicalLayout::builder(agent) + .with_config(config) + .fully_contiguous(); + + match storage_kind { + StorageKind::System => builder.allocate_system().build().unwrap(), + StorageKind::Pinned => builder.allocate_pinned(false).build().unwrap(), + StorageKind::Device(device_id) => builder.allocate_device(device_id).build().unwrap(), + StorageKind::Disk(_) => builder.allocate_disk(None).build().unwrap(), + } +} + +/// Create a layer-separate physical layout with the specified storage type. +pub fn create_lw_layout( + agent: NixlAgent, + storage_kind: StorageKind, + num_blocks: usize, +) -> PhysicalLayout { + let config = standard_config(num_blocks); + let builder = PhysicalLayout::builder(agent) + .with_config(config) + .layer_separate(BlockDimension::BlockIsFirstDim); + + match storage_kind { + StorageKind::System => builder.allocate_system().build().unwrap(), + StorageKind::Pinned => builder.allocate_pinned(false).build().unwrap(), + StorageKind::Device(device_id) => builder.allocate_device(device_id).build().unwrap(), + StorageKind::Disk(_) => builder.allocate_disk(None).build().unwrap(), + } +} + +/// Create a physical layout based on the specification. +/// +/// This is a DRY helper that dispatches to create_fc_layout or create_lw_layout +/// based on the layout kind in the spec. +pub fn create_layout(agent: NixlAgent, spec: LayoutSpec, num_blocks: usize) -> PhysicalLayout { + match spec.kind { + LayoutKind::FC => create_fc_layout(agent, spec.storage, num_blocks), + LayoutKind::LW => create_lw_layout(agent, spec.storage, num_blocks), + } +} + +/// Fill blocks or layers based on transfer mode and compute checksums. +/// +/// This is a mode-aware version of fill_and_checksum that handles both +/// full block transfers and layer-wise transfers. +pub fn fill_and_checksum_with_mode( + layout: &PhysicalLayout, + block_ids: &[usize], + pattern: FillPattern, + mode: TransferMode, +) -> Result> { + match mode { + TransferMode::FullBlocks => { + fill_blocks(layout, block_ids, pattern)?; + compute_block_checksums(layout, block_ids) + } + TransferMode::FirstLayerOnly => { + fill_layers(layout, block_ids, 0..1, pattern)?; + compute_layer_checksums(layout, block_ids, 0..1) + } + TransferMode::SecondLayerOnly => { + fill_layers(layout, block_ids, 1..2, pattern)?; + compute_layer_checksums(layout, block_ids, 1..2) + } + } +} + +/// Verify checksums with transfer mode awareness. +/// +/// This is a mode-aware version that handles both full block and layer-wise verification. 
+pub fn verify_checksums_by_position_with_mode( + src_checksums: &HashMap, + src_block_ids: &[usize], + dst_layout: &PhysicalLayout, + dst_block_ids: &[usize], + mode: TransferMode, +) -> Result<()> { + assert_eq!( + src_block_ids.len(), + dst_block_ids.len(), + "Source and destination block arrays must have same length" + ); + + let dst_checksums = match mode { + TransferMode::FullBlocks => compute_block_checksums(dst_layout, dst_block_ids)?, + TransferMode::FirstLayerOnly => compute_layer_checksums(dst_layout, dst_block_ids, 0..1)?, + TransferMode::SecondLayerOnly => compute_layer_checksums(dst_layout, dst_block_ids, 1..2)?, + }; + + for (src_id, dst_id) in src_block_ids.iter().zip(dst_block_ids.iter()) { + let src_checksum = src_checksums + .get(src_id) + .unwrap_or_else(|| panic!("Missing source checksum for block {}", src_id)); + let dst_checksum = dst_checksums + .get(dst_id) + .unwrap_or_else(|| panic!("Missing destination checksum for block {}", dst_id)); + + assert_eq!( + src_checksum, dst_checksum, + "Checksum mismatch (mode={:?}): src[{}] != dst[{}]: {} != {}", + mode, src_id, dst_id, src_checksum, dst_checksum + ); + } + + Ok(()) +} + +/// Create a test agent with specific backends. +pub fn create_test_agent_with_backends(name: &str, backends: &[&str]) -> Result { + NixlAgent::new_with_backends(name, backends) +} + +/// Create a transport manager for testing with the specified agent. +/// +/// Note: The agent should already have backends configured. Use `create_test_agent` +/// or `build_agent_with_backends` to create properly configured agents. +pub fn create_transfer_context( + agent: NixlAgent, + capabilities: Option, +) -> Result { + crate::block_manager::v2::physical::manager::TransportManager::builder() + .capabilities(capabilities.unwrap_or_default()) + .worker_id(0) // Default worker ID for local tests + .nixl_agent(agent) + .cuda_device_id(0) + .build() +} + +/// Fill blocks and compute checksums. +/// +/// This can only be called on System or Pinned layouts. +pub fn fill_and_checksum( + layout: &PhysicalLayout, + block_ids: &[usize], + pattern: FillPattern, +) -> Result> { + fill_blocks(layout, block_ids, pattern)?; + compute_block_checksums(layout, block_ids) +} + +/// Verify that destination block checksums match the expected source checksums. +/// +/// This function compares checksums in order, assuming the source and destination +/// block arrays have a 1:1 correspondence (src[i] was transferred to dst[i]). +pub fn verify_checksums_by_position( + src_checksums: &HashMap, + src_block_ids: &[usize], + dst_layout: &PhysicalLayout, + dst_block_ids: &[usize], +) -> Result<()> { + assert_eq!( + src_block_ids.len(), + dst_block_ids.len(), + "Source and destination block arrays must have same length" + ); + + let dst_checksums = compute_block_checksums(dst_layout, dst_block_ids)?; + + for (src_id, dst_id) in src_block_ids.iter().zip(dst_block_ids.iter()) { + let src_checksum = src_checksums + .get(src_id) + .unwrap_or_else(|| panic!("Missing source checksum for block {}", src_id)); + let dst_checksum = dst_checksums + .get(dst_id) + .unwrap_or_else(|| panic!("Missing destination checksum for block {}", dst_id)); + + assert_eq!( + src_checksum, dst_checksum, + "Checksum mismatch: src[{}] != dst[{}]: {} != {}", + src_id, dst_id, src_checksum, dst_checksum + ); + } + + Ok(()) +} + +/// Fill guard blocks and return their checksums for later verification. +/// +/// Guard blocks are blocks adjacent to transfer destinations that should +/// remain unchanged during transfers. 
This function fills them with a +/// distinctive pattern and returns their checksums for later validation. +/// +/// # Arguments +/// * `layout` - The physical layout containing the guard blocks +/// * `guard_block_ids` - Block IDs to use as guards +/// * `pattern` - Fill pattern for guard blocks (typically a constant like 0xFF) +/// +/// # Returns +/// A map of block ID to checksum for all guard blocks +pub fn create_guard_blocks( + layout: &PhysicalLayout, + guard_block_ids: &[usize], + pattern: FillPattern, +) -> Result> { + fill_blocks(layout, guard_block_ids, pattern)?; + compute_block_checksums(layout, guard_block_ids) +} + +/// Verify that guard blocks remain unchanged after transfers. +/// +/// This function compares the current checksums of guard blocks against +/// their expected values. Any mismatch indicates memory corruption or +/// unintended overwrites during transfer operations. +/// +/// # Arguments +/// * `layout` - The physical layout containing the guard blocks +/// * `guard_block_ids` - Block IDs to verify +/// * `expected_checksums` - Expected checksums from create_guard_blocks +/// +/// # Errors +/// Returns an error if any guard block checksum has changed +pub fn verify_guard_blocks_unchanged( + layout: &PhysicalLayout, + guard_block_ids: &[usize], + expected_checksums: &HashMap, +) -> Result<()> { + let current_checksums = compute_block_checksums(layout, guard_block_ids)?; + + for &block_id in guard_block_ids { + let expected = expected_checksums + .get(&block_id) + .unwrap_or_else(|| panic!("Missing expected checksum for guard block {}", block_id)); + let current = current_checksums + .get(&block_id) + .unwrap_or_else(|| panic!("Missing current checksum for guard block {}", block_id)); + + if expected != current { + return Err(anyhow::anyhow!( + "Guard block {} was modified during transfer! 
Expected: {}, Got: {}", + block_id, + expected, + current + )); + } + } + + Ok(()) +} + +struct DummyBounceBufferSpec { + pub layout: PhysicalLayout, + pub block_ids: Vec, +} + +impl BounceBufferSpec for DummyBounceBufferSpec { + fn layout(&self) -> &PhysicalLayout { + &self.layout + } + fn block_ids(&self) -> &[usize] { + &self.block_ids + } +} + +fn build_agent_for_kinds(src_kind: StorageKind, dst_kind: StorageKind) -> Result { + use std::collections::HashSet; + + let mut backends = HashSet::new(); + + // Determine required backends for both source and destination + for kind in [src_kind, dst_kind] { + match kind { + StorageKind::System | StorageKind::Pinned => { + backends.insert("POSIX"); // Lightweight for DRAM + } + StorageKind::Device(_) => { + backends.insert("UCX"); // Required for VRAM (expensive) + } + StorageKind::Disk(_) => { + backends.insert("POSIX"); // Required for disk I/O + } + } + } + + // Optional: Add GDS for Device <-> Disk optimization + match (src_kind, dst_kind) { + (StorageKind::Device(_), StorageKind::Disk(_)) + | (StorageKind::Disk(_), StorageKind::Device(_)) => { + backends.insert("GDS_MT"); + } + _ => {} + } + + let backend_vec: Vec<&str> = backends.into_iter().collect(); + create_test_agent_with_backends("agent", &backend_vec) +} + +#[rstest] +#[tokio::test] +async fn test_p2p( + #[values(LayoutType::FC, LayoutType::LW)] src_layout: LayoutType, + #[values( + StorageKind::System, + StorageKind::Pinned, + StorageKind::Device(0), + StorageKind::Disk(0) + )] + src_kind: StorageKind, + #[values(LayoutType::FC, LayoutType::LW)] dst_layout: LayoutType, + #[values( + StorageKind::System, + StorageKind::Pinned, + StorageKind::Device(0), + StorageKind::Disk(0) + )] + dst_kind: StorageKind, +) -> Result<()> { + use crate::block_manager::v2::physical::transfer::TransferOptions; + + let agent = build_agent_for_kinds(src_kind, dst_kind)?; + + let src = build_layout(agent.clone(), src_layout, src_kind, 4); + let dst = build_layout(agent.clone(), dst_layout, dst_kind, 4); + + let bounce_layout = build_layout(agent.clone(), LayoutType::FC, StorageKind::Pinned, 4); + + let bounce_buffer_spec: Arc = Arc::new(DummyBounceBufferSpec { + layout: bounce_layout, + block_ids: vec![0, 1], + }); + + let src_blocks = vec![0, 1]; + let dst_blocks = vec![2, 3]; + + let checksums = fill_and_checksum(&src, &src_blocks, FillPattern::Sequential)?; + let ctx = create_transfer_context(agent, None).unwrap(); + + let options = TransferOptions::builder() + .bounce_buffer(bounce_buffer_spec) + .build()?; + + let notification = + execute_transfer(&src, &dst, &src_blocks, &dst_blocks, options, ctx.context())?; + notification.await?; + + verify_checksums_by_position(&checksums, &src_blocks, &dst, &dst_blocks)?; + + Ok(()) +} + +#[rstest] +#[tokio::test] +async fn test_roundtrip( + #[values(LayoutType::FC, LayoutType::LW)] src_layout: LayoutType, + #[values(StorageKind::System, StorageKind::Pinned, StorageKind::Device(0))] + src_kind: StorageKind, + #[values(LayoutType::FC, LayoutType::LW)] inter_layout: LayoutType, + #[values(StorageKind::System, StorageKind::Pinned, StorageKind::Device(0))] + inter_kind: StorageKind, + #[values(LayoutType::FC, LayoutType::LW)] dst_layout: LayoutType, + #[values(StorageKind::System, StorageKind::Pinned, StorageKind::Device(0))] + dst_kind: StorageKind, +) -> Result<()> { + let agent = build_agent_for_kinds(src_kind, dst_kind)?; + + // Create layouts: source pinned, device intermediate, destination pinned + let src = build_layout(agent.clone(), src_layout, src_kind, 
4); + let device = build_layout(agent.clone(), inter_layout, inter_kind, 4); + let dst = build_layout(agent.clone(), dst_layout, dst_kind, 4); + + let src_blocks = vec![0, 1]; + let device_blocks = vec![0, 1]; + let dst_blocks = vec![2, 3]; + + // Fill source and compute checksums + let checksums = fill_and_checksum(&src, &src_blocks, FillPattern::Sequential)?; + let ctx = create_transfer_context(agent, None).unwrap(); + + // Transfer: Pinned[0,1] -> Device[0,1] + let notification = execute_transfer( + &src, + &device, + &src_blocks, + &device_blocks, + TransferOptions::default(), + ctx.context(), + )?; + notification.await?; + + // Transfer: Device[0,1] -> Pinned[2,3] + let notification = execute_transfer( + &device, + &dst, + &device_blocks, + &dst_blocks, + TransferOptions::default(), + ctx.context(), + )?; + notification.await?; + + // Verify checksums match + verify_checksums_by_position(&checksums, &src_blocks, &dst, &dst_blocks)?; + + Ok(()) +} + +#[rstest] +#[case(StorageKind::Device(0), StorageKind::Disk(0))] +#[case(StorageKind::Disk(0), StorageKind::Device(0))] +#[tokio::test] +async fn test_gds( + #[case] src_kind: StorageKind, + #[values(LayoutType::FC, LayoutType::LW)] src_layout: LayoutType, + #[case] dst_kind: StorageKind, + #[values(LayoutType::FC, LayoutType::LW)] dst_layout: LayoutType, +) -> Result<()> { + let capabilities = TransferCapabilities::default().with_gds_if_supported(); + + if !capabilities.allow_gds { + println!("System does not support GDS. Skipping test."); + return Ok(()); + } + + let agent = build_agent_for_kinds(src_kind, dst_kind)?; + + let src = build_layout(agent.clone(), src_layout, src_kind, 4); + let dst = build_layout(agent.clone(), dst_layout, dst_kind, 4); + + let src_blocks = vec![0, 1]; + let dst_blocks = vec![2, 3]; + + let checksums = fill_and_checksum(&src, &src_blocks, FillPattern::Sequential)?; + let ctx = create_transfer_context(agent, Some(capabilities)).unwrap(); + + let notification = execute_transfer( + &src, + &dst, + &src_blocks, + &dst_blocks, + TransferOptions::default(), + ctx.context(), + )?; + notification.await?; + + verify_checksums_by_position(&checksums, &src_blocks, &dst, &dst_blocks)?; + + Ok(()) +} + +#[rstest] +#[case(StorageKind::Device(0), StorageKind::Disk(0))] +#[case(StorageKind::Disk(0), StorageKind::Device(0))] +#[tokio::test] +async fn test_buffered_transfer( + #[case] src_kind: StorageKind, + #[values(LayoutType::FC, LayoutType::LW)] src_layout: LayoutType, + #[case] dst_kind: StorageKind, + #[values(LayoutType::FC, LayoutType::LW)] dst_layout: LayoutType, +) -> Result<()> { + let agent = build_agent_for_kinds(src_kind, dst_kind)?; + + let src = build_layout(agent.clone(), src_layout, src_kind, 5); + let dst = build_layout(agent.clone(), dst_layout, dst_kind, 5); + + let src_blocks = vec![0, 1, 2, 3, 4]; + let dst_blocks = vec![4, 3, 2, 1, 0]; + + let bounce_layout = build_layout(agent.clone(), LayoutType::FC, StorageKind::Pinned, 3); + let bounce_buffer_spec: Arc = Arc::new(DummyBounceBufferSpec { + layout: bounce_layout, + block_ids: vec![0, 1, 2], + }); + + let checksums = fill_and_checksum(&src, &src_blocks, FillPattern::Sequential)?; + let ctx = create_transfer_context(agent, None).unwrap(); + + let notification = execute_transfer( + &src, + &dst, + &src_blocks, + &dst_blocks, + TransferOptions::builder() + .bounce_buffer(bounce_buffer_spec) + .build()?, + ctx.context(), + )?; + notification.await?; + + verify_checksums_by_position(&checksums, &src_blocks, &dst, &dst_blocks)?; + + Ok(()) +} + +#[rstest] 
+#[case(1024)] +#[case(2048)] +#[case(4096)] +#[case(8192)] +#[case(16384)] +#[tokio::test] +async fn test_large_block_counts(#[case] block_count: usize) { + let agent = create_test_agent(&format!("test_large_block_counts_{}", block_count)); + + let src = create_fc_layout(agent.clone(), StorageKind::Pinned, block_count); + let device = create_fc_layout(agent.clone(), StorageKind::Device(0), block_count); + + let src_blocks = (0..block_count).collect::>(); + let device_blocks = (0..block_count).collect::>(); + + let ctx = create_transfer_context(agent, None).unwrap(); + let notification = execute_transfer( + &src, + &device, + &src_blocks, + &device_blocks, + TransferOptions::default(), + ctx.context(), + ) + .unwrap(); + notification.await.unwrap(); +} + +// ============================================================================ +// Parameterized Bounce Tests with Guard Block Validation +// ============================================================================ + +/// Test bounce transfers with guard block validation. +/// +/// This test validates that: +/// 1. Data can be transferred: host[src_blocks] → bounce[src_blocks] → host[dst_blocks] +/// 2. Guard blocks adjacent to dst_blocks remain unchanged (no memory corruption) +/// 3. Works correctly with different storage types, layouts, and transfer modes +/// +/// Test pattern (6 blocks total): +/// - Source blocks: [0, 1] +/// - Destination blocks: [3, 4] +/// - Guard blocks: [2, 5] (adjacent to destination, should remain unchanged) +#[rstest] +// Storage combinations (host, bounce) +#[case(StorageKind::System, StorageKind::Pinned, "sys_pin")] +#[case(StorageKind::Pinned, StorageKind::System, "pin_sys")] +#[case(StorageKind::Pinned, StorageKind::Device(0), "pin_dev")] +#[tokio::test] +async fn test_bounce_with_guards_fc_fc_full( + #[case] host_storage: StorageKind, + #[case] bounce_storage: StorageKind, + #[case] name_suffix: &str, +) { + test_bounce_with_guards_impl( + host_storage, + bounce_storage, + LayoutKind::FC, + LayoutKind::FC, + TransferMode::FullBlocks, + name_suffix, + ) + .await + .unwrap(); +} + +#[rstest] +#[case(StorageKind::System, StorageKind::Pinned, "sys_pin")] +#[case(StorageKind::Pinned, StorageKind::System, "pin_sys")] +#[case(StorageKind::Pinned, StorageKind::Device(0), "pin_dev")] +#[tokio::test] +async fn test_bounce_with_guards_fc_lw_full( + #[case] host_storage: StorageKind, + #[case] bounce_storage: StorageKind, + #[case] name_suffix: &str, +) { + test_bounce_with_guards_impl( + host_storage, + bounce_storage, + LayoutKind::FC, + LayoutKind::LW, + TransferMode::FullBlocks, + name_suffix, + ) + .await + .unwrap(); +} + +#[rstest] +#[case(StorageKind::System, StorageKind::Pinned, "sys_pin")] +#[case(StorageKind::Pinned, StorageKind::System, "pin_sys")] +#[case(StorageKind::Pinned, StorageKind::Device(0), "pin_dev")] +#[tokio::test] +async fn test_bounce_with_guards_lw_fc_full( + #[case] host_storage: StorageKind, + #[case] bounce_storage: StorageKind, + #[case] name_suffix: &str, +) { + test_bounce_with_guards_impl( + host_storage, + bounce_storage, + LayoutKind::LW, + LayoutKind::FC, + TransferMode::FullBlocks, + name_suffix, + ) + .await + .unwrap(); +} + +#[rstest] +#[case(StorageKind::System, StorageKind::Pinned, "sys_pin")] +#[case(StorageKind::Pinned, StorageKind::System, "pin_sys")] +#[case(StorageKind::Pinned, StorageKind::Device(0), "pin_dev")] +#[tokio::test] +async fn test_bounce_with_guards_lw_lw_full( + #[case] host_storage: StorageKind, + #[case] bounce_storage: StorageKind, + #[case] 
name_suffix: &str, +) { + test_bounce_with_guards_impl( + host_storage, + bounce_storage, + LayoutKind::LW, + LayoutKind::LW, + TransferMode::FullBlocks, + name_suffix, + ) + .await + .unwrap(); +} + +#[rstest] +#[case(StorageKind::Pinned, StorageKind::Device(0), "pin_dev")] +#[tokio::test] +async fn test_bounce_with_guards_fc_fc_layer0( + #[case] host_storage: StorageKind, + #[case] bounce_storage: StorageKind, + #[case] name_suffix: &str, +) { + test_bounce_with_guards_impl( + host_storage, + bounce_storage, + LayoutKind::FC, + LayoutKind::FC, + TransferMode::FirstLayerOnly, + name_suffix, + ) + .await + .unwrap(); +} + +#[rstest] +#[case(StorageKind::Pinned, StorageKind::Device(0), "pin_dev")] +#[tokio::test] +async fn test_bounce_with_guards_lw_lw_layer0( + #[case] host_storage: StorageKind, + #[case] bounce_storage: StorageKind, + #[case] name_suffix: &str, +) { + test_bounce_with_guards_impl( + host_storage, + bounce_storage, + LayoutKind::LW, + LayoutKind::LW, + TransferMode::FirstLayerOnly, + name_suffix, + ) + .await + .unwrap(); +} + +/// Implementation helper for bounce tests with guard blocks. +async fn test_bounce_with_guards_impl( + host_storage: StorageKind, + bounce_storage: StorageKind, + host_layout: LayoutKind, + bounce_layout: LayoutKind, + mode: TransferMode, + name_suffix: &str, +) -> Result<()> { + let num_blocks = 6; + let test_name = format!( + "bounce_{}_{:?}_{:?}_{}_{}", + name_suffix, + host_layout, + bounce_layout, + mode.suffix(), + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_millis() + ); + let agent = create_test_agent(&test_name); + + // Create layouts + let host = create_layout( + agent.clone(), + LayoutSpec::new(host_layout, host_storage), + num_blocks, + ); + let bounce = create_layout( + agent.clone(), + LayoutSpec::new(bounce_layout, bounce_storage), + num_blocks, + ); + + // Block assignments: + // - Transfer: host[0,1] → bounce[0,1] → host[3,4] + // - Guards: host[2,5] (should remain unchanged) + let src_blocks = vec![0, 1]; + let dst_blocks = vec![3, 4]; + let guard_blocks = vec![2, 5]; + + // Setup: Fill source blocks and guard blocks + let src_checksums = + fill_and_checksum_with_mode(&host, &src_blocks, FillPattern::Sequential, mode)?; + let guard_checksums = create_guard_blocks(&host, &guard_blocks, FillPattern::Constant(0xFF))?; + + let ctx = create_transfer_context(agent, None)?; + + // Execute bounce: host[0,1] → bounce[0,1] + let notification = execute_transfer( + &host, + &bounce, + &src_blocks, + &src_blocks, + TransferOptions::from_layer_range(mode.layer_range()), + ctx.context(), + )?; + notification.await?; + + // Execute bounce: bounce[0,1] → host[3,4] + let notification = execute_transfer( + &bounce, + &host, + &src_blocks, + &dst_blocks, + TransferOptions::from_layer_range(mode.layer_range()), + ctx.context(), + )?; + notification.await?; + + // Verify: Data integrity + guards unchanged + verify_checksums_by_position_with_mode(&src_checksums, &src_blocks, &host, &dst_blocks, mode)?; + verify_guard_blocks_unchanged(&host, &guard_blocks, &guard_checksums)?; + + Ok(()) +} + +// ============================================================================ +// Parameterized Direct Transfer Tests +// ============================================================================ + +/// Test direct transfers with parameterization over storage, layout, and transfer mode. 
+/// +/// This demonstrates the DRY parameterized approach that can replace the 18 individual +/// tests above (System<=>System, Pinned<=>Pinned, cross-type, etc). +/// +/// Note: Only tests System<=>System, Pinned<=>Pinned, and System<=>Pinned since we can only +/// fill/checksum System and Pinned storage. For Device tests, use bounce tests instead. +#[rstest] +// Storage combinations (only fillable storage types) +#[case(StorageKind::System, StorageKind::System, "sys_sys")] +#[case(StorageKind::Pinned, StorageKind::Pinned, "pin_pin")] +#[case(StorageKind::System, StorageKind::Pinned, "sys_pin")] +#[case(StorageKind::Pinned, StorageKind::System, "pin_sys")] +#[tokio::test] +async fn test_direct_transfer_fc_fc_full( + #[case] src_storage: StorageKind, + #[case] dst_storage: StorageKind, + #[case] name_suffix: &str, +) { + test_direct_transfer_impl( + src_storage, + dst_storage, + LayoutKind::FC, + LayoutKind::FC, + TransferMode::FullBlocks, + name_suffix, + ) + .await + .unwrap(); +} + +#[rstest] +#[case(StorageKind::System, StorageKind::Pinned, "sys_pin")] +#[case(StorageKind::Pinned, StorageKind::System, "pin_sys")] +#[tokio::test] +async fn test_direct_transfer_fc_lw_layer0( + #[case] src_storage: StorageKind, + #[case] dst_storage: StorageKind, + #[case] name_suffix: &str, +) { + test_direct_transfer_impl( + src_storage, + dst_storage, + LayoutKind::FC, + LayoutKind::LW, + TransferMode::FirstLayerOnly, + name_suffix, + ) + .await + .unwrap(); +} + +#[rstest] +#[case(StorageKind::Pinned, StorageKind::Pinned, "pin_pin")] +#[tokio::test] +async fn test_direct_transfer_lw_lw_layer1( + #[case] src_storage: StorageKind, + #[case] dst_storage: StorageKind, + #[case] name_suffix: &str, +) { + test_direct_transfer_impl( + src_storage, + dst_storage, + LayoutKind::LW, + LayoutKind::LW, + TransferMode::SecondLayerOnly, + name_suffix, + ) + .await + .unwrap(); +} + +/// Implementation helper for direct transfer tests. 
+async fn test_direct_transfer_impl(
+    src_storage: StorageKind,
+    dst_storage: StorageKind,
+    src_layout: LayoutKind,
+    dst_layout: LayoutKind,
+    mode: TransferMode,
+    name_suffix: &str,
+) -> Result<()> {
+    let num_blocks = 4;
+    let test_name = format!(
+        "direct_{}_{:?}_{:?}_{}_{}",
+        name_suffix,
+        src_layout,
+        dst_layout,
+        mode.suffix(),
+        std::time::SystemTime::now()
+            .duration_since(std::time::UNIX_EPOCH)
+            .unwrap()
+            .as_millis()
+    );
+    let agent = create_test_agent(&test_name);
+
+    // Create layouts
+    let src = create_layout(
+        agent.clone(),
+        LayoutSpec::new(src_layout, src_storage),
+        num_blocks,
+    );
+    let dst = create_layout(
+        agent.clone(),
+        LayoutSpec::new(dst_layout, dst_storage),
+        num_blocks,
+    );
+
+    // Transfer src[0,1] -> dst[2,3]
+    let src_blocks = vec![0, 1];
+    let dst_blocks = vec![2, 3];
+
+    // Fill source and compute checksums
+    let src_checksums =
+        fill_and_checksum_with_mode(&src, &src_blocks, FillPattern::Sequential, mode)?;
+
+    let ctx = create_transfer_context(agent, None)?;
+
+    // Execute transfer
+    let notification = execute_transfer(
+        &src,
+        &dst,
+        &src_blocks,
+        &dst_blocks,
+        TransferOptions::from_layer_range(mode.layer_range()),
+        ctx.context(),
+    )?;
+    notification.await?;
+
+    // Verify data integrity
+    verify_checksums_by_position_with_mode(&src_checksums, &src_blocks, &dst, &dst_blocks, mode)?;
+
+    Ok(())
+}
diff --git a/lib/llm/src/block_manager/v2/physical/transfer/tests/mod.rs b/lib/llm/src/block_manager/v2/physical/transfer/tests/mod.rs
new file mode 100644
index 0000000000..709e8f65f8
--- /dev/null
+++ b/lib/llm/src/block_manager/v2/physical/transfer/tests/mod.rs
@@ -0,0 +1,220 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Comprehensive transfer tests for verifying data integrity across storage types and layout configurations.
+
+#[cfg(all(feature = "testing-cuda", feature = "testing-nixl"))]
+mod local_transfers;
+
+use super::{NixlAgent, PhysicalLayout};
+use crate::block_manager::v2::physical::layout::{
+    LayoutConfig,
+    builder::{HasConfig, NoLayout, NoMemory, PhysicalLayoutBuilder},
+};
+
+/// Standard layout configuration for all tests.
+pub fn standard_config(num_blocks: usize) -> LayoutConfig {
+    LayoutConfig::builder()
+        .num_blocks(num_blocks)
+        .num_layers(2)
+        .outer_dim(2)
+        .page_size(16)
+        .inner_dim(128)
+        .dtype_width_bytes(2)
+        .build()
+        .unwrap()
+}
+
+/// Helper function for creating a PhysicalLayout builder with standard config.
+///
+/// This is used by other test modules (fill, checksum, validation) for backwards compatibility.
+pub fn builder(num_blocks: usize) -> PhysicalLayoutBuilder<HasConfig, NoLayout, NoMemory> {
+    let agent = create_test_agent("test_agent");
+    let config = standard_config(num_blocks);
+    PhysicalLayout::builder(agent).with_config(config)
+}
+
+/// Create a test agent with optimal backends for testing.
+///
+/// Attempts to initialize UCX, GDS, and POSIX backends. Falls back gracefully
+/// if some backends are unavailable (e.g., GDS on non-DGX machines).
+pub fn create_test_agent(name: &str) -> NixlAgent {
+    NixlAgent::require_backends(name, &[]).expect("Failed to require backends")
+}
+
+#[cfg(feature = "testing-cuda")]
+pub(crate) mod cuda {
+    use anyhow::Result;
+    use cudarc::driver::sys::CUdevice_attribute_enum;
+    use cudarc::driver::{CudaContext, CudaStream, LaunchConfig, PushKernelArg};
+    use cudarc::nvrtc::{CompileOptions, compile_ptx_with_opts};
+    use std::collections::HashMap;
+    use std::sync::{Arc, OnceLock};
+    use std::time::{Duration, Instant};
+
+    /// CUDA sleep kernel source code.
+    pub const SLEEP_KERNEL_SRC: &str = r#"
+    extern "C" __global__ void sleep_kernel(unsigned long long min_cycles) {
+        const unsigned long long start = clock64();
+        while ((clock64() - start) < min_cycles) {
+            asm volatile("");
+        }
+    }
+    "#;
+
+    /// A reusable CUDA sleep utility for tests.
+    ///
+    /// This struct provides a simple interface to execute GPU sleep operations
+    /// with calibrated timing. It compiles the sleep kernel once per CUDA context
+    /// and caches the calibration for reuse.
+    ///
+    /// The calibration is conservative (prefers longer sleep durations over shorter)
+    /// to ensure minimum sleep times are met.
+    pub struct CudaSleep {
+        function: cudarc::driver::CudaFunction,
+        cycles_per_ms: f64,
+    }
+
+    impl CudaSleep {
+        /// Get or create a CudaSleep instance for the given CUDA context.
+        ///
+        /// This function uses lazy initialization and caches instances per device ID.
+        /// The first call for each device will compile the kernel and run calibration.
+        ///
+        /// # Arguments
+        /// * `cuda_ctx` - The CUDA context to use
+        ///
+        /// # Returns
+        /// A shared reference to the CudaSleep instance for this context's device.
+        pub fn for_context(cuda_ctx: &Arc<CudaContext>) -> Result<Arc<Self>> {
+            static INSTANCES: OnceLock<parking_lot::Mutex<HashMap<usize, Arc<CudaSleep>>>> =
+                OnceLock::new();
+
+            let instances = INSTANCES.get_or_init(|| parking_lot::Mutex::new(HashMap::new()));
+            let device_ordinal = cuda_ctx.ordinal();
+
+            // Fast path: check if instance already exists
+            {
+                let instances_guard = instances.lock();
+                if let Some(instance) = instances_guard.get(&device_ordinal) {
+                    return Ok(Arc::clone(instance));
+                }
+            }
+
+            // Slow path: create new instance with calibration
+            let instance = Arc::new(Self::new(cuda_ctx)?);
+
+            // Store in cache
+            let mut instances_guard = instances.lock();
+            instances_guard
+                .entry(device_ordinal)
+                .or_insert_with(|| Arc::clone(&instance));
+
+            Ok(instance)
+        }
+
+        /// Create a new CudaSleep instance with calibration.
+        ///
+        /// This compiles the sleep kernel and runs a calibration loop to determine
+        /// the relationship between clock cycles and wall-clock time.
+        fn new(cuda_ctx: &Arc<CudaContext>) -> Result<Self> {
+            // Get device compute capability
+            let major = cuda_ctx
+                .attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR)?;
+            let minor = cuda_ctx
+                .attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)?;
+
+            // Compile PTX for this device
+            let mut compile_opts = CompileOptions {
+                name: Some("sleep_kernel.cu".into()),
+                ..Default::default()
+            };
+            compile_opts
+                .options
+                .push(format!("--gpu-architecture=compute_{}{}", major, minor));
+            let ptx = compile_ptx_with_opts(SLEEP_KERNEL_SRC, compile_opts)?;
+            let module = cuda_ctx.load_module(ptx)?;
+            let function = module.load_function("sleep_kernel")?;
+
+            // Get device clock rate
+            let clock_rate_khz =
+                cuda_ctx.attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_CLOCK_RATE)? as u64;
+
+            // Create a temporary stream for calibration
+            let stream = cuda_ctx.new_stream()?;
+
+            // Warm up to absorb JIT overhead
+            let warm_cycles = clock_rate_khz.saturating_mul(10).max(1);
+            Self::launch_kernel(&function, &stream, warm_cycles)?;
+            stream.synchronize()?;
+
+            // Run calibration loop
+            let desired_delay = Duration::from_millis(600);
+            let mut target_cycles = clock_rate_khz.saturating_mul(50).max(1); // ~50ms starting point
+            let mut actual_duration = Duration::ZERO;
+
+            for _ in 0..8 {
+                let start = Instant::now();
+                Self::launch_kernel(&function, &stream, target_cycles)?;
+                stream.synchronize()?;
+                actual_duration = start.elapsed();
+
+                if actual_duration >= desired_delay {
+                    break;
+                }
+
+                target_cycles = target_cycles.saturating_mul(2);
+            }
+
+            // Calculate cycles per millisecond with conservative 20% margin
+            // (prefer longer sleeps over shorter)
+            let cycles_per_ms = if actual_duration.as_millis() > 0 {
+                (target_cycles as f64 / actual_duration.as_millis() as f64) * 1.2
+            } else {
+                clock_rate_khz as f64 // Fallback to clock rate
+            };
+
+            Ok(Self {
+                function,
+                cycles_per_ms,
+            })
+        }
+
+        /// Launch the sleep kernel with the specified number of cycles.
+        fn launch_kernel(
+            function: &cudarc::driver::CudaFunction,
+            stream: &Arc<CudaStream>,
+            cycles: u64,
+        ) -> Result<()> {
+            let launch_cfg = LaunchConfig {
+                grid_dim: (1, 1, 1),
+                block_dim: (1, 1, 1),
+                shared_mem_bytes: 0,
+            };
+
+            let mut launch = stream.launch_builder(function);
+            unsafe {
+                launch.arg(&cycles);
+                launch.launch(launch_cfg)?;
+            }
+
+            Ok(())
+        }
+
+        /// Launch a sleep operation on the given stream.
+        ///
+        /// This queues a GPU kernel that will sleep for approximately the specified
+        /// duration. The sleep is conservative and may take longer than requested.
+        ///
+        /// # Arguments
+        /// * `duration` - The minimum duration to sleep
+        /// * `stream` - The CUDA stream to launch the kernel on
+        ///
+        /// # Returns
+        /// Ok(()) if the kernel was successfully queued
+        pub fn launch(&self, duration: Duration, stream: &Arc<CudaStream>) -> Result<()> {
+            let target_cycles = (duration.as_millis() as f64 * self.cycles_per_ms) as u64;
+            Self::launch_kernel(&self.function, stream, target_cycles)
+        }
+    }
+}
diff --git a/lib/llm/src/block_manager/v2/physical/transfer/validation.rs b/lib/llm/src/block_manager/v2/physical/transfer/validation.rs
new file mode 100644
index 0000000000..dc460222ab
--- /dev/null
+++ b/lib/llm/src/block_manager/v2/physical/transfer/validation.rs
@@ -0,0 +1,463 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Block ID validation for transfers.
+//!
+//! This module provides validation functions to ensure block transfers are safe and correct.
+
+use super::PhysicalLayout;
+use std::collections::HashSet;
+use thiserror::Error;
+
+/// Validation errors for block transfers.
+#[derive(Debug, Error, PartialEq)]
+pub enum BlockValidationError {
+    /// Destination block IDs contain duplicates.
+    #[error("Destination block IDs are not unique: duplicates = {duplicates:?}")]
+    DuplicateDestinationBlocks { duplicates: Vec<usize> },
+
+    /// Source and destination blocks overlap when using the same layout.
+    #[error("Source and destination blocks overlap (same layout): overlapping = {overlapping:?}")]
+    OverlappingBlocks { overlapping: Vec<usize> },
+
+    /// Lists have mismatched lengths.
+    #[error(
+        "Block ID lists have mismatched lengths: src={src_len}, dst={dst_len}, bounce={bounce_len:?}"
+    )]
+    LengthMismatch {
+        src_len: usize,
+        dst_len: usize,
+        bounce_len: Option<usize>,
+    },
+
+    /// Block ID is out of range for the layout.
+    #[error("Block ID {block_id} out of range for {layout_name} (max={max})")]
+    BlockOutOfRange {
+        block_id: usize,
+        layout_name: &'static str,
+        max: usize,
+    },
+
+    /// Bounce block IDs contain duplicates.
+    #[error("Bounce block IDs are not unique: duplicates = {duplicates:?}")]
+    DuplicateBounceBlocks { duplicates: Vec<usize> },
+}
+
+/// Validate that destination block IDs are unique (no duplicates).
+///
+/// # Arguments
+/// * `dst_block_ids` - Destination block IDs
+///
+/// # Returns
+/// Ok(()) if unique, Err with duplicate IDs otherwise
+pub fn validate_dst_unique(dst_block_ids: &[usize]) -> Result<(), BlockValidationError> {
+    let mut seen = HashSet::new();
+    let mut duplicates = Vec::new();
+
+    for &id in dst_block_ids {
+        if !seen.insert(id) && !duplicates.contains(&id) {
+            duplicates.push(id);
+        }
+    }
+
+    if duplicates.is_empty() {
+        Ok(())
+    } else {
+        Err(BlockValidationError::DuplicateDestinationBlocks { duplicates })
+    }
+}
+
+/// Validate that bounce block IDs are unique (no duplicates).
+pub fn validate_bounce_unique(bounce_block_ids: &[usize]) -> Result<(), BlockValidationError> {
+    let mut seen = HashSet::new();
+    let mut duplicates = Vec::new();
+
+    for &id in bounce_block_ids {
+        if !seen.insert(id) && !duplicates.contains(&id) {
+            duplicates.push(id);
+        }
+    }
+
+    if duplicates.is_empty() {
+        Ok(())
+    } else {
+        Err(BlockValidationError::DuplicateBounceBlocks { duplicates })
+    }
+}
+
+/// Check if two layouts are the same by comparing their Arc pointers.
+///
+/// This is a conservative check - if pointers differ, layouts might still be the same
+/// but we treat them as different to avoid false positives in disjoint validation.
+fn are_same_layout(layout1: &PhysicalLayout, layout2: &PhysicalLayout) -> bool {
+    // Compare Arc pointer addresses
+    std::ptr::eq(
+        std::sync::Arc::as_ptr(layout1.layout()),
+        std::sync::Arc::as_ptr(layout2.layout()),
+    )
+}
+
+/// Validate that src and dst block IDs are disjoint when using the same layout.
+///
+/// Only enforced in debug mode when src and dst point to the same layout.
+///
+/// # Arguments
+/// * `src_block_ids` - Source block IDs
+/// * `dst_block_ids` - Destination block IDs
+/// * `src_layout` - Source physical layout
+/// * `dst_layout` - Destination physical layout
+#[cfg(debug_assertions)]
+pub fn validate_disjoint_same_layout(
+    src_block_ids: &[usize],
+    dst_block_ids: &[usize],
+    src_layout: &PhysicalLayout,
+    dst_layout: &PhysicalLayout,
+) -> Result<(), BlockValidationError> {
+    // Only check if same layout
+    if !are_same_layout(src_layout, dst_layout) {
+        return Ok(());
+    }
+
+    let src_set: HashSet<_> = src_block_ids.iter().copied().collect();
+    let overlapping: Vec<_> = dst_block_ids
+        .iter()
+        .filter(|id| src_set.contains(id))
+        .copied()
+        .collect();
+
+    if overlapping.is_empty() {
+        Ok(())
+    } else {
+        Err(BlockValidationError::OverlappingBlocks { overlapping })
+    }
+}
+
+/// Validate block IDs are in range for a layout.
+#[cfg(debug_assertions)] +pub fn validate_block_ids_in_range( + block_ids: &[usize], + layout: &PhysicalLayout, + layout_name: &'static str, +) -> Result<(), BlockValidationError> { + let max_blocks = layout.layout().config().num_blocks; + + for &block_id in block_ids { + if block_id >= max_blocks { + return Err(BlockValidationError::BlockOutOfRange { + block_id, + layout_name, + max: max_blocks, + }); + } + } + + Ok(()) +} + +/// Full validation for block transfer (debug mode). +/// +/// Validates: +/// - List lengths match +/// - Destination IDs are unique +/// - Bounce IDs are unique (if provided) +/// - Source and destination are disjoint (if same layout) +/// - All block IDs are in range for their respective layouts +#[cfg(debug_assertions)] +pub fn validate_block_transfer( + src_block_ids: &[usize], + dst_block_ids: &[usize], + bounce_block_ids: Option<&[usize]>, + src_layout: &PhysicalLayout, + dst_layout: &PhysicalLayout, + bounce_layout: Option<&PhysicalLayout>, +) -> Result<(), BlockValidationError> { + // Validate lengths + if src_block_ids.len() != dst_block_ids.len() { + return Err(BlockValidationError::LengthMismatch { + src_len: src_block_ids.len(), + dst_len: dst_block_ids.len(), + bounce_len: bounce_block_ids.map(|ids| ids.len()), + }); + } + + if let Some(bounce_ids) = bounce_block_ids + && bounce_ids.len() != src_block_ids.len() + { + return Err(BlockValidationError::LengthMismatch { + src_len: src_block_ids.len(), + dst_len: dst_block_ids.len(), + bounce_len: Some(bounce_ids.len()), + }); + } + + #[cfg(debug_assertions)] + { + // Validate destination uniqueness + validate_dst_unique(dst_block_ids)?; + + // Validate bounce uniqueness if provided + if let Some(bounce_ids) = bounce_block_ids { + validate_bounce_unique(bounce_ids)?; + } + + // Validate disjoint if same layout + validate_disjoint_same_layout(src_block_ids, dst_block_ids, src_layout, dst_layout)?; + + // Validate block IDs in range + validate_block_ids_in_range(src_block_ids, src_layout, "source")?; + validate_block_ids_in_range(dst_block_ids, dst_layout, "destination")?; + if let (Some(bounce_ids), Some(bounce_layout)) = (bounce_block_ids, bounce_layout) { + validate_block_ids_in_range(bounce_ids, bounce_layout, "bounce")?; + } + } + + Ok(()) +} + +/// Minimal validation for block transfer (release mode). 
+/// +/// Only validates: +/// - List lengths match +/// - Destination IDs are unique +#[cfg(not(debug_assertions))] +pub fn validate_block_transfer( + src_block_ids: &[usize], + dst_block_ids: &[usize], + bounce_block_ids: Option<&[usize]>, + _src_layout: &PhysicalLayout, + _dst_layout: &PhysicalLayout, + _bounce_layout: Option<&PhysicalLayout>, +) -> Result<(), BlockValidationError> { + // Validate lengths + if src_block_ids.len() != dst_block_ids.len() { + return Err(BlockValidationError::LengthMismatch { + src_len: src_block_ids.len(), + dst_len: dst_block_ids.len(), + bounce_len: bounce_block_ids.map(|ids| ids.len()), + }); + } + + if let Some(bounce_ids) = bounce_block_ids { + if bounce_ids.len() != src_block_ids.len() { + return Err(BlockValidationError::LengthMismatch { + src_len: src_block_ids.len(), + dst_len: dst_block_ids.len(), + bounce_len: Some(bounce_ids.len()), + }); + } + } + + // Validate destination uniqueness + validate_dst_unique(dst_block_ids)?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::super::tests::*; + use super::*; + + #[test] + fn test_dst_unique_valid() { + let ids = vec![0, 1, 2, 3, 4]; + assert!(validate_dst_unique(&ids).is_ok()); + } + + #[test] + fn test_dst_unique_duplicate() { + let ids = vec![0, 1, 2, 1, 3]; + let result = validate_dst_unique(&ids); + assert!(result.is_err()); + match result.unwrap_err() { + BlockValidationError::DuplicateDestinationBlocks { duplicates } => { + assert_eq!(duplicates, vec![1]); + } + _ => panic!("Wrong error type"), + } + } + + #[test] + fn test_dst_unique_multiple_duplicates() { + let ids = vec![0, 1, 2, 1, 3, 2]; + let result = validate_dst_unique(&ids); + assert!(result.is_err()); + match result.unwrap_err() { + BlockValidationError::DuplicateDestinationBlocks { duplicates } => { + assert!(duplicates.contains(&1)); + assert!(duplicates.contains(&2)); + } + _ => panic!("Wrong error type"), + } + } + + #[test] + #[cfg(debug_assertions)] + fn test_disjoint_same_layout_valid() { + let physical = builder(2) + .fully_contiguous() + .allocate_system() + .build() + .unwrap(); + + let src_ids = vec![0, 1, 2]; + let dst_ids = vec![5, 6, 7]; + + assert!(validate_disjoint_same_layout(&src_ids, &dst_ids, &physical, &physical).is_ok()); + } + + #[test] + #[cfg(debug_assertions)] + fn test_disjoint_same_layout_overlap() { + let physical = builder(2) + .fully_contiguous() + .allocate_system() + .build() + .unwrap(); + + let src_ids = vec![0, 1, 2]; + let dst_ids = vec![2, 3, 4]; // 2 overlaps + + let result = validate_disjoint_same_layout(&src_ids, &dst_ids, &physical, &physical); + assert!(result.is_err()); + match result.unwrap_err() { + BlockValidationError::OverlappingBlocks { overlapping } => { + assert_eq!(overlapping, vec![2]); + } + _ => panic!("Wrong error type"), + } + } + + #[test] + fn test_disjoint_different_layouts_ok() { + let physical1 = builder(2) + .fully_contiguous() + .allocate_system() + .build() + .unwrap(); + + let physical2 = builder(2) + .fully_contiguous() + .allocate_system() + .build() + .unwrap(); + + let src_ids = vec![0, 1, 2]; + let dst_ids = vec![0, 1, 2]; // Same IDs but different layouts + + // Should be OK since different layouts + #[cfg(debug_assertions)] + assert!(validate_disjoint_same_layout(&src_ids, &dst_ids, &physical1, &physical2).is_ok()); + } + + #[test] + fn test_length_mismatch() { + let physical1 = builder(2) + .fully_contiguous() + .allocate_system() + .build() + .unwrap(); + + let physical2 = builder(2) + .fully_contiguous() + .allocate_system() + .build() + .unwrap(); + 
+
+        let src_ids = vec![0, 1, 2];
+        let dst_ids = vec![5, 6]; // Different length
+
+        let result =
+            validate_block_transfer(&src_ids, &dst_ids, None, &physical1, &physical2, None);
+        assert!(result.is_err());
+        match result.unwrap_err() {
+            BlockValidationError::LengthMismatch {
+                src_len,
+                dst_len,
+                bounce_len,
+            } => {
+                assert_eq!(src_len, 3);
+                assert_eq!(dst_len, 2);
+                assert_eq!(bounce_len, None);
+            }
+            _ => panic!("Wrong error type"),
+        }
+    }
+
+    // #[test]
+    // #[cfg(debug_assertions)]
+    // fn test_block_out_of_range() {
+    //     let (_layout, physical) = create_test_layout(5); // Only 5 blocks
+    //     let src_ids = vec![0, 1, 2];
+    //     let dst_ids = vec![3, 4, 10]; // 10 is out of range
+
+    //     let result = validate_block_ids_in_range(&dst_ids, &physical, "destination");
+    //     assert!(result.is_err());
+    //     match result.unwrap_err() {
+    //         BlockValidationError::BlockOutOfRange {
+    //             block_id,
+    //             layout_name,
+    //             max,
+    //         } => {
+    //             assert_eq!(block_id, 10);
+    //             assert_eq!(layout_name, "destination");
+    //             assert_eq!(max, 5);
+    //         }
+    //         _ => panic!("Wrong error type"),
+    //     }
+    // }
+
+    // #[test]
+    // fn test_bounce_length_mismatch() {
+    //     let (_layout1, physical1) = create_test_layout(10);
+    //     let (_layout2, physical2) = create_test_layout(10);
+    //     let (_layout3, physical3) = create_test_layout(10);
+    //     let src_ids = vec![0, 1, 2];
+    //     let dst_ids = vec![5, 6, 7];
+    //     let bounce_ids = vec![8, 9]; // Wrong length
+
+    //     let result = validate_block_transfer(
+    //         &src_ids,
+    //         &dst_ids,
+    //         Some(&bounce_ids),
+    //         &physical1,
+    //         &physical2,
+    //         Some(&physical3),
+    //     );
+    //     assert!(result.is_err());
+    //     match result.unwrap_err() {
+    //         BlockValidationError::LengthMismatch {
+    //             src_len,
+    //             dst_len,
+    //             bounce_len,
+    //         } => {
+    //             assert_eq!(src_len, 3);
+    //             assert_eq!(dst_len, 3);
+    //             assert_eq!(bounce_len, Some(2));
+    //         }
+    //         _ => panic!("Wrong error type"),
+    //     }
+    // }
+
+    // #[test]
+    // fn test_full_validation_success() {
+    //     let (_layout1, physical1) = create_test_layout(10);
+    //     let (_layout2, physical2) = create_test_layout(10);
+    //     let (_layout3, physical3) = create_test_layout(10);
+    //     let src_ids = vec![0, 1, 2];
+    //     let dst_ids = vec![5, 6, 7];
+    //     let bounce_ids = vec![8, 9, 3];
+
+    //     assert!(
+    //         validate_block_transfer(
+    //             &src_ids,
+    //             &dst_ids,
+    //             Some(&bounce_ids),
+    //             &physical1,
+    //             &physical2,
+    //             Some(&physical3),
+    //         )
+    //         .is_ok()
+    //     );
+    // }
+}
diff --git a/lib/runtime/src/config.rs b/lib/runtime/src/config.rs
index fbfc457fe7..a30e72991b 100644
--- a/lib/runtime/src/config.rs
+++ b/lib/runtime/src/config.rs
@@ -397,6 +397,19 @@ pub fn is_truthy(val: &str) -> bool {
     matches!(val.to_lowercase().as_str(), "1" | "true" | "on" | "yes")
 }
 
+pub fn parse_bool(val: &str) -> anyhow::Result<bool> {
+    if is_truthy(val) {
+        Ok(true)
+    } else if is_falsey(val) {
+        Ok(false)
+    } else {
+        anyhow::bail!(
+            "Invalid boolean value: '{}'. Expected one of: true/false, 1/0, on/off, yes/no",
+            val
+        )
+    }
+}
+
 /// Check if a string is falsey
 /// This will be used to evaluate environment variables or any other subjective
 /// configuration parameters that can be set by the user that should be evaluated
diff --git a/lib/runtime/src/lib.rs b/lib/runtime/src/lib.rs
index d44da24c80..86c3ced1fa 100644
--- a/lib/runtime/src/lib.rs
+++ b/lib/runtime/src/lib.rs
@@ -17,7 +17,7 @@ pub use anyhow::{
 
 use async_once_cell::OnceCell;
 
-mod config;
+pub mod config;
 pub use config::RuntimeConfig;
 
 pub mod component;
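The parameterized tests in local_transfers.rs above all follow one shape: rstest expands each #[case] row into its own async test, and a thin wrapper forwards the case values to a shared *_impl helper. A minimal, self-contained sketch of that pattern follows; the helper, the string labels, and the asserts are hypothetical stand-ins for the crate's StorageKind/LayoutKind machinery, not part of the patch.

use rstest::rstest;

// Shared implementation: the real tests build layouts and execute transfers
// here; this stub only demonstrates the call shape.
async fn transfer_roundtrip_impl(src: &str, dst: &str) -> anyhow::Result<()> {
    assert!(!src.is_empty(), "source label must be non-empty");
    assert!(!dst.is_empty(), "destination label must be non-empty");
    Ok(())
}

// Each #[case] becomes its own async test, mirroring the attribute order used
// in the patch (#[rstest], then the cases, then #[tokio::test]).
#[rstest]
#[case("sys", "pin")]
#[case("pin", "sys")]
#[tokio::test]
async fn transfer_roundtrip(#[case] src: &str, #[case] dst: &str) {
    transfer_roundtrip_impl(src, dst).await.unwrap();
}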
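CudaSleep's calibration loop in tests/mod.rs doubles the kernel workload until a single launch takes at least the desired delay, then derives cycles-per-millisecond with a 20% margin so later sleeps err on the long side. The host-only sketch below reproduces that strategy with std::thread::sleep standing in for the GPU kernel; the simulated clock rate and delays are made up for illustration.

use std::time::{Duration, Instant};

// Double the workload until one run meets the target delay, then estimate the
// rate with a 20% safety margin (prefer sleeping too long over too short).
fn calibrate(desired_delay: Duration, mut target_cycles: u64, true_cycles_per_ms: u64) -> f64 {
    let mut actual = Duration::ZERO;
    for _ in 0..8 {
        let start = Instant::now();
        // Stand-in workload: pretend each "cycle" costs 1 / true_cycles_per_ms ms.
        std::thread::sleep(Duration::from_millis(target_cycles / true_cycles_per_ms));
        actual = start.elapsed();
        if actual >= desired_delay {
            break;
        }
        target_cycles = target_cycles.saturating_mul(2);
    }
    if actual.as_millis() > 0 {
        (target_cycles as f64 / actual.as_millis() as f64) * 1.2
    } else {
        true_cycles_per_ms as f64 // fallback, as in the patch
    }
}

fn main() {
    let cycles_per_ms = calibrate(Duration::from_millis(60), 1_000, 1_000);
    // The estimate should land near the true rate, padded by the 20% margin.
    println!("estimated cycles/ms: {cycles_per_ms:.0}");
}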
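validate_dst_unique and validate_bounce_unique in validation.rs share one duplicate-detection idea: a single pass that records each ID in a HashSet and collects every value seen more than once, reporting each duplicate only one time. A standalone sketch of just that pattern, with a hypothetical find_duplicates helper:

use std::collections::HashSet;

// `insert` returns false when the value was already present, which flags a
// duplicate; the `contains` check keeps each duplicate from being listed twice.
fn find_duplicates(ids: &[usize]) -> Vec<usize> {
    let mut seen = HashSet::new();
    let mut duplicates = Vec::new();
    for &id in ids {
        if !seen.insert(id) && !duplicates.contains(&id) {
            duplicates.push(id);
        }
    }
    duplicates
}

fn main() {
    assert!(find_duplicates(&[0, 1, 2, 3]).is_empty());
    assert_eq!(find_duplicates(&[0, 1, 2, 1, 3, 2]), vec![1, 2]);
    println!("duplicate detection sketch ok");
}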
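are_same_layout compares Arc allocation identity rather than structural equality: two handles count as "the same layout" only if they point at the same allocation, which is the conservative choice for deciding when the disjointness check must run. A small sketch of that check in isolation (same_allocation is a hypothetical name):

use std::sync::Arc;

// Identity, not equality: structurally equal but separately allocated values
// compare as different, so the overlap check is skipped only for true aliases.
fn same_allocation<T>(a: &Arc<T>, b: &Arc<T>) -> bool {
    std::ptr::eq(Arc::as_ptr(a), Arc::as_ptr(b))
}

fn main() {
    let first = Arc::new(vec![0u8; 16]);
    let alias = Arc::clone(&first);
    let second = Arc::new(vec![0u8; 16]);

    assert!(same_allocation(&first, &alias)); // same allocation -> same layout
    assert!(!same_allocation(&first, &second)); // equal contents, different allocation
    println!("arc identity sketch ok");
}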
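The new parse_bool in lib/runtime/src/config.rs turns the existing is_truthy/is_falsey helpers into a strict parser that rejects unrecognized values instead of silently treating them as false, and lib.rs now exposes the config module publicly so callers outside the crate can reach it. A usage sketch; the DYN_SOME_FLAG variable name, the read_flag_from_env helper, and the dynamo_runtime crate path are assumptions for illustration only.

// Read an optional boolean flag from the environment, failing loudly on typos
// like "treu" instead of defaulting them to false.
fn read_flag_from_env() -> anyhow::Result<bool> {
    match std::env::var("DYN_SOME_FLAG") {
        // Strict parsing: only 1/0, true/false, on/off, yes/no are accepted.
        Ok(raw) => dynamo_runtime::config::parse_bool(&raw),
        // Unset variable: fall back to a default rather than erroring.
        Err(std::env::VarError::NotPresent) => Ok(false),
        Err(e) => Err(e.into()),
    }
}

fn main() -> anyhow::Result<()> {
    println!("flag enabled: {}", read_flag_from_env()?);
    Ok(())
}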