diff --git a/Cargo.lock b/Cargo.lock index d7b47e8ef2..45004612e5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -604,6 +604,26 @@ dependencies = [ "serde", ] +[[package]] +name = "bincode" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" +dependencies = [ + "bincode_derive", + "serde", + "unty", +] + +[[package]] +name = "bincode_derive" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" +dependencies = [ + "virtue", +] + [[package]] name = "bindgen" version = "0.71.1" @@ -2136,7 +2156,7 @@ dependencies = [ "async_zmq", "axum 0.8.4", "axum-server", - "bincode", + "bincode 2.0.1", "bitflags 2.9.4", "blake3", "bs62", @@ -2273,7 +2293,7 @@ dependencies = [ "async-trait", "async_zmq", "axum 0.8.4", - "bincode", + "bincode 1.3.3", "blake3", "bytes", "chrono", @@ -5575,9 +5595,9 @@ checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" [[package]] name = "ordered-float" -version = "5.0.0" +version = "5.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2c1f9f56e534ac6a9b8a4600bdf0f530fb393b5f393e7b4d03489c3cf0c3f01" +checksum = "7f4779c6901a562440c3786d08192c6fbda7c1c2060edd10006b05ee35d10f2d" dependencies = [ "num-traits", ] @@ -7899,9 +7919,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "symphonia" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "815c942ae7ee74737bb00f965fa5b5a2ac2ce7b6c01c0cc169bbeaf7abd5f5a9" +checksum = "5773a4c030a19d9bfaa090f49746ff35c75dfddfa700df7a5939d5e076a57039" dependencies = [ "lazy_static", "symphonia-bundle-flac", @@ -7917,9 +7937,9 @@ dependencies = [ [[package]] name = "symphonia-bundle-flac" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72e34f34298a7308d4397a6c7fbf5b84c5d491231ce3dd379707ba673ab3bd97" +checksum = "c91565e180aea25d9b80a910c546802526ffd0072d0b8974e3ebe59b686c9976" dependencies = [ "log", "symphonia-core", @@ -7929,9 +7949,9 @@ dependencies = [ [[package]] name = "symphonia-bundle-mp3" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c01c2aae70f0f1fb096b6f0ff112a930b1fb3626178fba3ae68b09dce71706d4" +checksum = "4872dd6bb56bf5eac799e3e957aa1981086c3e613b27e0ac23b176054f7c57ed" dependencies = [ "lazy_static", "log", @@ -7941,9 +7961,9 @@ dependencies = [ [[package]] name = "symphonia-codec-pcm" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f395a67057c2ebc5e84d7bb1be71cce1a7ba99f64e0f0f0e303a03f79116f89b" +checksum = "4e89d716c01541ad3ebe7c91ce4c8d38a7cf266a3f7b2f090b108fb0cb031d95" dependencies = [ "log", "symphonia-core", @@ -7951,9 +7971,9 @@ dependencies = [ [[package]] name = "symphonia-codec-vorbis" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a98765fb46a0a6732b007f7e2870c2129b6f78d87db7987e6533c8f164a9f30" +checksum = "f025837c309cd69ffef572750b4a2257b59552c5399a5e49707cc5b1b85d1c73" dependencies = [ "log", "symphonia-core", @@ -7962,9 +7982,9 @@ dependencies = [ [[package]] name = "symphonia-core" -version = "0.5.4" +version = "0.5.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "798306779e3dc7d5231bd5691f5a813496dc79d3f56bf82e25789f2094e022c3" +checksum = "ea00cc4f79b7f6bb7ff87eddc065a1066f3a43fe1875979056672c9ef948c2af" dependencies = [ "arrayvec", "bitflags 1.3.2", @@ -7975,9 +7995,9 @@ dependencies = [ [[package]] name = "symphonia-format-isomp4" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abfdf178d697e50ce1e5d9b982ba1b94c47218e03ec35022d9f0e071a16dc844" +checksum = "243739585d11f81daf8dac8d9f3d18cc7898f6c09a259675fc364b382c30e0a5" dependencies = [ "encoding_rs", "log", @@ -7988,9 +8008,9 @@ dependencies = [ [[package]] name = "symphonia-format-ogg" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ada3505789516bcf00fc1157c67729eded428b455c27ca370e41f4d785bfa931" +checksum = "2b4955c67c1ed3aa8ae8428d04ca8397fbef6a19b2b051e73b5da8b1435639cb" dependencies = [ "log", "symphonia-core", @@ -8000,9 +8020,9 @@ dependencies = [ [[package]] name = "symphonia-format-riff" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05f7be232f962f937f4b7115cbe62c330929345434c834359425e043bfd15f50" +checksum = "c2d7c3df0e7d94efb68401d81906eae73c02b40d5ec1a141962c592d0f11a96f" dependencies = [ "extended", "log", @@ -8012,9 +8032,9 @@ dependencies = [ [[package]] name = "symphonia-metadata" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc622b9841a10089c5b18e99eb904f4341615d5aa55bbf4eedde1be721a4023c" +checksum = "36306ff42b9ffe6e5afc99d49e121e0bd62fe79b9db7b9681d48e29fa19e6b16" dependencies = [ "encoding_rs", "lazy_static", @@ -8024,9 +8044,9 @@ dependencies = [ [[package]] name = "symphonia-utils-xiph" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "484472580fa49991afda5f6550ece662237b00c6f562c7d9638d1b086ed010fe" +checksum = "ee27c85ab799a338446b68eec77abf42e1a6f1bb490656e121c6e27bfbab9f16" dependencies = [ "symphonia-core", "symphonia-metadata", @@ -9281,6 +9301,12 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "unty" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" + [[package]] name = "ureq" version = "2.12.1" @@ -9515,6 +9541,12 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "virtue" +version = "0.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" + [[package]] name = "vob" version = "3.0.6" @@ -9730,9 +9762,9 @@ dependencies = [ [[package]] name = "widestring" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd7cf3379ca1aac9eea11fba24fd7e315d621f8dfe35c8d7d2be8b793726e07d" +checksum = "72069c3113ab32ab29e5584db3c6ec55d416895e60715417b5b883a357c3e471" [[package]] name = "winapi" diff --git a/Cargo.toml b/Cargo.toml index 256248beb4..e9c3dc8376 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -63,6 +63,7 @@ chrono = { version = "0.4", 
default-features = false, features = [ "now", "serde", ] } +cudarc = { version = "0.17.1", features = ["cuda-12020"] } derive_builder = { version = "0.20" } derive-getters = { version = "0.5" } either = { version = "1.13", features = ["serde"] } diff --git a/lib/bindings/python/Cargo.lock b/lib/bindings/python/Cargo.lock index 4586591b15..7576ead2c0 100644 --- a/lib/bindings/python/Cargo.lock +++ b/lib/bindings/python/Cargo.lock @@ -515,6 +515,26 @@ dependencies = [ "serde", ] +[[package]] +name = "bincode" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" +dependencies = [ + "bincode_derive", + "serde", + "unty", +] + +[[package]] +name = "bincode_derive" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" +dependencies = [ + "virtue", +] + [[package]] name = "bindgen" version = "0.69.5" @@ -1103,15 +1123,6 @@ dependencies = [ "typenum", ] -[[package]] -name = "cudarc" -version = "0.16.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17200eb07e7d85a243aa1bf4569a7aa998385ba98d14833973a817a63cc86e92" -dependencies = [ - "libloading", -] - [[package]] name = "cudarc" version = "0.17.2" @@ -1452,6 +1463,15 @@ dependencies = [ "uuid", ] +[[package]] +name = "dynamo-kvbm-kernels" +version = "0.6.0" +dependencies = [ + "cc", + "cudarc", + "once_cell", +] + [[package]] name = "dynamo-llm" version = "0.6.0" @@ -1459,6 +1479,7 @@ dependencies = [ "ahash", "aho-corasick", "akin", + "aligned-vec", "anyhow", "async-nats", "async-stream", @@ -1466,7 +1487,7 @@ dependencies = [ "async_zmq", "axum", "axum-server", - "bincode", + "bincode 2.0.1", "bitflags 2.9.3", "blake3", "bs62", @@ -1474,12 +1495,13 @@ dependencies = [ "bytes", "candle-core", "chrono", - "cudarc 0.17.2", + "cudarc", "dashmap", "derive-getters", "derive_builder", "dialoguer", "dynamo-async-openai", + "dynamo-kvbm-kernels", "dynamo-parsers", "dynamo-runtime", "either", @@ -1560,7 +1582,7 @@ dependencies = [ "anyhow", "async-stream", "async-trait", - "cudarc 0.16.6", + "cudarc", "derive-getters", "dlpark", "dynamo-async-openai", @@ -1602,7 +1624,7 @@ dependencies = [ "async-trait", "async_zmq", "axum", - "bincode", + "bincode 1.3.3", "blake3", "bytes", "chrono", @@ -6825,6 +6847,12 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "unty" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" + [[package]] name = "ureq" version = "2.12.1" @@ -6981,6 +7009,12 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "virtue" +version = "0.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" + [[package]] name = "walkdir" version = "2.5.0" diff --git a/lib/bindings/python/Cargo.toml b/lib/bindings/python/Cargo.toml index bb978a37ee..459663c5af 100644 --- a/lib/bindings/python/Cargo.toml +++ b/lib/bindings/python/Cargo.toml @@ -73,7 +73,7 @@ pyo3-async-runtimes = { version = "0.23.0", default-features = false, features = 
pythonize = "0.23" dlpark = { version = "0.5", features = ["pyo3", "half"], optional = true } -cudarc = { version = "0.16.2", features = ["cuda-12020"], optional = true } +cudarc = { version = "0.17.1", features = ["cuda-12020"], optional = true } prometheus = "0.14.0" diff --git a/lib/llm/Cargo.toml b/lib/llm/Cargo.toml index d74056150b..cebf062be6 100644 --- a/lib/llm/Cargo.toml +++ b/lib/llm/Cargo.toml @@ -21,7 +21,7 @@ testing-full = ["testing-cuda", "testing-nixl"] testing-cuda = ["dep:cudarc"] testing-nixl = ["dep:nixl-sys"] testing-etcd = [] -block-manager = ["dep:nixl-sys", "dep:cudarc", "dep:ndarray", "dep:nix"] +block-manager = ["dep:nixl-sys", "dep:cudarc", "dep:ndarray", "dep:nix", "dep:aligned-vec"] cuda = ["dep:cudarc"] integration = ["dynamo-runtime/integration"] @@ -85,7 +85,7 @@ offset-allocator = "0.2" regex = "1" rayon = "1" dashmap = { version = "5.5.3" } -bincode = "1" +bincode = { version = "2.0.1", features = ["serde", "derive"] } # input/text dialoguer = { version = "0.11", default-features = false, features = [ @@ -94,11 +94,13 @@ dialoguer = { version = "0.11", default-features = false, features = [ ] } # block_manager +aligned-vec = { version = "0.6.4", optional = true } nixl-sys = { version = "=0.6.0", optional = true } -cudarc = { version = "0.17.1", features = ["cuda-12020"], optional = true } +cudarc = { workspace = true, optional = true } ndarray = { version = "0.16", optional = true } nix = { version = "0.26", optional = true } + # protocols unicode-segmentation = "1.12" @@ -163,7 +165,7 @@ insta = { version = "1.41", features = [ "redactions", "filters", ] } -aligned-vec = "0.6.4" + lazy_static = "1.4" [build-dependencies] diff --git a/lib/llm/benches/transfer_context_v2.rs b/lib/llm/benches/transfer_context_v2.rs index 22a76b01cc..69e3d20f8e 100644 --- a/lib/llm/benches/transfer_context_v2.rs +++ b/lib/llm/benches/transfer_context_v2.rs @@ -7,7 +7,7 @@ mod benchmarks { use criterion::{BenchmarkId, Criterion, criterion_group}; use cudarc::driver::{CudaContext, CudaStream}; - use nixl_sys; + use tokio::runtime::Runtime; use tokio_util::task::TaskTracker; diff --git a/lib/llm/src/block_manager.rs b/lib/llm/src/block_manager.rs index 0670f95e2d..edab3e71ec 100644 --- a/lib/llm/src/block_manager.rs +++ b/lib/llm/src/block_manager.rs @@ -20,6 +20,7 @@ pub mod numa_allocator; pub mod offload; pub mod pool; pub mod storage; +pub mod v2; // dynamo rt integration pub mod controller; @@ -326,18 +327,6 @@ mod tests { .unwrap() } - pub async fn create_reference_block_manager_with_counts( - device: usize, - host: usize, - disk: usize, - ) -> ReferenceBlockManager { - ReferenceBlockManager::new(create_reference_block_manager_config_with_counts( - device, host, disk, - )) - .await - .unwrap() - } - #[tokio::test] async fn test_reference_block_manager_inherited_async_runtime() { dynamo_runtime::logging::init(); diff --git a/lib/llm/src/block_manager/block/transfer/context.rs b/lib/llm/src/block_manager/block/transfer/context.rs index 36ad83a4c0..7d8e97c340 100644 --- a/lib/llm/src/block_manager/block/transfer/context.rs +++ b/lib/llm/src/block_manager/block/transfer/context.rs @@ -563,11 +563,11 @@ pub mod v2 { tracker.spawn(async move { let event = ctx_clone .record_event() - .expect(&format!("Failed to record event {}", i)); + .unwrap_or_else(|_| panic!("Failed to record event {}", i)); event .synchronize() .await - .expect(&format!("Failed to sync event {}", i)); + .unwrap_or_else(|_| panic!("Failed to sync event {}", i)); }); } @@ -575,26 +575,6 @@ pub mod v2 { 
tracker.wait().await; } - #[tokio::test] - async fn test_performance_baseline() { - let ctx = setup_context(); - let start = std::time::Instant::now(); - - // Test a reasonable number of synchronizations - for _ in 0..10 { - let event = ctx.record_event().expect("Failed to record event"); - event.synchronize().await.expect("Sync failed"); - } - - let duration = start.elapsed(); - // Should complete 10 synchronizations in reasonable time (< 1ms total) - assert!( - duration < std::time::Duration::from_millis(1), - "Performance regression: took {:?} for 10 syncs", - duration - ); - } - #[tokio::test] async fn test_error_handling() { let ctx = setup_context(); diff --git a/lib/llm/src/block_manager/distributed/worker.rs b/lib/llm/src/block_manager/distributed/worker.rs index 8b3890e0d2..2fc927db2c 100644 --- a/lib/llm/src/block_manager/distributed/worker.rs +++ b/lib/llm/src/block_manager/distributed/worker.rs @@ -185,10 +185,13 @@ struct WorkerMetadataHandler { #[async_trait] impl Handler for WorkerMetadataHandler { async fn handle(&self, mut message: MessageHandle) -> anyhow::Result<()> { - let payload = bincode::serialize(&WorkerMetadata { - num_device_blocks: self.num_device_blocks, - bytes_per_block: self.bytes_per_block, - })?; + let payload = bincode::serde::encode_to_vec( + &WorkerMetadata { + num_device_blocks: self.num_device_blocks, + bytes_per_block: self.bytes_per_block, + }, + bincode::config::standard(), + )?; message .reply(ZMQ_WORKER_METADATA_MESSAGE, &[payload]) .await?; @@ -226,8 +229,11 @@ impl Handler for LeaderMetadataHandler { ); return Ok(()); } - let leader_meta: LeaderMetadata = match bincode::deserialize(&message.data[0]) { - Ok(m) => m, + let leader_meta: LeaderMetadata = match bincode::serde::decode_from_slice( + &message.data[0], + bincode::config::standard(), + ) { + Ok((m, _)) => m, Err(e) => { tracing::error!("leader_metadata: bad payload: {e:#}"); return Ok(()); diff --git a/lib/llm/src/block_manager/distributed/zmq.rs b/lib/llm/src/block_manager/distributed/zmq.rs index d2e19322fd..5a48bb5f3d 100644 --- a/lib/llm/src/block_manager/distributed/zmq.rs +++ b/lib/llm/src/block_manager/distributed/zmq.rs @@ -166,14 +166,18 @@ impl ZmqActiveMessageLeader { } }; - let workers: Vec = workers_payloads - .into_iter() - .map(|b| bincode::deserialize::(&b)) - .collect::>()?; + let mut workers: Vec = Vec::with_capacity(workers_payloads.len()); + + for payload in workers_payloads { + let worker: WorkerMetadata = + bincode::serde::decode_from_slice(&payload, bincode::config::standard())?.0; + workers.push(worker); + } // 2) Compute & broadcast LeaderMetadata; wait for ALL acks in the SAME round. 
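+        // bincode 2.x note: serde types are encoded/decoded via bincode::serde::encode_to_vec
+        // and decode_from_slice, and an explicit config (standard() here) must be passed;
+        // decode_from_slice also returns the number of bytes consumed alongside the value.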
let leader_meta = make_leader_meta(&workers); - let leader_meta_bytes = bincode::serialize(&leader_meta)?; + let leader_meta_bytes = + bincode::serde::encode_to_vec(&leader_meta, bincode::config::standard())?; loop { if Instant::now() >= deadline { diff --git a/lib/llm/src/block_manager/offload.rs b/lib/llm/src/block_manager/offload.rs index 57f3553bf2..1ad258d002 100644 --- a/lib/llm/src/block_manager/offload.rs +++ b/lib/llm/src/block_manager/offload.rs @@ -693,7 +693,7 @@ impl OffloadFiltersBuilder { } } -#[cfg(all(test, feature = "testing-cuda"))] +#[cfg(all(test, feature = "testing-cuda", feature = "testing-nixl"))] mod tests { use super::*; @@ -713,8 +713,7 @@ mod tests { use nixl_sys::{MemoryRegion, NixlDescriptor}; use aligned_vec::avec; - use cudarc::runtime::sys::{cudaMemcpy, cudaMemcpyKind, cudaMemset}; - use prometheus::Registry; + use cudarc::runtime::sys::{cudaDeviceSynchronize, cudaMemcpy, cudaMemcpyKind, cudaMemset}; use rstest::*; use std::fs::File; use std::io::{Read, Seek, SeekFrom, Write}; @@ -1286,6 +1285,8 @@ mod tests { // Check that this is the same block. check_block_contents(&immutable_host_block, &device_blocks[0], 42)?; + tokio::time::sleep(std::time::Duration::from_millis(100)).await; + Ok(()) } diff --git a/lib/llm/src/block_manager/v2.rs b/lib/llm/src/block_manager/v2.rs new file mode 100644 index 0000000000..51eb8e2a8f --- /dev/null +++ b/lib/llm/src/block_manager/v2.rs @@ -0,0 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +pub mod kernels; +pub mod memory; +pub mod physical; diff --git a/lib/llm/src/block_manager/v2/kernels/mod.rs b/lib/llm/src/block_manager/v2/kernels/mod.rs new file mode 100644 index 0000000000..5db3a820ee --- /dev/null +++ b/lib/llm/src/block_manager/v2/kernels/mod.rs @@ -0,0 +1,56 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Safe-ish wrappers around the CUDA block/universal packing kernels. +//! +//! The core ideas: +//! * A “block” represents the stack of `nl * no` tensors arranged either as NHD +//! (inner axes `[nt, nh, hd]`) or HND (inner axes `[nh, nt, hd]`). +//! * A “universal” tensor is `[nh, nl, no, nt, hd]` stored contiguously. +//! * An “operational” tensor is `[nl, no, inner]` with `inner = nt * nh * hd`. +//! +//! Host code calls these helpers with flattened pointer tables so a single +//! launch can move many logical blocks in one go. + +#![allow(dead_code)] +#![allow(clippy::missing_safety_doc)] + +/// Numeric tags passed across the FFI boundary to select the CUDA template. +#[repr(i32)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum TensorDataType { + F16 = 0, + BF16 = 1, + F32 = 2, + F64 = 3, +} + +/// Identifies how each `[nt, nh, hd]` chunk is laid out in device memory. +#[repr(i32)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum BlockLayout { + NHD = 0, + HND = 1, +} + +/// Direction flag for copying between block stacks and operational buffers. +#[repr(i32)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum OperationalCopyDirection { + BlockToOperational = 0, + OperationalToBlock = 1, +} + +/// Selects how the operational copy should move data. +#[repr(i32)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum OperationalCopyBackend { + /// Try cudaMemcpyBatchAsync, fall back to cudaMemcpyAsync, then the kernel. + Auto = 0, + /// Force the custom CUDA kernel path. 
+ KernelOnly = 1, + /// Issue one cudaMemcpyAsync per chunk. + MemcpyAsync = 2, + /// Invoke cudaMemcpyBatchAsync directly. + MemcpyBatch = 3, +} diff --git a/lib/llm/src/block_manager/v2/memory/actions.rs b/lib/llm/src/block_manager/v2/memory/actions.rs new file mode 100644 index 0000000000..98fdbce61b --- /dev/null +++ b/lib/llm/src/block_manager/v2/memory/actions.rs @@ -0,0 +1,221 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Storage actions. + +use super::{MemoryRegion, StorageError}; + +/// Extension trait for storage types that support memory setting operations +pub trait Memset: MemoryRegion { + /// Sets a region of memory to a specific value + /// + /// # Arguments + /// * `value` - The value to set (will be truncated to u8) + /// * `offset` - Offset in bytes from the start of the storage + /// * `size` - Number of bytes to set + /// + /// # Safety + /// The caller must ensure: + /// - offset + size <= self.size() + /// - No other references exist to the memory region being set + fn memset(&mut self, value: u8, offset: usize, size: usize) -> Result<(), StorageError>; +} + +/// Extension trait for storage types that support slicing operations +pub trait Slice { + /// Returns an immutable byte slice view of the entire storage region + /// + /// # Safety + /// The caller must ensure: + /// - The memory region is valid and initialized + /// - No concurrent mutable access occurs while the slice is in use + fn as_slice(&self) -> Result<&[u8], StorageError>; + + /// Returns an immutable byte slice view of a subregion + /// + /// # Arguments + /// * `offset` - Offset in bytes from the start of the storage + /// * `len` - Number of bytes to slice + /// + /// # Safety + /// The caller must ensure: + /// - offset + len <= self.size() + /// - The memory region is valid and initialized + /// - No concurrent mutable access occurs while the slice is in use + fn slice(&self, offset: usize, len: usize) -> Result<&[u8], StorageError> { + let slice = self.as_slice()?; + + // validate offset and len + if offset.saturating_add(len) > slice.len() { + return Err(StorageError::Unsupported("slice out of bounds".into())); + } + + slice + .get(offset..offset.saturating_add(len)) + .ok_or_else(|| StorageError::Unsupported("slice out of bounds".into())) + } + + /// Returns a typed immutable slice view of the entire storage region + /// + /// # Safety + /// The caller must ensure: + /// - The memory region is valid and initialized + /// - The memory is properly aligned for type T + /// - The size is a multiple of `size_of::()` + /// - No concurrent mutable access occurs while the slice is in use + /// - The data represents valid values of type T + fn as_slice_typed(&self) -> Result<&[T], StorageError> { + let bytes = self.as_slice()?; + let ptr = bytes.as_ptr() as *const T; + let len = bytes.len() / std::mem::size_of::(); + + if !(bytes.as_ptr() as usize).is_multiple_of(std::mem::align_of::()) { + return Err(StorageError::Unsupported(format!( + "memory not aligned for type (required alignment: {})", + std::mem::align_of::() + ))); + } + + if bytes.len() % std::mem::size_of::() != 0 { + return Err(StorageError::Unsupported(format!( + "size {} is not a multiple of type size {}", + bytes.len(), + std::mem::size_of::() + ))); + } + + // SAFETY: Caller guarantees memory is valid, aligned, and properly initialized for T + Ok(unsafe { std::slice::from_raw_parts(ptr, len) }) + } + + /// Returns a typed 
immutable slice view of a subregion + /// + /// # Arguments + /// * `offset` - Offset in bytes from the start of the storage + /// * `len` - Number of elements of type T to slice + /// + /// # Safety + /// The caller must ensure: + /// - offset + (len * size_of::()) <= self.size() + /// - offset is properly aligned for type T + /// - The memory region is valid and initialized + /// - No concurrent mutable access occurs while the slice is in use + /// - The data represents valid values of type T + fn slice_typed(&self, offset: usize, len: usize) -> Result<&[T], StorageError> { + let type_size = std::mem::size_of::(); + let byte_len = len + .checked_mul(type_size) + .ok_or_else(|| StorageError::Unsupported("length overflow".into()))?; + + let bytes = self.slice(offset, byte_len)?; + let ptr = bytes.as_ptr() as *const T; + + if !(bytes.as_ptr() as usize).is_multiple_of(std::mem::align_of::()) { + return Err(StorageError::Unsupported(format!( + "memory not aligned for type (required alignment: {})", + std::mem::align_of::() + ))); + } + + // SAFETY: Caller guarantees memory is valid, aligned, and properly initialized for T + Ok(unsafe { std::slice::from_raw_parts(ptr, len) }) + } +} + +pub trait SliceMut { + /// Returns a mutable byte slice view of the entire storage region + /// + /// # Safety + /// The caller must ensure: + /// - The memory region is valid + /// - No other references (mutable or immutable) exist to this memory region + fn as_slice_mut(&mut self) -> Result<&mut [u8], StorageError>; + + /// Returns a mutable byte slice view of a subregion + /// + /// # Arguments + /// * `offset` - Offset in bytes from the start of the storage + /// * `len` - Number of bytes to slice + /// + /// # Safety + /// The caller must ensure: + /// - offset + len <= self.size() + /// - The memory region is valid + /// - No other references (mutable or immutable) exist to this memory region + fn slice_mut(&mut self, offset: usize, len: usize) -> Result<&mut [u8], StorageError> { + let slice = self.as_slice_mut()?; + + // validate offset and len + if offset.saturating_add(len) > slice.len() { + return Err(StorageError::Unsupported("slice out of bounds".into())); + } + + slice + .get_mut(offset..offset.saturating_add(len)) + .ok_or_else(|| StorageError::Unsupported("slice out of bounds".into())) + } + + /// Returns a typed mutable slice view of the entire storage region + /// + /// # Safety + /// The caller must ensure: + /// - The memory region is valid + /// - The memory is properly aligned for type T + /// - The size is a multiple of `size_of::()` + /// - No other references (mutable or immutable) exist to this memory region + fn as_slice_typed_mut(&mut self) -> Result<&mut [T], StorageError> { + let bytes = self.as_slice_mut()?; + let ptr = bytes.as_mut_ptr() as *mut T; + let len = bytes.len() / std::mem::size_of::(); + + if !(bytes.as_ptr() as usize).is_multiple_of(std::mem::align_of::()) { + return Err(StorageError::Unsupported(format!( + "memory not aligned for type (required alignment: {})", + std::mem::align_of::() + ))); + } + + if bytes.len() % std::mem::size_of::() != 0 { + return Err(StorageError::Unsupported(format!( + "size {} is not a multiple of type size {}", + bytes.len(), + std::mem::size_of::() + ))); + } + + // SAFETY: Caller guarantees memory is valid, aligned, and no aliasing + Ok(unsafe { std::slice::from_raw_parts_mut(ptr, len) }) + } + + /// Returns a typed mutable slice view of a subregion + /// + /// # Arguments + /// * `offset` - Offset in bytes from the start of the storage + 
/// * `len` - Number of elements of type T to slice + /// + /// # Safety + /// The caller must ensure: + /// - offset + (len * size_of::()) <= self.size() + /// - offset is properly aligned for type T + /// - The memory region is valid + /// - No other references (mutable or immutable) exist to this memory region + fn slice_typed_mut(&mut self, offset: usize, len: usize) -> Result<&mut [T], StorageError> { + let type_size = std::mem::size_of::(); + let byte_len = len + .checked_mul(type_size) + .ok_or_else(|| StorageError::Unsupported("length overflow".into()))?; + + let bytes = self.slice_mut(offset, byte_len)?; + let ptr = bytes.as_mut_ptr() as *mut T; + + if !(bytes.as_ptr() as usize).is_multiple_of(std::mem::align_of::()) { + return Err(StorageError::Unsupported(format!( + "memory not aligned for type (required alignment: {})", + std::mem::align_of::() + ))); + } + + // SAFETY: Caller guarantees memory is valid, aligned, and no aliasing + Ok(unsafe { std::slice::from_raw_parts_mut(ptr, len) }) + } +} diff --git a/lib/llm/src/block_manager/v2/memory/device.rs b/lib/llm/src/block_manager/v2/memory/device.rs new file mode 100644 index 0000000000..6f8a88daa4 --- /dev/null +++ b/lib/llm/src/block_manager/v2/memory/device.rs @@ -0,0 +1,115 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! CUDA device memory storage. + +use super::{MemoryRegion, Result, StorageError, StorageKind}; +use cudarc::driver::CudaContext; +use std::any::Any; +use std::collections::HashMap; +use std::sync::{Arc, Mutex, OnceLock}; + +/// Get or create a CUDA context for the given device. +fn cuda_context(device_id: u32) -> Result> { + static CONTEXTS: OnceLock>>> = OnceLock::new(); + let mut map = CONTEXTS.get_or_init(Default::default).lock().unwrap(); + + if let Some(existing) = map.get(&device_id) { + return Ok(existing.clone()); + } + + let ctx = CudaContext::new(device_id as usize)?; + map.insert(device_id, ctx.clone()); + Ok(ctx) +} + +/// CUDA device memory allocated via cudaMalloc. +#[derive(Debug)] +pub struct DeviceStorage { + ctx: Arc, + ptr: u64, + device_id: u32, + len: usize, +} + +unsafe impl Send for DeviceStorage {} +unsafe impl Sync for DeviceStorage {} + +impl DeviceStorage { + /// Allocate new device memory of the given size. + /// + /// # Arguments + /// * `len` - Size in bytes to allocate + /// * `device_id` - CUDA device on which to allocate + pub fn new(len: usize, device_id: u32) -> Result { + if len == 0 { + return Err(StorageError::AllocationFailed( + "zero-sized allocations are not supported".into(), + )); + } + + let ctx = cuda_context(device_id)?; + ctx.bind_to_thread().map_err(StorageError::Cuda)?; + let ptr = unsafe { cudarc::driver::result::malloc_sync(len).map_err(StorageError::Cuda)? }; + + Ok(Self { + ctx, + ptr, + device_id, + len, + }) + } + + /// Get the device pointer value. + pub fn device_ptr(&self) -> u64 { + self.ptr + } + + /// Get the CUDA device ID this memory is allocated on. 
+ pub fn device_id(&self) -> u32 { + self.device_id + } +} + +impl Drop for DeviceStorage { + fn drop(&mut self) { + if let Err(e) = self.ctx.bind_to_thread() { + tracing::debug!("failed to bind CUDA context for free: {e}"); + } + unsafe { + if let Err(e) = cudarc::driver::result::free_sync(self.ptr) { + tracing::debug!("failed to free device memory: {e}"); + } + }; + } +} + +impl MemoryRegion for DeviceStorage { + fn addr(&self) -> usize { + self.device_ptr() as usize + } + + fn size(&self) -> usize { + self.len + } + + fn storage_kind(&self) -> StorageKind { + StorageKind::Device(self.device_id) + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +// Support for NIXL registration +impl super::registered::NixlCompatible for DeviceStorage { + fn nixl_params(&self) -> (*const u8, usize, nixl_sys::MemType, u64) { + ( + self.ptr as *const u8, + self.len, + nixl_sys::MemType::Vram, + self.device_id as u64, + ) + } +} diff --git a/lib/llm/src/block_manager/v2/memory/disk.rs b/lib/llm/src/block_manager/v2/memory/disk.rs new file mode 100644 index 0000000000..a0ce440746 --- /dev/null +++ b/lib/llm/src/block_manager/v2/memory/disk.rs @@ -0,0 +1,362 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Disk-backed memory storage using memory-mapped files. + +use super::{MemoryRegion, Result, StorageError, StorageKind}; +use std::any::Any; +use std::path::{Path, PathBuf}; + +use core::ffi::c_char; +use nix::fcntl::{FallocateFlags, fallocate}; +use nix::unistd::unlink; +use std::ffi::CString; + +const DISK_CACHE_KEY: &str = "DYN_KVBM_DISK_CACHE_DIR"; +const DEFAULT_DISK_CACHE_DIR: &str = "/tmp/"; + +#[derive(Debug)] +pub struct DiskStorage { + fd: u64, + path: PathBuf, + size: usize, + unlinked: bool, +} + +impl DiskStorage { + pub fn new(size: usize) -> Result { + // We need to open our file with some special flags that aren't supported by the tempfile crate. + // Instead, we'll use the mkostemp function to create a temporary file with the correct flags. 
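+        // mkostemp replaces the trailing "XXXXXX" in the template with a unique suffix and
+        // opens the file; O_DIRECT is requested because GDS-style transfers need unbuffered,
+        // page-aligned I/O against the backing file.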
+ + let specified_dir = + std::env::var(DISK_CACHE_KEY).unwrap_or_else(|_| DEFAULT_DISK_CACHE_DIR.to_string()); + let file_path = Path::new(&specified_dir).join("dynamo-kvbm-disk-cache-XXXXXX"); + + Self::new_at(file_path, size) + } + + pub fn new_at(path: impl AsRef, len: usize) -> Result { + if len == 0 { + return Err(StorageError::AllocationFailed( + "zero-sized allocations are not supported".into(), + )); + } + + let file_path = path.as_ref().to_path_buf(); + + if !file_path.exists() { + std::fs::create_dir_all(file_path.parent().unwrap()).unwrap(); + } + + tracing::debug!("Allocating disk cache file at {}", file_path.display()); + + let path_str = file_path.to_str().unwrap(); + let is_template = path_str.contains("XXXXXX"); + + let (raw_fd, actual_path) = if is_template { + // Template path - use mkostemp to generate unique filename + let template = CString::new(path_str).unwrap(); + let mut template_bytes = template.into_bytes_with_nul(); + + let fd = unsafe { + nix::libc::mkostemp( + template_bytes.as_mut_ptr() as *mut c_char, + nix::libc::O_RDWR | nix::libc::O_DIRECT, + ) + }; + + if fd == -1 { + return Err(StorageError::AllocationFailed(format!( + "mkostemp failed: {}", + std::io::Error::last_os_error() + ))); + } + + // Extract the actual path created by mkostemp + let actual = PathBuf::from( + CString::from_vec_with_nul(template_bytes) + .unwrap() + .to_str() + .unwrap(), + ); + + (fd, actual) + } else { + // Specific path - use open with O_CREAT + let path_cstr = CString::new(path_str).unwrap(); + let fd = unsafe { + nix::libc::open( + path_cstr.as_ptr(), + nix::libc::O_CREAT | nix::libc::O_RDWR | nix::libc::O_DIRECT, + 0o644, + ) + }; + + if fd == -1 { + return Err(StorageError::AllocationFailed(format!( + "open failed: {}", + std::io::Error::last_os_error() + ))); + } + + (fd, file_path) + }; + + // We need to use fallocate to actually allocate the storage and create the blocks on disk. + fallocate(raw_fd, FallocateFlags::empty(), 0, len as i64).map_err(|e| { + StorageError::AllocationFailed(format!("Failed to allocate temp file: {}", e)) + })?; + + Ok(Self { + fd: raw_fd as u64, + path: actual_path, + size: len, + unlinked: false, + }) + } + + pub fn fd(&self) -> u64 { + self.fd + } + + pub fn path(&self) -> &Path { + self.path.as_path() + } + + /// Unlink our temp file. + /// This means that when this process terminates, the file will be automatically deleted by the OS. + /// Unfortunately, GDS requires that files we try to register must be linked. + /// To get around this, we unlink the file only after we've registered it with NIXL. 
+ pub fn unlink(&mut self) -> Result<()> { + if self.unlinked { + return Ok(()); + } + + unlink(self.path.as_path()) + .map_err(|e| StorageError::AllocationFailed(format!("Failed to unlink file: {}", e)))?; + self.unlinked = true; + Ok(()) + } + + pub fn unlinked(&self) -> bool { + self.unlinked + } +} + +impl Drop for DiskStorage { + fn drop(&mut self) { + let _ = self.unlink(); + } +} + +impl MemoryRegion for DiskStorage { + fn addr(&self) -> usize { + 0 + } + + fn size(&self) -> usize { + self.size + } + + fn storage_kind(&self) -> StorageKind { + StorageKind::Disk(self.fd) + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +// Support for NIXL registration +impl super::registered::NixlCompatible for DiskStorage { + fn nixl_params(&self) -> (*const u8, usize, nixl_sys::MemType, u64) { + #[cfg(unix)] + { + // Use file descriptor as device_id for MemType::File + ( + std::ptr::null(), + self.size, + nixl_sys::MemType::File, + self.fd, + ) + } + + #[cfg(not(unix))] + { + // On non-Unix systems, we can't get the file descriptor easily + // Return device_id as 0 - registration will fail on these systems + ( + self.mmap.as_ptr(), + self.mmap.len(), + nixl_sys::MemType::File, + 0, + ) + } + } +} + +// mod mmap { +// use super::*; + +// #[cfg(unix)] +// use std::os::unix::io::AsRawFd; + +// use memmap2::{MmapMut, MmapOptions}; +// use std::fs::{File, OpenOptions}; +// use tempfile::NamedTempFile; + +// /// Disk-backed storage using memory-mapped files. +// #[derive(Debug)] +// pub struct MemMappedFileStorage { +// _file: File, // Keep file alive for the lifetime of the mmap +// mmap: MmapMut, +// path: PathBuf, +// #[cfg(unix)] +// fd: i32, +// } + +// unsafe impl Send for MemMappedFileStorage {} +// unsafe impl Sync for MemMappedFileStorage {} + +// impl MemMappedFileStorage { +// /// Create new disk storage with a temporary file. +// pub fn new_temp(len: usize) -> Result { +// if len == 0 { +// return Err(StorageError::AllocationFailed( +// "zero-sized allocations are not supported".into(), +// )); +// } + +// // Create temporary file +// let temp_file = NamedTempFile::new()?; +// let path = temp_file.path().to_path_buf(); +// let file = temp_file.into_file(); + +// // Set file size +// file.set_len(len as u64)?; + +// #[cfg(unix)] +// let fd = file.as_raw_fd(); + +// // Memory map the file +// let mmap = unsafe { MmapOptions::new().len(len).map_mut(&file)? }; + +// Ok(Self { +// _file: file, +// mmap, +// path, +// #[cfg(unix)] +// fd, +// }) +// } + +// /// Create new disk storage with a specific file path. +// pub fn new_at(path: impl AsRef, len: usize) -> Result { +// if len == 0 { +// return Err(StorageError::AllocationFailed( +// "zero-sized allocations are not supported".into(), +// )); +// } + +// let path = path.as_ref().to_path_buf(); + +// // Create or open file +// let file = OpenOptions::new() +// .read(true) +// .write(true) +// .create(true) +// .open(&path)?; + +// // Set file size +// file.set_len(len as u64)?; + +// #[cfg(unix)] +// let fd = file.as_raw_fd(); + +// // Memory map the file +// let mmap = unsafe { MmapOptions::new().len(len).map_mut(&file)? }; + +// Ok(Self { +// _file: file, +// mmap, +// path, +// #[cfg(unix)] +// fd, +// }) +// } + +// /// Get the path to the backing file. +// pub fn path(&self) -> &Path { +// &self.path +// } + +// /// Get the file descriptor (Unix only). +// #[cfg(unix)] +// pub fn fd(&self) -> i32 { +// self.fd +// } + +// /// Get a pointer to the memory-mapped region. 
+// /// +// /// # Safety +// /// The caller must ensure the pointer is not used after this storage is dropped. +// pub unsafe fn as_ptr(&self) -> *const u8 { +// self.mmap.as_ptr() +// } + +// /// Get a mutable pointer to the memory-mapped region. +// /// +// /// # Safety +// /// The caller must ensure the pointer is not used after this storage is dropped +// /// and that there are no other references to this memory. +// pub unsafe fn as_mut_ptr(&mut self) -> *mut u8 { +// self.mmap.as_mut_ptr() +// } +// } + +// impl MemoryRegion for MemMappedFileStorage { +// fn addr(&self) -> usize { +// self.mmap.as_ptr() as usize +// } + +// fn size(&self) -> usize { +// self.mmap.len() +// } + +// fn storage_kind(&self) -> StorageKind { +// StorageKind::Disk +// } + +// fn as_any(&self) -> &dyn Any { +// self +// } +// } + +// // Support for NIXL registration +// impl super::super::registered::NixlCompatible for MemMappedFileStorage { +// fn nixl_params(&self) -> (*const u8, usize, nixl_sys::MemType, u64) { +// #[cfg(unix)] +// { +// // Use file descriptor as device_id for MemType::File +// ( +// self.mmap.as_ptr(), +// self.mmap.len(), +// nixl_sys::MemType::File, +// self.fd as u64, +// ) +// } + +// #[cfg(not(unix))] +// { +// // On non-Unix systems, we can't get the file descriptor easily +// // Return device_id as 0 - registration will fail on these systems +// ( +// self.mmap.as_ptr(), +// self.mmap.len(), +// nixl_sys::MemType::File, +// 0, +// ) +// } +// } +// } +// } diff --git a/lib/llm/src/block_manager/v2/memory/mod.rs b/lib/llm/src/block_manager/v2/memory/mod.rs new file mode 100644 index 0000000000..b9ae29358e --- /dev/null +++ b/lib/llm/src/block_manager/v2/memory/mod.rs @@ -0,0 +1,206 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Clean, minimal storage API for v2 block manager. +//! +//! This module provides a simplified storage abstraction with: +//! - Single trait for type erasure (`MemoryRegion`) +//! - Concrete storage types (no trait implementations required) +//! - Composition-based NIXL registration via `NixlRegistered` wrapper +//! - RAII with proper drop ordering (registration handle drops before memory) + +pub mod actions; + +mod device; +mod disk; +mod pinned; +mod registered; +mod system; +mod torch; + +#[cfg(test)] +mod tests; + +pub use device::DeviceStorage; +pub use disk::DiskStorage; +pub use pinned::PinnedStorage; +pub use registered::{ + NixlCompatible, NixlDescriptor, NixlRegistered, RegisteredView, register_with_nixl, +}; +pub use system::SystemStorage; +pub use torch::{TorchDevice, TorchTensor}; + +use serde::{Deserialize, Serialize}; +use std::any::Any; +use std::fmt; +use std::sync::Arc; +use thiserror::Error; + +/// Result type for storage operations. +pub type Result = std::result::Result; + +/// Errors that can occur during storage operations. +#[derive(Debug, Error)] +pub enum StorageError { + #[error("allocation failed: {0}")] + AllocationFailed(String), + + #[error("registration failed: {0}")] + RegistrationFailed(String), + + #[error("operation failed: {0}")] + OperationFailed(String), + + #[error("unsupported operation: {0}")] + Unsupported(String), + + #[error("I/O error: {0}")] + Io(#[from] std::io::Error), + + // #[cfg(feature = "cuda")] + #[error("CUDA error: {0}")] + Cuda(#[from] cudarc::driver::DriverError), + + #[error("NIXL error: {0}")] + Nixl(#[from] nixl_sys::NixlError), +} + +/// Storage type classification. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum StorageKind { + /// System memory (malloc) + System, + + /// CUDA pinned host memory + // #[cfg(feature = "cuda")] + Pinned, + + /// CUDA device memory with device ID + // #[cfg(feature = "cuda")] + Device(u32), + + /// Disk-backed memory (mmap) + Disk(u64), +} + +/// Core trait for memory regions that can be type-erased. +/// +/// This is the only trait in the storage API. Concrete storage types +/// implement this trait to enable type erasure via `Arc`. +pub trait MemoryRegion: Send + Sync + fmt::Debug { + /// Base address of the memory region. + fn addr(&self) -> usize; + + /// Size of the memory region in bytes. + fn size(&self) -> usize; + + /// Type of storage backing this region. + fn storage_kind(&self) -> StorageKind; + + /// Enable downcasting to concrete type. + fn as_any(&self) -> &dyn Any; + + /// Get the NIXL descriptor for this memory region. + fn nixl_descriptor(&self) -> Option { + None + } +} + +/// Type-erased memory region for use in layouts. +pub type OwnedMemoryRegion = Arc; + +/// Helper function to convert concrete storage to type-erased form. +pub fn erase_storage(storage: S) -> OwnedMemoryRegion { + Arc::new(storage) +} + +/// Simple memory region descriptor. +#[derive(Debug)] +pub struct OffsetMemoryRegion { + base: OwnedMemoryRegion, + offset: usize, + len: usize, +} + +impl OffsetMemoryRegion { + /// Create a new offset view into an existing memory region. + /// + /// Returns an error if the offset and length exceed the bounds of the base region. + pub fn new(base: OwnedMemoryRegion, offset: usize, len: usize) -> Result { + let end = offset + .checked_add(len) + .ok_or_else(|| StorageError::Unsupported("offset overflow".into()))?; + if end > base.size() { + return Err(StorageError::Unsupported( + "offset region exceeds base allocation bounds".into(), + )); + } + Ok(Self { base, offset, len }) + } + + /// Get the offset relative to the base mapping. + pub fn offset(&self) -> usize { + self.offset + } + + /// Get the length of the offset region. + pub fn len(&self) -> usize { + self.len + } + + /// Check if the offset region is empty. + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Access the underlying base region. + pub fn base(&self) -> &OwnedMemoryRegion { + &self.base + } +} + +impl MemoryRegion for OffsetMemoryRegion { + fn addr(&self) -> usize { + self.base.addr() + self.offset + } + + fn size(&self) -> usize { + self.len + } + + fn storage_kind(&self) -> StorageKind { + self.base.storage_kind() + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub struct MemoryDescriptor { + pub addr: usize, + pub size: usize, +} + +impl MemoryDescriptor { + pub fn new(addr: usize, size: usize) -> Self { + Self { addr, size } + } + + #[inline] + pub fn addr(&self) -> usize { + self.addr + } + + #[inline] + pub fn size(&self) -> usize { + self.size + } +} + +impl actions::Slice for MemoryDescriptor { + fn as_slice(&self) -> Result<&[u8]> { + Ok(unsafe { std::slice::from_raw_parts(self.addr as *const u8, self.size) }) + } +} diff --git a/lib/llm/src/block_manager/v2/memory/pinned.rs b/lib/llm/src/block_manager/v2/memory/pinned.rs new file mode 100644 index 0000000000..e6d83174ac --- /dev/null +++ b/lib/llm/src/block_manager/v2/memory/pinned.rs @@ -0,0 +1,139 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +//! CUDA pinned host memory storage. + +use super::{MemoryRegion, Result, StorageError, StorageKind, actions}; +use cudarc::driver::CudaContext; +use cudarc::driver::sys; +use std::any::Any; +use std::collections::HashMap; +use std::sync::{Arc, Mutex, OnceLock}; + +/// Get or create a CUDA context for the given device. +fn cuda_context(device_id: u32) -> Result> { + static CONTEXTS: OnceLock>>> = OnceLock::new(); + let mut map = CONTEXTS.get_or_init(Default::default).lock().unwrap(); + + if let Some(existing) = map.get(&device_id) { + return Ok(existing.clone()); + } + + let ctx = CudaContext::new(device_id as usize)?; + map.insert(device_id, ctx.clone()); + Ok(ctx) +} + +/// CUDA pinned host memory allocated via cudaHostAlloc. +#[derive(Debug)] +pub struct PinnedStorage { + ptr: usize, + len: usize, + ctx: Arc, +} + +unsafe impl Send for PinnedStorage {} +unsafe impl Sync for PinnedStorage {} + +impl PinnedStorage { + /// Allocate new pinned memory of the given size. + /// + /// # Arguments + /// * `len` - Size in bytes to allocate + /// * `device_id` - CUDA device to associate with the allocation + pub fn new(len: usize) -> Result { + if len == 0 { + return Err(StorageError::AllocationFailed( + "zero-sized allocations are not supported".into(), + )); + } + + let ctx = cuda_context(0)?; + let ptr = unsafe { + ctx.bind_to_thread().map_err(StorageError::Cuda)?; + + let ptr = cudarc::driver::result::malloc_host(len, sys::CU_MEMHOSTALLOC_WRITECOMBINED) + .map_err(StorageError::Cuda)?; + + let ptr = ptr as *mut u8; + assert!(!ptr.is_null(), "Failed to allocate pinned memory"); + assert!(ptr.is_aligned(), "Pinned memory is not aligned"); + assert!(len < isize::MAX as usize); + + ptr as usize + }; + + Ok(Self { ptr, len, ctx }) + } + + /// Get a pointer to the underlying memory. + /// + /// # Safety + /// The caller must ensure the pointer is not used after this storage is dropped. + pub unsafe fn as_ptr(&self) -> *const u8 { + self.ptr as *const u8 + } + + /// Get a mutable pointer to the underlying memory. + /// + /// # Safety + /// The caller must ensure the pointer is not used after this storage is dropped + /// and that there are no other references to this memory. 
+ pub unsafe fn as_mut_ptr(&mut self) -> *mut u8 { + self.ptr as *mut u8 + } +} + +impl Drop for PinnedStorage { + fn drop(&mut self) { + if let Err(e) = self.ctx.bind_to_thread() { + tracing::debug!("failed to bind CUDA context for free: {e}"); + } + unsafe { + if let Err(e) = cudarc::driver::result::free_host(self.ptr as _) { + tracing::debug!("failed to free pinned memory: {e}"); + } + }; + } +} + +impl MemoryRegion for PinnedStorage { + fn addr(&self) -> usize { + unsafe { self.as_ptr() as usize } + } + + fn size(&self) -> usize { + self.len + } + + fn storage_kind(&self) -> StorageKind { + StorageKind::Pinned + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +// Support for NIXL registration +impl super::registered::NixlCompatible for PinnedStorage { + fn nixl_params(&self) -> (*const u8, usize, nixl_sys::MemType, u64) { + let ptr = unsafe { self.as_ptr() }; + (ptr, self.len, nixl_sys::MemType::Dram, 0) + } +} + +impl actions::Memset for PinnedStorage { + fn memset(&mut self, value: u8, offset: usize, size: usize) -> Result<()> { + if offset + size > self.len { + return Err(StorageError::OperationFailed( + "memset: offset + size > storage size".into(), + )); + } + unsafe { + let ptr = (self.ptr as *mut u8).add(offset); + std::ptr::write_bytes(ptr, value, size); + } + Ok(()) + } +} diff --git a/lib/llm/src/block_manager/v2/memory/registered.rs b/lib/llm/src/block_manager/v2/memory/registered.rs new file mode 100644 index 0000000000..010088198a --- /dev/null +++ b/lib/llm/src/block_manager/v2/memory/registered.rs @@ -0,0 +1,195 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! NIXL registration wrapper for storage types. + +use super::{MemoryRegion, StorageKind}; +use nixl_sys::{Agent as NixlAgent, MemType, OptArgs, RegistrationHandle}; +use std::any::Any; +use std::fmt; + +/// Trait for storage types that can be registered with NIXL. +pub trait NixlCompatible { + /// Get parameters needed for NIXL registration. + /// + /// Returns (ptr, size, mem_type, device_id) + fn nixl_params(&self) -> (*const u8, usize, MemType, u64); +} + +/// NIXL descriptor containing registration information. +#[derive(Debug, Clone)] +pub struct NixlDescriptor { + pub addr: u64, + pub size: usize, + pub mem_type: MemType, + pub device_id: u64, +} + +impl nixl_sys::MemoryRegion for NixlDescriptor { + unsafe fn as_ptr(&self) -> *const u8 { + self.addr as *const u8 + } + + fn size(&self) -> usize { + self.size + } +} + +impl nixl_sys::NixlDescriptor for NixlDescriptor { + fn mem_type(&self) -> MemType { + self.mem_type + } + + fn device_id(&self) -> u64 { + self.device_id + } +} + +/// View trait for accessing registration information without unwrapping. +pub trait RegisteredView { + /// Get the name of the NIXL agent that registered this memory. + fn agent_name(&self) -> &str; + + /// Get the NIXL descriptor for this registered memory. + fn descriptor(&self) -> NixlDescriptor; +} + +/// Wrapper for storage that has been registered with NIXL. +/// +/// This wrapper ensures proper drop order: the registration handle is +/// dropped before the storage, ensuring deregistration happens before +/// the memory is freed. 
+pub struct NixlRegistered { + storage: S, + handle: Option, + agent_name: String, +} + +impl Drop for NixlRegistered { + fn drop(&mut self) { + // Explicitly drop the registration handle first + drop(self.handle.take()); + // Storage drops naturally after + } +} + +impl fmt::Debug for NixlRegistered { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("NixlRegistered") + .field("storage", &self.storage) + .field("agent_name", &self.agent_name) + .field("handle", &self.handle.is_some()) + .finish() + } +} + +impl MemoryRegion for NixlRegistered { + fn addr(&self) -> usize { + self.storage.addr() + } + + fn size(&self) -> usize { + self.storage.size() + } + + fn storage_kind(&self) -> StorageKind { + self.storage.storage_kind() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn nixl_descriptor(&self) -> Option { + Some(self.descriptor()) + } +} + +impl RegisteredView for NixlRegistered { + fn agent_name(&self) -> &str { + &self.agent_name + } + + fn descriptor(&self) -> NixlDescriptor { + let (ptr, size, mem_type, device_id) = self.storage.nixl_params(); + NixlDescriptor { + addr: ptr as u64, + size, + mem_type, + device_id, + } + } +} + +impl NixlRegistered { + /// Get a reference to the underlying storage. + pub fn storage(&self) -> &S { + &self.storage + } + + /// Get a mutable reference to the underlying storage. + pub fn storage_mut(&mut self) -> &mut S { + &mut self.storage + } + + /// Check if the registration handle is still valid. + pub fn is_registered(&self) -> bool { + self.handle.is_some() + } + + /// Consume this wrapper and return the underlying storage. + /// + /// This will deregister the storage from NIXL. + pub fn into_storage(mut self) -> S { + // Manually drop the handle first + self.handle = None; + // Now we can move out the storage + // We need to use mem::forget to prevent Drop from running + let storage = std::mem::replace(&mut self.storage, unsafe { std::mem::zeroed() }); + std::mem::forget(self); + storage + } +} + +/// Register storage with a NIXL agent. +/// +/// This consumes the storage and returns a `NixlRegistered` wrapper that +/// manages the registration lifetime. The registration handle will be +/// automatically dropped when the wrapper is dropped, ensuring proper +/// cleanup order. +/// +/// # Arguments +/// * `storage` - The storage to register (consumed) +/// * `agent` - The NIXL agent to register with +/// * `opt` - Optional arguments for registration +/// +/// # Returns +/// A `NixlRegistered` wrapper containing the storage and registration handle. +pub fn register_with_nixl( + storage: S, + agent: &NixlAgent, + opt: Option<&OptArgs>, +) -> std::result::Result, S> +where + S: MemoryRegion + NixlCompatible, +{ + // Get NIXL parameters + let (ptr, size, mem_type, device_id) = storage.nixl_params(); + + // Create a NIXL descriptor for registration + let descriptor = NixlDescriptor { + addr: ptr as u64, + size, + mem_type, + device_id, + }; + + match agent.register_memory(&descriptor, opt) { + Ok(handle) => Ok(NixlRegistered { + storage, + handle: Some(handle), + agent_name: agent.name().to_string(), + }), + Err(_) => Err(storage), + } +} diff --git a/lib/llm/src/block_manager/v2/memory/system.rs b/lib/llm/src/block_manager/v2/memory/system.rs new file mode 100644 index 0000000000..f51abeb6bf --- /dev/null +++ b/lib/llm/src/block_manager/v2/memory/system.rs @@ -0,0 +1,131 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +//! System memory storage backed by malloc. + +use super::{MemoryRegion, Result, StorageError, StorageKind, actions}; +use std::any::Any; +use std::ptr::NonNull; + +use nix::libc; + +/// System memory allocated via malloc. +#[derive(Debug)] +pub struct SystemStorage { + ptr: NonNull, + len: usize, +} + +unsafe impl Send for SystemStorage {} +unsafe impl Sync for SystemStorage {} + +impl SystemStorage { + /// Allocate new system memory of the given size. + pub fn new(len: usize) -> Result { + if len == 0 { + return Err(StorageError::AllocationFailed( + "zero-sized allocations are not supported".into(), + )); + } + + let mut ptr: *mut libc::c_void = std::ptr::null_mut(); + + // We need 4KB alignment here for NIXL disk transfers to work. + // The O_DIRECT flag is required for GDS. + // However, a limitation of this flag is that all operations involving disk + // (both read and write) must be page-aligned. + // Pinned memory is already page-aligned, so we only need to align system memory. + // TODO(jthomson04): Is page size always 4KB? + + // SAFETY: malloc returns suitably aligned memory or null on failure. + let result = unsafe { libc::posix_memalign(&mut ptr, 4096, len) }; + if result != 0 { + return Err(StorageError::AllocationFailed(format!( + "posix_memalign failed for size {}", + len + ))); + } + let ptr = NonNull::new(ptr as *mut u8).ok_or_else(|| { + StorageError::AllocationFailed(format!("malloc failed for size {}", len)) + })?; + + // Zero-initialize the memory + unsafe { + std::ptr::write_bytes(ptr.as_ptr(), 0, len); + } + + Ok(Self { ptr, len }) + } + + /// Get a pointer to the underlying memory. + /// + /// # Safety + /// The caller must ensure the pointer is not used after this storage is dropped. + pub unsafe fn as_ptr(&self) -> *const u8 { + self.ptr.as_ptr() + } + + /// Get a mutable pointer to the underlying memory. + /// + /// # Safety + /// The caller must ensure the pointer is not used after this storage is dropped + /// and that there are no other references to this memory. + pub unsafe fn as_mut_ptr(&mut self) -> *mut u8 { + self.ptr.as_ptr() + } +} + +impl Drop for SystemStorage { + fn drop(&mut self) { + // SAFETY: pointer was allocated by malloc. 
+ unsafe { + libc::free(self.ptr.as_ptr() as *mut libc::c_void); + } + } +} + +impl MemoryRegion for SystemStorage { + fn addr(&self) -> usize { + self.ptr.as_ptr() as usize + } + + fn size(&self) -> usize { + self.len + } + + fn storage_kind(&self) -> StorageKind { + StorageKind::System + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +// Support for NIXL registration +impl super::registered::NixlCompatible for SystemStorage { + fn nixl_params(&self) -> (*const u8, usize, nixl_sys::MemType, u64) { + (self.ptr.as_ptr(), self.len, nixl_sys::MemType::Dram, 0) + } +} + +impl actions::Memset for SystemStorage { + fn memset(&mut self, value: u8, offset: usize, size: usize) -> Result<()> { + if offset + size > self.len { + return Err(StorageError::OperationFailed( + "memset: offset + size > storage size".into(), + )); + } + unsafe { + let ptr = self.ptr.as_ptr().add(offset); + std::ptr::write_bytes(ptr, value, size); + } + Ok(()) + } +} + +impl actions::Slice for SystemStorage { + fn as_slice(&self) -> Result<&[u8]> { + Ok(unsafe { std::slice::from_raw_parts(self.ptr.as_ptr(), self.len) }) + } +} diff --git a/lib/llm/src/block_manager/v2/memory/tests.rs b/lib/llm/src/block_manager/v2/memory/tests.rs new file mode 100644 index 0000000000..f354e72124 --- /dev/null +++ b/lib/llm/src/block_manager/v2/memory/tests.rs @@ -0,0 +1,129 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Tests for the storage-next module. + +use super::*; + +#[test] +fn test_system_storage() { + let storage = SystemStorage::new(1024).unwrap(); + assert_eq!(storage.size(), 1024); + assert_eq!(storage.storage_kind(), StorageKind::System); + assert!(storage.addr() != 0); + + // Test that we can create multiple allocations + let storage2 = SystemStorage::new(2048).unwrap(); + assert_eq!(storage2.size(), 2048); + assert_ne!(storage.addr(), storage2.addr()); +} + +#[test] +fn test_system_storage_zero_size() { + let result = SystemStorage::new(0); + assert!(result.is_err()); + assert!(matches!( + result.unwrap_err(), + StorageError::AllocationFailed(_) + )); +} + +#[test] +fn test_disk_storage_temp() { + let storage = DiskStorage::new(4096).unwrap(); + assert_eq!(storage.size(), 4096); + assert!(matches!(storage.storage_kind(), StorageKind::Disk(_))); + // Disk storage is file-backed, so addr() returns 0 (no memory address) + assert_eq!(storage.addr(), 0); + assert!(storage.path().exists()); +} + +#[test] +fn test_disk_storage_at_path() { + let temp_dir = tempfile::tempdir().unwrap(); + let path = temp_dir.path().join("test.bin"); + + let storage = DiskStorage::new_at(&path, 8192).unwrap(); + assert_eq!(storage.size(), 8192); + assert!(matches!(storage.storage_kind(), StorageKind::Disk(_))); + assert!(path.exists()); +} + +#[test] +fn test_type_erasure() { + let storage = SystemStorage::new(1024).unwrap(); + let erased: OwnedMemoryRegion = erase_storage(storage); + + assert_eq!(erased.size(), 1024); + assert_eq!(erased.storage_kind(), StorageKind::System); +} + +#[test] +fn test_memory_descriptor() { + let desc = MemoryDescriptor::new(0x1000, 4096); + assert_eq!(desc.addr, 0x1000); + assert_eq!(desc.size, 4096); +} + +#[cfg(feature = "testing-cuda")] +mod cuda_tests { + use super::*; + + #[test] + fn test_pinned_storage() { + let storage = PinnedStorage::new(2048).unwrap(); + assert_eq!(storage.size(), 2048); + assert_eq!(storage.storage_kind(), StorageKind::Pinned); + assert!(storage.addr() != 0); + } + + #[test] + fn 
test_pinned_storage_zero_size() { + let storage = PinnedStorage::new(0); + assert!(storage.is_err()); + assert!(matches!( + storage.unwrap_err(), + StorageError::AllocationFailed(_) + )); + } + + #[test] + fn test_device_storage() { + let storage = DeviceStorage::new(4096, 0).unwrap(); + assert_eq!(storage.size(), 4096); + assert_eq!(storage.storage_kind(), StorageKind::Device(0)); + assert!(storage.addr() != 0); + assert_eq!(storage.device_id(), 0); + } + + #[test] + fn test_device_storage_zero_size() { + let result = DeviceStorage::new(0, 0); + assert!(result.is_err()); + assert!(matches!( + result.unwrap_err(), + StorageError::AllocationFailed(_) + )); + } +} + +// Tests for NIXL registration would require a real NIXL agent, +// so we'll skip those for now. In practice, you'd mock the agent +// or use integration tests. +#[cfg(feature = "testing-nixl")] +mod nixl_tests { + use super::super::registered::register_with_nixl; + use super::*; + use nixl_sys::Agent as NixlAgent; + + // These tests would require a mock NIXL agent or real NIXL setup + // Placeholder for now + + #[test] + fn test_nixl_registration() { + let pinned = PinnedStorage::new(2048).unwrap(); + let agent = NixlAgent::new("test_agent").unwrap(); + let registered = register_with_nixl(pinned, &agent, None).unwrap(); + assert_eq!(registered.agent_name(), "test_agent"); + } +} diff --git a/lib/llm/src/block_manager/v2/memory/torch.rs b/lib/llm/src/block_manager/v2/memory/torch.rs new file mode 100644 index 0000000000..c60f5e2b31 --- /dev/null +++ b/lib/llm/src/block_manager/v2/memory/torch.rs @@ -0,0 +1,29 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum TorchDevice { + Cuda(usize), + Other(String), +} + +impl TorchDevice { + pub fn is_cuda(&self) -> bool { + matches!(self, TorchDevice::Cuda(_)) + } + + pub fn cuda_device_index(&self) -> Option { + match self { + TorchDevice::Cuda(index) => Some(*index), + TorchDevice::Other(_) => None, + } + } +} + +pub trait TorchTensor: std::fmt::Debug + Send + Sync { + fn device(&self) -> TorchDevice; + fn data_ptr(&self) -> u64; + fn size_bytes(&self) -> usize; + fn shape(&self) -> Vec; + fn stride(&self) -> Vec; +} diff --git a/lib/llm/src/block_manager/v2/physical/layout/builder.rs b/lib/llm/src/block_manager/v2/physical/layout/builder.rs new file mode 100644 index 0000000000..80b9b7c419 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/layout/builder.rs @@ -0,0 +1,864 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Typed builder for constructing [`PhysicalLayout`](crate::block_manager::v2::layout::PhysicalLayout) +//! instances with strongly-typed configuration, layout selection, and memory provisioning. +//! +//! The builder enforces the three steps required to materialize a physical layout: +//! 1. Provide a [`LayoutConfig`] +//! 2. Select a concrete layout (fully contiguous or layer separate) +//! 3. Specify memory backing (either by allocating or by supplying existing regions) +//! +//! NIXL registration is always enabled. Callers must provide a [`nixl_sys::Agent`], and any memory +//! supplied to the builder must implement [`NixlCompatible`]. 
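+//!
+//! A minimal usage sketch (assumes an already-constructed [`NixlAgent`] named `agent` and a
+//! valid [`LayoutConfig`] named `config`; the other `allocate_*`/`with_*_regions` memory
+//! options slot into the same chain):
+//!
+//! ```ignore
+//! let physical = PhysicalLayoutBuilder::new(agent)
+//!     .with_config(config)
+//!     .fully_contiguous()
+//!     .allocate_system()
+//!     .build()?;
+//! ```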
+ +use crate::block_manager::v2::physical::layout::physical::PhysicalLayout; + +use super::{ + BlockDimension, FullyContiguousLayout, LayerSeparateLayout, Layout, LayoutConfig, MemoryRegion, + physical::NixlMetadata, +}; + +use crate::block_manager::v2::memory::{ + DiskStorage, NixlCompatible, NixlDescriptor, OffsetMemoryRegion, OwnedMemoryRegion, + RegisteredView, StorageKind, SystemStorage, register_with_nixl, +}; +use anyhow::{Result, anyhow, bail}; +#[allow(unused_imports)] +use nixl_sys::Agent as RawNixlAgent; +use nixl_sys::MemType; +use std::marker::PhantomData; +use std::path::PathBuf; +use std::sync::Arc; + +use crate::block_manager::v2::memory::{DeviceStorage, PinnedStorage}; + +use crate::block_manager::v2::physical::transfer::nixl_agent::NixlAgent; + +const REGION_ALIGNMENT: usize = 512; + +/// Layout selection exposed by the builder. +#[derive(Debug, Clone)] +pub enum LayoutKind { + FullyContiguous, + LayerSeparate { block_dim: BlockDimension }, +} + +/// Allocation strategies for builder-managed memory. +#[derive(Debug, Clone)] +enum AllocationKind { + System, + Pinned { numa_aware: bool }, + + Device { device_id: u32 }, + Disk { path: Option }, +} + +/// Memory provisioning plan (either provided regions or an allocation request). +#[derive(Debug, Clone)] +enum MemoryPlan { + Provided(Vec), + Allocate(AllocationKind), +} + +/// Memory tenancy captured during the build process. +#[derive(Debug, Clone)] +struct MemoryEntry { + region: OwnedMemoryRegion, + descriptor: Option, +} + +impl MemoryEntry { + fn new(region: OwnedMemoryRegion, descriptor: Option) -> Self { + Self { region, descriptor } + } + + fn ensure_registered(mut self) -> Result { + if self.descriptor.is_none() { + self.descriptor = self.region.nixl_descriptor(); + } + + #[cfg(not(test))] + { + // In production, require NIXL registration + if self.descriptor.is_none() { + bail!( + "memory region {} is not registered with NIXL", + self.region.addr() + ); + } + } + + // In test builds, allow None descriptors for local-only layouts + Ok(self) + } +} + +/// Marker types for the builder state machine. +pub struct NoConfig; +pub struct HasConfig; + +pub struct NoLayout; +pub struct HasLayout; + +pub struct NoMemory; +pub struct HasMemory; + +/// Default builder state type alias. +pub type PhysicalLayoutBuilderDefault = PhysicalLayoutBuilder; + +/// Typed builder enforcing configuration, layout selection, and memory provisioning phases. +pub struct PhysicalLayoutBuilder { + agent: NixlAgent, + config: Option, + layout_kind: Option, + memory_plan: Option, + _config: PhantomData, + _layout: PhantomData, + _memory: PhantomData, +} + +impl PhysicalLayoutBuilder { + /// Create a new builder in its initial state. + pub fn new(agent: NixlAgent) -> Self { + Self { + agent, + config: None, + layout_kind: None, + memory_plan: None, + _config: PhantomData, + _layout: PhantomData, + _memory: PhantomData, + } + } +} + +impl PhysicalLayoutBuilder { + fn into_parts( + self, + ) -> ( + NixlAgent, + Option, + Option, + Option, + ) { + (self.agent, self.config, self.layout_kind, self.memory_plan) + } + + fn from_parts( + agent: NixlAgent, + config: Option, + layout_kind: Option, + memory_plan: Option, + ) -> PhysicalLayoutBuilder { + PhysicalLayoutBuilder { + agent, + config, + layout_kind, + memory_plan, + _config: PhantomData, + _layout: PhantomData, + _memory: PhantomData, + } + } +} + +impl PhysicalLayoutBuilder { + /// Attach the [`LayoutConfig`] required to size the layout and allocations. 
+ pub fn with_config(self, config: LayoutConfig) -> PhysicalLayoutBuilder { + let (agent, _config, layout_kind, memory_plan) = self.into_parts(); + PhysicalLayoutBuilder::::from_parts( + agent, + Some(config), + layout_kind, + memory_plan, + ) + } +} + +impl PhysicalLayoutBuilder { + /// Select the fully contiguous layout variant. + pub fn fully_contiguous(self) -> PhysicalLayoutBuilder { + let (agent, config, _layout, memory_plan) = self.into_parts(); + PhysicalLayoutBuilder::::from_parts( + agent, + config, + Some(LayoutKind::FullyContiguous), + memory_plan, + ) + } + + /// Select the layer-separate layout variant with the provided block dimension ordering. + pub fn layer_separate( + self, + block_dim: BlockDimension, + ) -> PhysicalLayoutBuilder { + let (agent, config, _layout, memory_plan) = self.into_parts(); + PhysicalLayoutBuilder::::from_parts( + agent, + config, + Some(LayoutKind::LayerSeparate { block_dim }), + memory_plan, + ) + } +} + +impl PhysicalLayoutBuilder { + fn set_memory_plan( + self, + plan: MemoryPlan, + ) -> PhysicalLayoutBuilder { + let (agent, config, layout_kind, _memory) = self.into_parts(); + PhysicalLayoutBuilder::::from_parts( + agent, + config, + layout_kind, + Some(plan), + ) + } + + pub fn allocate_system(self) -> PhysicalLayoutBuilder { + self.set_memory_plan(MemoryPlan::Allocate(AllocationKind::System)) + } + + /// Allocate pinned (page-locked) host memory. + pub fn allocate_pinned( + self, + numa_aware: bool, + ) -> PhysicalLayoutBuilder { + self.set_memory_plan(MemoryPlan::Allocate(AllocationKind::Pinned { numa_aware })) + } + + /// Allocate device memory on the specified CUDA device (or the context device if `None`). + pub fn allocate_device( + self, + device_id: u32, + ) -> PhysicalLayoutBuilder { + self.set_memory_plan(MemoryPlan::Allocate(AllocationKind::Device { device_id })) + } + + /// Allocate disk-backed storage. When `path` is `None`, a temporary file is used. + pub fn allocate_disk( + self, + path: Option, + ) -> PhysicalLayoutBuilder { + self.set_memory_plan(MemoryPlan::Allocate(AllocationKind::Disk { path })) + } + + /// Use existing NIXL-compatible memory regions supplied by the caller. + pub fn with_memory_regions( + self, + regions: Vec, + ) -> Result> + where + S: MemoryRegion + NixlCompatible + 'static, + { + let (agent, config, layout_kind, _memory) = self.into_parts(); + let entries = register_existing_regions(&agent, regions)?; + Ok( + PhysicalLayoutBuilder::::from_parts( + agent, + config, + layout_kind, + Some(MemoryPlan::Provided(entries)), + ), + ) + } + + /// Use pre-registered memory regions (already wrapped in `Arc`). + /// + /// All regions must already expose a NIXL descriptor. + pub fn with_registered_regions( + self, + regions: Vec, + ) -> Result> { + let entries = regions + .into_iter() + .enumerate() + .map(|(index, region)| { + let descriptor = region.nixl_descriptor().ok_or_else(|| { + anyhow!( + "provided memory region at index {} is not NIXL registered", + index + ) + })?; + Ok(MemoryEntry::new(region, Some(descriptor))) + }) + .collect::>>()?; + + let (agent, config, layout_kind, _memory) = self.into_parts(); + Ok( + PhysicalLayoutBuilder::::from_parts( + agent, + config, + layout_kind, + Some(MemoryPlan::Provided(entries)), + ), + ) + } +} + +impl PhysicalLayoutBuilder { + /// Finalize the builder, constructing the [`PhysicalLayout`]. 
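+    ///
+    /// Internally this computes the required allocation sizes, resolves the memory plan
+    /// (allocating or validating the provided regions), checks region sizes, derives the
+    /// storage kind and NIXL metadata, and then constructs the layout.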
+ pub fn build(self) -> Result { + let (agent, config, layout_kind, memory_plan) = self.into_parts(); + + let config = config.ok_or_else(|| anyhow!("layout config missing despite type state"))?; + let layout_kind = + layout_kind.ok_or_else(|| anyhow!("layout kind missing despite type state"))?; + let memory_plan = + memory_plan.ok_or_else(|| anyhow!("memory plan missing despite type state"))?; + + let required_sizes = compute_allocation_sizes(&config, &layout_kind)?; + let entries = resolve_memory_plan(&agent, memory_plan, &required_sizes)?; + + validate_memory_sizes(&entries, &required_sizes)?; + let kind = derive_storage_kind(&entries)?; + let metadata = derive_nixl_metadata(&agent, &entries)?; + + let layout: Arc = match layout_kind { + LayoutKind::FullyContiguous => { + let entry = entries.first().ok_or_else(|| { + anyhow!("fully contiguous layout requires a single memory region") + })?; + let layout = FullyContiguousLayout::new(config.clone(), Arc::clone(&entry.region))?; + Arc::new(layout) + } + LayoutKind::LayerSeparate { block_dim } => { + let regions: Vec = entries + .iter() + .map(|entry| Arc::clone(&entry.region)) + .collect(); + let layout = LayerSeparateLayout::new(config.clone(), regions, block_dim)?; + Arc::new(layout) + } + }; + + Ok(PhysicalLayout::new_local(layout, kind, metadata)) + } +} + +fn register_existing_regions(agent: &NixlAgent, regions: Vec) -> Result> +where + S: MemoryRegion + NixlCompatible + 'static, +{ + regions + .into_iter() + .map(|region| register_storage(region, agent)) + .collect() +} + +fn resolve_memory_plan( + agent: &NixlAgent, + plan: MemoryPlan, + sizes: &[usize], +) -> Result> { + match plan { + MemoryPlan::Provided(entries) => { + if entries.len() != sizes.len() { + bail!( + "provided memory count ({}) does not match required allocations ({})", + entries.len(), + sizes.len() + ); + } + entries + .into_iter() + .map(MemoryEntry::ensure_registered) + .collect() + } + MemoryPlan::Allocate(strategy) => allocate_regions(agent, strategy, sizes), + } +} + +fn allocate_regions( + agent: &NixlAgent, + strategy: AllocationKind, + sizes: &[usize], +) -> Result> { + if sizes.is_empty() { + return Ok(Vec::new()); + } + + let reserve_size = total_allocation_size(sizes, REGION_ALIGNMENT)?; + + let base_entry = match strategy { + AllocationKind::System => allocate_system_entry(reserve_size, agent)?, + AllocationKind::Pinned { numa_aware } => { + allocate_pinned_entry(reserve_size, agent, numa_aware)? + } + + AllocationKind::Device { device_id } => { + allocate_device_entry(reserve_size, agent, device_id)? 
+ } + AllocationKind::Disk { path } => allocate_disk_entry(reserve_size, agent, path)?, + }; + + create_offset_entries(base_entry, sizes, REGION_ALIGNMENT) +} + +fn allocate_system_entry(size: usize, agent: &NixlAgent) -> Result { + let storage = SystemStorage::new(size) + .map_err(|e| anyhow!("failed to allocate system memory ({size} bytes): {e}"))?; + register_storage(storage, agent) +} + +fn allocate_pinned_entry(size: usize, agent: &NixlAgent, _numa_aware: bool) -> Result { + let storage = PinnedStorage::new(size) + .map_err(|e| anyhow!("failed to allocate pinned memory ({size} bytes): {e}"))?; + register_storage(storage, agent) +} + +fn allocate_device_entry(size: usize, agent: &NixlAgent, device_id: u32) -> Result { + let storage = DeviceStorage::new(size, device_id).map_err(|e| { + anyhow!("failed to allocate device memory ({size} bytes) on device {device_id}: {e}") + })?; + register_storage(storage, agent) +} + +fn allocate_disk_entry( + size: usize, + agent: &NixlAgent, + path: Option, +) -> Result { + let storage = if let Some(path) = path { + DiskStorage::new_at(&path, size) + .map_err(|e| anyhow!("failed to allocate disk storage at {}: {e}", path.display()))? + } else { + DiskStorage::new(size).map_err(|e| anyhow!("failed to allocate disk storage: {e}"))? + }; + register_storage(storage, agent) +} + +// When testing, we allow unregistered layouts to help with test time. NIXL + UCX is very expensive to setup +// so we only use that backend when it's needed. +#[cfg(test)] +fn register_storage(storage: S, agent: &NixlAgent) -> Result +where + S: MemoryRegion + NixlCompatible + 'static, +{ + let storage_kind = storage.storage_kind(); + + // Determine if registration is needed based on storage type and available backends + let should_register = match storage_kind { + StorageKind::System | StorageKind::Pinned => { + // System/Pinned memory needs UCX for remote transfers + agent.has_backend("UCX") || agent.has_backend("POSIX") + } + StorageKind::Device(_) => { + // Device memory needs UCX for remote transfers OR GDS for direct disk transfers + agent.has_backend("UCX") || agent.has_backend("GDS_MT") + } + StorageKind::Disk(_) => { + // Disk storage needs POSIX for regular I/O OR GDS for GPU direct I/O + agent.has_backend("POSIX") || agent.has_backend("GDS_MT") + } + }; + + if !should_register { + // Skip registration - only local non-NIXL transfers will be used + let region: OwnedMemoryRegion = Arc::new(storage); + return Ok(MemoryEntry::new(region, None)); + } + + // Register with NIXL using the appropriate backend + match register_with_nixl(storage, agent.raw_agent(), None) { + Ok(registered) => { + let descriptor = registered.descriptor(); + let region: OwnedMemoryRegion = Arc::new(registered); + Ok(MemoryEntry::new(region, Some(descriptor))) + } + Err(_storage) => bail!("failed to register memory with NIXL agent {}", agent.name()), + } +} + +// Production builds always register +#[cfg(not(test))] +fn register_storage(storage: S, agent: &NixlAgent) -> Result +where + S: MemoryRegion + NixlCompatible + 'static, +{ + // Production builds always register for safety + match register_with_nixl(storage, agent.raw_agent(), None) { + Ok(registered) => { + let descriptor = registered.descriptor(); + let region: OwnedMemoryRegion = Arc::new(registered); + Ok(MemoryEntry::new(region, Some(descriptor))) + } + Err(_storage) => bail!("failed to register memory with NIXL agent {}", agent.name()), + } +} + +fn create_offset_entries( + base_entry: MemoryEntry, + sizes: &[usize], + alignment: usize, 
+) -> Result> { + if sizes.is_empty() { + return Ok(Vec::new()); + } + + let base_region = base_entry.region; + let base_descriptor = base_entry.descriptor; + let base_addr = base_region.addr(); + let base_len = base_region.size(); + + let mut entries = Vec::with_capacity(sizes.len()); + let mut offset = 0usize; + + for (index, &size) in sizes.iter().enumerate() { + let region = if index == 0 && offset == 0 && size == base_len && sizes.len() == 1 { + Arc::clone(&base_region) + } else { + let view = OffsetMemoryRegion::new(Arc::clone(&base_region), offset, size) + .map_err(|e| anyhow!("failed to create offset region: {e}"))?; + Arc::new(view) as OwnedMemoryRegion + }; + + let descriptor = base_descriptor + .as_ref() + .map(|descriptor| derive_descriptor(descriptor, offset, size)) + .transpose()?; + + entries.push(MemoryEntry::new(region, descriptor)); + + offset = offset + .checked_add(size) + .ok_or_else(|| anyhow!("offset computation overflow"))?; + + if index + 1 < sizes.len() && alignment > 1 { + let current_addr = base_addr + .checked_add(offset) + .ok_or_else(|| anyhow!("address computation overflow"))?; + let aligned_addr = align_up(current_addr, alignment)?; + offset = aligned_addr + .checked_sub(base_addr) + .ok_or_else(|| anyhow!("alignment subtraction overflow"))?; + } + } + + if offset > base_len { + bail!( + "allocated base region ({base_len} bytes) is insufficient for {offset} bytes with padding" + ); + } + + Ok(entries) +} + +fn derive_descriptor(base: &NixlDescriptor, offset: usize, size: usize) -> Result { + let mut descriptor = base.clone(); + descriptor.size = size; + if descriptor.mem_type != MemType::File { + descriptor.addr = descriptor + .addr + .checked_add(offset as u64) + .ok_or_else(|| anyhow!("descriptor address overflow"))?; + } + Ok(descriptor) +} + +fn compute_allocation_sizes(config: &LayoutConfig, kind: &LayoutKind) -> Result> { + match kind { + LayoutKind::FullyContiguous => { + let factors = [ + config.num_blocks, + config.num_layers, + config.outer_dim, + config.page_size, + config.inner_dim, + config.dtype_width_bytes, + ]; + let total = mul_chain(&factors)?; + Ok(vec![total]) + } + LayoutKind::LayerSeparate { .. 
} => { + let factors = [ + config.num_blocks, + config.outer_dim, + config.page_size, + config.inner_dim, + config.dtype_width_bytes, + ]; + let per_layer = mul_chain(&factors)?; + Ok(vec![per_layer; config.num_layers]) + } + } +} + +fn mul_chain(factors: &[usize]) -> Result { + factors.iter().try_fold(1usize, |acc, &value| { + acc.checked_mul(value) + .ok_or_else(|| anyhow!("allocation size overflow during layout computation")) + }) +} + +fn total_allocation_size(sizes: &[usize], alignment: usize) -> Result { + if sizes.is_empty() { + return Ok(0); + } + + let mut total = *sizes + .first() + .ok_or_else(|| anyhow!("allocation requires at least one region"))?; + + for size in sizes.iter().skip(1) { + total = total + .checked_add(*size) + .ok_or_else(|| anyhow!("allocation size overflow during aggregation"))?; + if alignment > 1 { + total = total + .checked_add(alignment - 1) + .ok_or_else(|| anyhow!("allocation alignment padding overflow"))?; + } + } + + Ok(total) +} + +fn align_up(value: usize, alignment: usize) -> Result { + if alignment <= 1 { + return Ok(value); + } + let remainder = value % alignment; + if remainder == 0 { + Ok(value) + } else { + value + .checked_add(alignment - remainder) + .ok_or_else(|| anyhow!("alignment overflow")) + } +} + +fn validate_memory_sizes(entries: &[MemoryEntry], required: &[usize]) -> Result<()> { + for (entry, &required_size) in entries.iter().zip(required.iter()) { + if entry.region.size() < required_size { + bail!( + "memory region too small: required {} bytes, available {} bytes", + required_size, + entry.region.size() + ); + } + } + Ok(()) +} + +fn derive_storage_kind(entries: &[MemoryEntry]) -> Result { + let first = entries + .first() + .ok_or_else(|| anyhow!("no memory regions available to determine storage location"))?; + let first_kind = first.region.storage_kind(); + + for entry in entries.iter().skip(1) { + let kind = entry.region.storage_kind(); + if kind != first_kind { + bail!( + "all memory regions must share the same storage location (found {:?} and {:?})", + first_kind, + kind + ); + } + } + + Ok(first_kind) +} + +fn derive_nixl_metadata(agent: &NixlAgent, entries: &[MemoryEntry]) -> Result { + // Try to find a descriptor from entries + let descriptor_opt = entries.iter().find_map(|entry| entry.descriptor.clone()); + + #[cfg(test)] + { + // In test builds, allow layouts without NIXL registration + // Use defaults for local-only transfers + if let Some(descriptor) = descriptor_opt { + Ok(NixlMetadata::new( + agent.name().to_string(), + descriptor.mem_type, + descriptor.device_id, + )) + } else { + // Use placeholder metadata for unregistered layouts + let first_entry = entries + .first() + .ok_or_else(|| anyhow!("no memory entries"))?; + let storage_kind = first_entry.region.storage_kind(); + let (mem_type, device_id) = match storage_kind { + StorageKind::System => (MemType::Dram, 0), + StorageKind::Pinned => (MemType::Dram, 0), + StorageKind::Device(id) => (MemType::Vram, id as u64), + StorageKind::Disk(id) => (MemType::File, id), + }; + Ok(NixlMetadata::new( + agent.name().to_string(), + mem_type, + device_id, + )) + } + } + + #[cfg(not(test))] + { + let descriptor = descriptor_opt + .ok_or_else(|| anyhow!("memory entries missing NIXL registration metadata"))?; + Ok(NixlMetadata::new( + agent.name().to_string(), + descriptor.mem_type, + descriptor.device_id, + )) + } +} + +#[cfg(all(test, feature = "testing-nixl"))] +mod tests { + use super::super::{BlockDimension, LayoutConfig}; + use super::*; + + use 
crate::block_manager::v2::memory::{MemoryRegion, OwnedMemoryRegion, StorageKind}; + use nixl_sys::MemType; + use std::any::Any; + use std::sync::Arc; + + #[derive(Debug)] + struct TestRegisteredRegion { + data: Vec, + kind: StorageKind, + descriptor: NixlDescriptor, + } + + impl TestRegisteredRegion { + fn new(size: usize, kind: StorageKind, mem_type: MemType, device_id: u64) -> Self { + let data = vec![0u8; size]; + let addr = data.as_ptr() as u64; + let descriptor = NixlDescriptor { + addr, + size, + mem_type, + device_id, + }; + Self { + data, + kind, + descriptor, + } + } + } + + impl MemoryRegion for TestRegisteredRegion { + fn addr(&self) -> usize { + self.data.as_ptr() as usize + } + + fn size(&self) -> usize { + self.data.len() + } + + fn storage_kind(&self) -> StorageKind { + self.kind + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn nixl_descriptor(&self) -> Option { + Some(self.descriptor.clone()) + } + } + + fn make_layout_config() -> LayoutConfig { + LayoutConfig::builder() + .num_blocks(2) + .num_layers(3) + .outer_dim(2) + .page_size(4) + .inner_dim(8) + .dtype_width_bytes(2) + .build() + .unwrap() + } + + fn fully_contiguous_size(cfg: &LayoutConfig) -> usize { + cfg.num_blocks + * cfg.num_layers + * cfg.outer_dim + * cfg.page_size + * cfg.inner_dim + * cfg.dtype_width_bytes + } + + fn per_layer_size(cfg: &LayoutConfig) -> usize { + cfg.num_blocks * cfg.outer_dim * cfg.page_size * cfg.inner_dim * cfg.dtype_width_bytes + } + + #[test] + fn builds_fully_contiguous_from_registered_regions() { + let agent = NixlAgent::require_backends("builder-test-fully", &[]) + .expect("failed to create wrapped agent"); + let cfg = make_layout_config(); + + let required = fully_contiguous_size(&cfg); + let region = Arc::new(TestRegisteredRegion::new( + required, + StorageKind::System, + MemType::Dram, + 0, + )) as OwnedMemoryRegion; + + let physical = PhysicalLayoutBuilder::new(agent.clone()) + .with_config(cfg.clone()) + .fully_contiguous() + .with_registered_regions(vec![region]) + .expect("registered regions accepted") + .build() + .expect("builder should succeed"); + + assert_eq!(physical.location(), StorageKind::System); + assert!(physical.layout().as_ref().is_fully_contiguous()); + assert_eq!(physical.layout().config().num_blocks, cfg.num_blocks); + assert_eq!(physical.layout().config().num_layers, cfg.num_layers); + + let metadata = physical.nixl_metadata(); + assert_eq!(metadata.agent_name(), agent.name()); + assert_eq!(metadata.mem_type(), MemType::Dram); + } + + #[test] + fn builds_layer_separate_from_registered_regions() { + let agent = NixlAgent::require_backends("builder-test-layer", &[]) + .expect("failed to create wrapped agent"); + let cfg = make_layout_config(); + + let per_layer = per_layer_size(&cfg); + let regions: Vec = (0..cfg.num_layers) + .map(|_| { + Arc::new(TestRegisteredRegion::new( + per_layer, + StorageKind::System, + MemType::Dram, + 0, + )) as OwnedMemoryRegion + }) + .collect(); + + let physical = PhysicalLayoutBuilder::new(agent.clone()) + .with_config(cfg.clone()) + .layer_separate(BlockDimension::BlockIsFirstDim) + .with_registered_regions(regions) + .expect("registered layer regions accepted") + .build() + .expect("builder should succeed"); + + assert_eq!(physical.location(), StorageKind::System); + assert!(!physical.layout().as_ref().is_fully_contiguous()); + assert_eq!(physical.layout().config().num_layers, cfg.num_layers); + + let metadata = physical.nixl_metadata(); + assert_eq!(metadata.agent_name(), agent.name()); + 
assert_eq!(metadata.mem_type(), MemType::Dram); + } +} + +// fn context_device_id(ctx: &TransferContext) -> u32 { +// ctx.stream().context().ordinal() as u32 +// } diff --git a/lib/llm/src/block_manager/v2/physical/layout/config.rs b/lib/llm/src/block_manager/v2/physical/layout/config.rs new file mode 100644 index 0000000000..e4900478db --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/layout/config.rs @@ -0,0 +1,101 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +use derive_builder::Builder; +use serde::{Deserialize, Serialize}; +use validator::{Validate, ValidationError}; + +use super::InnerShape; + +/// Configuration for block layouts +#[derive(Debug, Clone, Builder, Validate, Serialize, Deserialize, PartialEq, Eq)] +pub struct LayoutConfig { + /// Number of blocks + #[validate(range(min = 1))] + pub num_blocks: usize, + + /// Number of layers + #[validate(range(min = 1))] + pub num_layers: usize, + + /// Number of outer dimensions + #[validate(range(min = 1, max = 2))] + pub outer_dim: usize, + + /// Page size + #[validate(range(min = 1))] + pub page_size: usize, + + /// Inner dimension + #[validate(range(min = 1))] + pub inner_dim: usize, + + /// Alignment + #[validate(custom(function = "validate_power_of_2"))] + #[builder(default = "1")] + pub alignment: usize, + + /// Data type + #[validate(custom(function = "validate_dtype_width_bytes"))] + #[builder(default = "2")] + pub dtype_width_bytes: usize, + + /// Inner shape format (NHD, HND, or Unknown) + #[builder(default = "InnerShape::Unknown")] + pub inner_shape: InnerShape, +} + +impl LayoutConfig { + /// Builder for LayoutConfig + pub fn builder() -> LayoutConfigBuilder { + LayoutConfigBuilder::default() + } + + pub fn required_bytes(&self) -> usize { + self.num_blocks + .saturating_mul(self.num_layers) + .saturating_mul(self.outer_dim) + .saturating_mul(self.page_size) + .saturating_mul(self.inner_dim) + .saturating_mul(self.dtype_width_bytes) + } +} + +/// The first two dimensions of the tensor, `shape[0]` and `shape[1]`, one of those corresponds to the +/// block dimension, while the other corresponds to the outer dimension. +/// +/// The outer dimension is typically: +/// - 1: MLA or K and V stored together, +/// - 2: K and V stored separately, +/// +/// The block dimension tell us the number of blocks. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum BlockDimension { + /// The block dimension is the first dimension of the tensor, `[n_blocks, outer_dim, inner_dim]` + BlockIsFirstDim, + + /// The block dimension is the second dimension of the tensor, `[outer_dim, n_blocks, inner_dim]` + /// This is a replacement for v1's `outer_contiguous` is true. + BlockIsSecondDim, +} + +/// Validation function for Option to check if it's Some(power_of_2). 
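+/// (The builder default of 1 is a power of two and therefore passes.)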
+pub fn validate_power_of_2(alignment: usize) -> Result<(), ValidationError> { + if !alignment.is_power_of_two() { + // Return validation error if alignment is not a power of 2 + return Err(validator::ValidationError::new( + "alignment_must_be_power_of_2", + )); + } + // Passes validation if alignment is a power of 2 + Ok(()) +} + +pub fn validate_dtype_width_bytes(dtype_width_bytes: usize) -> Result<(), ValidationError> { + if !dtype_width_bytes.is_power_of_two() || !(2..=8).contains(&dtype_width_bytes) { + return Err(validator::ValidationError::new( + "dtype_width_bytes_must_be_power_of_two_and_less_than_8_bytes", + )); + } + Ok(()) +} diff --git a/lib/llm/src/block_manager/v2/physical/layout/fully_contiguous.rs b/lib/llm/src/block_manager/v2/physical/layout/fully_contiguous.rs new file mode 100644 index 0000000000..7a438c8a42 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/layout/fully_contiguous.rs @@ -0,0 +1,271 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Fully contiguous layout implementation. +//! +//! This layout stores all blocks in a single contiguous memory allocation +//! with the shape: [num_blocks, num_layers, outer_dim, page_size, inner_dim]. + +use anyhow::{Result, anyhow}; +use std::sync::Arc; +use validator::Validate; + +use super::serialize::{BlockFormat, FullyContiguousDetails, LayoutTypeDetails}; +use super::{Layout, LayoutConfig, MemoryDescriptor, MemoryRegion, OwnedMemoryRegion}; + +/// Fully contiguous layout where all blocks are in a single allocation. +#[derive(Debug)] +pub struct FullyContiguousLayout { + config: LayoutConfig, + /// Base address of the allocation + base_addr: usize, + /// Stride between blocks in bytes + block_stride: usize, + /// Stride between layers in bytes + layer_stride: usize, + /// Stride between outer dimensions in bytes + outer_stride: usize, + /// Size of each memory region (page) in bytes + region_size: usize, + /// Owned memory region backing this layout + memory: Arc, + /// Format of blocks in memory + block_format: BlockFormat, +} + +impl FullyContiguousLayout { + /// Create a new fully contiguous layout. + /// + /// # Arguments + /// * `config` - Layout configuration + /// * `memory` - Owned memory region that backs this layout + /// + /// # Returns + /// A new FullyContiguousLayout instance + pub fn new(config: LayoutConfig, memory: Arc) -> Result { + config.validate()?; + + let base_addr = memory.addr(); + + // Calculate strides + let region_size = config.page_size * config.inner_dim * config.dtype_width_bytes; + let outer_stride = region_size; + let layer_stride = outer_stride * config.outer_dim; + let block_stride = layer_stride * config.num_layers; + + // Validate that the memory region is large enough + let required_size = block_stride * config.num_blocks; + if memory.size() < required_size { + return Err(anyhow!( + "Memory region too small for layout. Required: {} bytes, got: {} bytes", + required_size, + memory.size() + )); + } + + Ok(Self { + config, + base_addr, + block_stride, + layer_stride, + outer_stride, + region_size, + memory, + block_format: BlockFormat::default(), + }) + } + + /// Create a new fully contiguous layout with a specific block format. 
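+    /// (Delegates to [`Self::new`] and then overrides the default [`BlockFormat`].)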
+ /// + /// # Arguments + /// * `config` - Layout configuration + /// * `memory` - Owned memory region that backs this layout + /// * `block_format` - Format of blocks in memory + /// + /// # Returns + /// A new FullyContiguousLayout instance + pub(crate) fn new_with_format( + config: LayoutConfig, + memory: Arc, + block_format: BlockFormat, + ) -> Result { + let mut layout = Self::new(config, memory)?; + layout.block_format = block_format; + Ok(layout) + } + + /// Get the block format. + pub fn block_format(&self) -> BlockFormat { + self.block_format + } + + /// Calculate the address of a specific memory region. + fn calculate_address( + &self, + block_id: usize, + layer_id: usize, + outer_id: usize, + ) -> Result { + if block_id >= self.config.num_blocks { + return Err(anyhow!( + "Block ID {} out of range (max: {})", + block_id, + self.config.num_blocks + )); + } + if layer_id >= self.config.num_layers { + return Err(anyhow!( + "Layer ID {} out of range (max: {})", + layer_id, + self.config.num_layers + )); + } + if outer_id >= self.config.outer_dim { + return Err(anyhow!( + "Outer ID {} out of range (max: {})", + outer_id, + self.config.outer_dim + )); + } + + Ok(self.base_addr + + block_id * self.block_stride + + layer_id * self.layer_stride + + outer_id * self.outer_stride) + } + + /// Get mutable reference to the memory Arc for NIXL registration. + pub fn memory_arc_mut(&mut self) -> &mut Arc { + &mut self.memory + } +} + +impl Layout for FullyContiguousLayout { + fn config(&self) -> &LayoutConfig { + &self.config + } + + fn memory_regions(&self) -> &[OwnedMemoryRegion] { + std::slice::from_ref(&self.memory) + } + + fn memory_region( + &self, + block_id: usize, + layer_id: usize, + outer_id: usize, + ) -> Result { + let addr = self.calculate_address(block_id, layer_id, outer_id)?; + Ok(MemoryDescriptor::new(addr, self.region_size)) + } + + fn required_allocations(&self) -> Vec { + // Single contiguous allocation + vec![self.block_stride * self.config.num_blocks] + } + + fn is_fully_contiguous(&self) -> bool { + true + } + + fn num_blocks(&self) -> usize { + self.config.num_blocks + } + + fn num_layers(&self) -> usize { + self.config.num_layers + } + + fn outer_dim(&self) -> usize { + self.config.outer_dim + } + + fn page_size(&self) -> usize { + self.config.page_size + } + + fn inner_dim(&self) -> usize { + self.config.inner_dim + } + + fn dtype_width_bytes(&self) -> usize { + self.config.dtype_width_bytes + } + + fn serialization_details(&self) -> LayoutTypeDetails { + LayoutTypeDetails::FullyContiguous(FullyContiguousDetails { + block_format: self.block_format, + }) + } +} + +#[cfg(test)] +mod tests { + use super::super::tests::*; + use super::*; + + #[test] + fn test_fully_contiguous_layout_creation() { + let config = LayoutConfig::builder() + .num_blocks(10) + .num_layers(4) + .outer_dim(2) + .page_size(16) + .inner_dim(128) + .dtype_width_bytes(2) + .build() + .unwrap(); + + let required_bytes = config.required_bytes(); + assert_eq!(required_bytes, 10 * 4 * 2 * 16 * 128 * 2); + + let memory = MockMemory::new(0x1000, required_bytes); + + let layout = FullyContiguousLayout::new(config, memory).unwrap(); + assert_eq!(layout.num_blocks(), 10); + assert!(layout.is_fully_contiguous()); + } + + #[test] + fn test_memory_region() { + let config = LayoutConfig::builder() + .num_blocks(2) + .num_layers(2) + .outer_dim(2) + .page_size(16) + .inner_dim(128) + .dtype_width_bytes(2) + .build() + .unwrap(); + + let required_size = config.required_bytes(); + let memory = MockMemory::new(0x1000, 
required_size); + let layout = FullyContiguousLayout::new(config.clone(), memory).unwrap(); + + // Test accessing specific memory regions + let region_size = config.page_size * config.inner_dim * config.dtype_width_bytes; + + // Block 0, Layer 0, Outer 0 + let region = layout.memory_region(0, 0, 0).unwrap(); + assert_eq!(region.addr, 0x1000); + assert_eq!(region.size, region_size); + + // Block 0, Layer 0, Outer 1 + let region = layout.memory_region(0, 0, 1).unwrap(); + assert_eq!(region.addr, 0x1000 + region_size); + assert_eq!(region.size, region_size); + + // Block 0, Layer 1, Outer 0 + let region = layout.memory_region(0, 1, 0).unwrap(); + assert_eq!(region.addr, 0x1000 + 2 * region_size); + assert_eq!(region.size, region_size); + + // Block 1, Layer 0, Outer 0 + let region = layout.memory_region(1, 0, 0).unwrap(); + assert_eq!( + region.addr, + 0x1000 + (config.outer_dim * config.num_layers * region_size) + ); + assert_eq!(region.size, region_size); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/layout/integration_tests.rs b/lib/llm/src/block_manager/v2/physical/layout/integration_tests.rs new file mode 100644 index 0000000000..e09a71cef5 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/layout/integration_tests.rs @@ -0,0 +1,401 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Integration tests comparing v1 and v2 layout implementations. +//! +//! These tests validate that the new v2 layout system produces identical +//! memory region **addresses** as the proven v1 implementation. +//! +//! **Note on Size Differences**: V1's `memory_region()` returns `layer_stride` as the +//! size (covering all outer dimensions), while V2 returns `outer_stride` (single page). +//! This is an intentional API difference - V2 provides more granular access. +//! Therefore, these tests only compare addresses, not sizes. + +#![cfg(test)] + +use anyhow::Result; +use std::{any::Any, sync::Arc}; + +use crate::block_manager::{ + layout::{ + BlockDimension, BlockLayout, BlockLayoutConfig, GenericBlockLayout, LayoutConfig, + LayoutType, + tests::{setup_layer_separate_layout, setup_layout}, + }, + storage::{Storage, tests::NullDeviceStorage}, + v2::storage::StorageKind, +}; + +use super::{ + FullyContiguousLayout, LayerSeparateLayout, Layout, LayoutConfig as V2LayoutConfig, + MemoryRegion, +}; + +// Test constants matching v1 tests +const NUM_BLOCKS: usize = 7; +const NUM_LAYERS: usize = 5; +const OUTER_DIM: usize = 2; +const PAGE_SIZE: usize = 4; +const INNER_DIM: usize = 13; +const DTYPE_WIDTH_BYTES: usize = 4; + +/// Wrapper to make v1 NullDeviceStorage compatible with v2 MemoryRegion trait. 
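+/// (Forwards `addr()`/`size()` from the wrapped v1 storage and reports `StorageKind::System`.)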
+#[derive(Debug)] +struct V1StorageWrapper { + storage: NullDeviceStorage, +} + +impl MemoryRegion for V1StorageWrapper { + fn addr(&self) -> usize { + self.storage.addr() as usize + } + + fn size(&self) -> usize { + self.storage.size() + } + + fn storage_kind(&self) -> StorageKind { + StorageKind::System + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +/// Create v1 layout configuration +fn create_v1_config() -> LayoutConfig { + LayoutConfig::builder() + .num_blocks(NUM_BLOCKS) + .num_layers(NUM_LAYERS) + .outer_dim(OUTER_DIM) + .page_size(PAGE_SIZE) + .inner_dim(INNER_DIM) + .alignment(1) + .dtype_width_bytes(DTYPE_WIDTH_BYTES) + .build() + .unwrap() +} + +/// Create v2 layout configuration (equivalent to v1) +fn create_v2_config() -> V2LayoutConfig { + create_v1_config() +} + +#[test] +fn test_v1_v2_fully_contiguous_equivalence() -> Result<()> { + // Create v1 layout + let v1_layout = setup_layout(None)?; + + // Create v2 layout with same configuration + let v2_config = create_v2_config(); + let required_size = + NUM_BLOCKS * NUM_LAYERS * OUTER_DIM * PAGE_SIZE * INNER_DIM * DTYPE_WIDTH_BYTES; + let v1_storage = NullDeviceStorage::new(required_size as u64); + let memory = Arc::new(V1StorageWrapper { + storage: v1_storage, + }) as Arc; + let v2_layout = FullyContiguousLayout::new(v2_config, memory)?; + + // Compare all memory regions + for block_id in 0..NUM_BLOCKS { + for layer_id in 0..NUM_LAYERS { + for outer_id in 0..OUTER_DIM { + let v1_region = v1_layout.memory_region(block_id, layer_id, outer_id)?; + let v2_region = v2_layout.memory_region(block_id, layer_id, outer_id)?; + + assert_eq!( + v1_region.addr(), + v2_region.addr, + "Address mismatch at block={}, layer={}, outer={}", + block_id, + layer_id, + outer_id + ); + assert_eq!( + v1_region.size(), + v2_region.size, + "Size mismatch at block={}, layer={}, outer={}", + block_id, + layer_id, + outer_id + ); + } + } + } + + // Verify metadata + assert_eq!(v1_layout.num_blocks(), v2_layout.num_blocks()); + assert_eq!(v1_layout.num_layers(), v2_layout.num_layers()); + assert_eq!(v1_layout.outer_dim(), v2_layout.outer_dim()); + assert_eq!(v1_layout.page_size(), v2_layout.page_size()); + assert_eq!(v1_layout.inner_dim(), v2_layout.inner_dim()); + + Ok(()) +} + +#[test] +fn test_v1_v2_layer_separate_block_contiguous_equivalence() -> Result<()> { + // Create v1 layout (block contiguous = !outer_contiguous) + let v1_layout = setup_layer_separate_layout(None, BlockDimension::BlockIsFirstDim)?; + + // Create v2 layout with same configuration + let v2_config = create_v2_config(); + let per_layer_size = NUM_BLOCKS * OUTER_DIM * PAGE_SIZE * INNER_DIM * DTYPE_WIDTH_BYTES; + + let memory: Vec> = (0..NUM_LAYERS) + .map(|_| { + Arc::new(V1StorageWrapper { + storage: NullDeviceStorage::new(per_layer_size as u64), + }) as Arc + }) + .collect(); + + let v2_layout = LayerSeparateLayout::new(v2_config, memory, BlockDimension::BlockIsFirstDim)?; + + // Verify metadata + assert_eq!(v1_layout.num_blocks(), v2_layout.num_blocks()); + assert_eq!(v1_layout.num_layers(), v2_layout.num_layers()); + assert_eq!(v1_layout.outer_dim(), v2_layout.outer_dim()); + assert_eq!(v1_layout.page_size(), v2_layout.page_size()); + assert_eq!(v1_layout.inner_dim(), v2_layout.inner_dim()); + + // Compare all memory regions + for block_id in 0..NUM_BLOCKS { + for layer_id in 0..NUM_LAYERS { + for outer_id in 0..OUTER_DIM { + let v1_region = v1_layout.memory_region(block_id, layer_id, outer_id)?; + let v2_region = v2_layout.memory_region(block_id, layer_id, outer_id)?; + + 
assert_eq!( + v1_region.addr(), + v2_region.addr, + "Address mismatch at block={}, layer={}, outer={} (block_contiguous)", + block_id, + layer_id, + outer_id + ); + assert_eq!( + v1_region.size(), + v2_region.size, + "Size mismatch at block={}, layer={}, outer={} (block_contiguous)", + block_id, + layer_id, + outer_id + ); + } + } + } + + // Verify layout type + assert!(!v2_layout.is_fully_contiguous()); + + assert_eq!( + v1_layout.layout_type(), + LayoutType::LayerSeparate { + block_dim: BlockDimension::BlockIsFirstDim, + } + ); + + Ok(()) +} + +#[test] +fn test_v1_v2_layer_separate_outer_contiguous_equivalence() -> Result<()> { + // Create v1 layout (outer contiguous) + let v1_layout = setup_layer_separate_layout(None, BlockDimension::BlockIsSecondDim)?; + + // Create v2 layout with same configuration + let v2_config = create_v2_config(); + let per_layer_size = NUM_BLOCKS * OUTER_DIM * PAGE_SIZE * INNER_DIM * DTYPE_WIDTH_BYTES; + + let memory: Vec> = (0..NUM_LAYERS) + .map(|_| { + Arc::new(V1StorageWrapper { + storage: NullDeviceStorage::new(per_layer_size as u64), + }) as Arc + }) + .collect(); + + let v2_layout = LayerSeparateLayout::new(v2_config, memory, BlockDimension::BlockIsSecondDim)?; + + // Compare all memory regions + for block_id in 0..NUM_BLOCKS { + for layer_id in 0..NUM_LAYERS { + for outer_id in 0..OUTER_DIM { + let v1_region = v1_layout.memory_region(block_id, layer_id, outer_id)?; + let v2_region = v2_layout.memory_region(block_id, layer_id, outer_id)?; + + assert_eq!( + v1_region.addr(), + v2_region.addr, + "Address mismatch at block={}, layer={}, outer={} (outer_contiguous)", + block_id, + layer_id, + outer_id + ); + assert_eq!( + v1_region.size(), + v2_region.size, + "Size mismatch at block={}, layer={}, outer={} (outer_contiguous)", + block_id, + layer_id, + outer_id + ); + } + } + } + + // Verify layout type + assert!(!v2_layout.is_fully_contiguous()); + assert_eq!( + v1_layout.layout_type(), + LayoutType::LayerSeparate { + block_dim: BlockDimension::BlockIsSecondDim, + } + ); + + Ok(()) +} + +#[test] +fn test_v1_v2_stride_calculations() -> Result<()> { + // Test with a specific pattern to verify stride calculations + let _v1_layout = setup_layout(None)?; + let v2_config = create_v2_config(); + let required_size = + NUM_BLOCKS * NUM_LAYERS * OUTER_DIM * PAGE_SIZE * INNER_DIM * DTYPE_WIDTH_BYTES; + let v1_storage = NullDeviceStorage::new(required_size as u64); + let memory = Arc::new(V1StorageWrapper { + storage: v1_storage, + }) as Arc; + let v2_layout = FullyContiguousLayout::new(v2_config, memory)?; + + // Calculate expected strides + let region_size = PAGE_SIZE * INNER_DIM * DTYPE_WIDTH_BYTES; + let outer_stride = region_size; + let layer_stride = outer_stride * OUTER_DIM; + let block_stride = layer_stride * NUM_LAYERS; + + // Test stride consistency across blocks + for block_id in 0..NUM_BLOCKS - 1 { + let region_b0 = v2_layout.memory_region(block_id, 0, 0)?; + let region_b1 = v2_layout.memory_region(block_id + 1, 0, 0)?; + assert_eq!( + region_b1.addr - region_b0.addr, + block_stride, + "Block stride mismatch between blocks {} and {}", + block_id, + block_id + 1 + ); + } + + // Test stride consistency across layers + for layer_id in 0..NUM_LAYERS - 1 { + let region_l0 = v2_layout.memory_region(0, layer_id, 0)?; + let region_l1 = v2_layout.memory_region(0, layer_id + 1, 0)?; + assert_eq!( + region_l1.addr - region_l0.addr, + layer_stride, + "Layer stride mismatch between layers {} and {}", + layer_id, + layer_id + 1 + ); + } + + // Test stride consistency 
across outer dimensions + for outer_id in 0..OUTER_DIM - 1 { + let region_o0 = v2_layout.memory_region(0, 0, outer_id)?; + let region_o1 = v2_layout.memory_region(0, 0, outer_id + 1)?; + assert_eq!( + region_o1.addr - region_o0.addr, + outer_stride, + "Outer stride mismatch between outer dims {} and {}", + outer_id, + outer_id + 1 + ); + } + + Ok(()) +} + +#[test] +fn test_v1_v2_edge_case_single_block() -> Result<()> { + // Test with minimal configuration: single block + let v1_config = LayoutConfig::builder() + .num_blocks(1) + .num_layers(NUM_LAYERS) + .outer_dim(OUTER_DIM) + .page_size(PAGE_SIZE) + .inner_dim(INNER_DIM) + .dtype_width_bytes(DTYPE_WIDTH_BYTES) + .build() + .unwrap(); + + let v1_layout = crate::block_manager::layout::FullyContiguous::allocate( + v1_config.clone(), + &crate::block_manager::storage::tests::NullDeviceAllocator, + )?; + + let v2_config = v1_config.clone(); + + let required_size = 1 * NUM_LAYERS * OUTER_DIM * PAGE_SIZE * INNER_DIM * DTYPE_WIDTH_BYTES; + let v1_storage = NullDeviceStorage::new(required_size as u64); + let memory = Arc::new(V1StorageWrapper { + storage: v1_storage, + }) as Arc; + let v2_layout = FullyContiguousLayout::new(v2_config, memory)?; + + // Compare the single block across all layers and outer dims + for layer_id in 0..NUM_LAYERS { + for outer_id in 0..OUTER_DIM { + let v1_region = v1_layout.memory_region(0, layer_id, outer_id)?; + let v2_region = v2_layout.memory_region(0, layer_id, outer_id)?; + + assert_eq!(v1_region.addr(), v2_region.addr); + assert_eq!(v1_region.size(), v2_region.size); + } + } + + Ok(()) +} + +#[test] +fn test_v1_v2_edge_case_single_layer() -> Result<()> { + // Test with minimal configuration: single layer + let v1_config = LayoutConfig::builder() + .num_blocks(NUM_BLOCKS) + .num_layers(1) + .outer_dim(OUTER_DIM) + .page_size(PAGE_SIZE) + .inner_dim(INNER_DIM) + .dtype_width_bytes(DTYPE_WIDTH_BYTES) + .build()?; + + let v1_layout = crate::block_manager::layout::FullyContiguous::allocate( + v1_config.clone(), + &crate::block_manager::storage::tests::NullDeviceAllocator, + )?; + + let v2_config = v1_config.clone(); + + let required_size = NUM_BLOCKS * 1 * OUTER_DIM * PAGE_SIZE * INNER_DIM * DTYPE_WIDTH_BYTES; + let v1_storage = NullDeviceStorage::new(required_size as u64); + let memory = Arc::new(V1StorageWrapper { + storage: v1_storage, + }) as Arc; + let v2_layout = FullyContiguousLayout::new(v2_config, memory)?; + + // Compare the single layer across all blocks and outer dims + for block_id in 0..NUM_BLOCKS { + for outer_id in 0..OUTER_DIM { + let v1_region = v1_layout.memory_region(block_id, 0, outer_id)?; + let v2_region = v2_layout.memory_region(block_id, 0, outer_id)?; + + assert_eq!(v1_region.addr(), v2_region.addr); + assert_eq!(v1_region.size(), v2_region.size); + } + } + + Ok(()) +} diff --git a/lib/llm/src/block_manager/v2/physical/layout/layer_separate.rs b/lib/llm/src/block_manager/v2/physical/layout/layer_separate.rs new file mode 100644 index 0000000000..035dbb18b1 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/layout/layer_separate.rs @@ -0,0 +1,311 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Layer-separate layout implementation. +//! +//! This layout stores each layer in its own allocation, which is the typical +//! vLLM layout. Each layer can be either block-contiguous or outer-contiguous: +//! - Block-contiguous: [num_blocks, outer_dim, page_size, inner_dim] +//! 
- Outer-contiguous: [outer_dim, num_blocks, page_size, inner_dim] + +use anyhow::{Result, anyhow}; +use std::sync::Arc; +use validator::Validate; + +use super::serialize::{LayerSeparateDetails, LayoutTypeDetails}; +use super::{ + BlockDimension, Layout, LayoutConfig, MemoryDescriptor, MemoryRegion, OwnedMemoryRegion, +}; + +/// Layer-separate layout where each layer has its own allocation. +#[derive(Debug)] +pub struct LayerSeparateLayout { + config: LayoutConfig, + /// Base addresses for each layer + layer_base_addrs: Vec, + /// Whether the outer dimension is contiguous (vs block dimensionl + block_dim: BlockDimension, + /// Stride between blocks in bytes + block_stride: usize, + /// Stride between outer dimensions in bytes + outer_stride: usize, + /// Size of each memory region (page) in bytes + region_size: usize, + /// Owned memory regions backing this layout (one per layer) + memory_regions: Vec>, +} + +impl LayerSeparateLayout { + /// Create a new layer-separate layout. + /// + /// # Arguments + /// - `config` - Layout configuration + /// - `memory` - Vector of owned memory regions (one per layer) + /// - `outer_contiguous` - If true, outer dimension is contiguous with the inner dimension, i.e. (num_blocks, outer_dim, ...); + /// if false, block dimension is contiguous with the inner dimension, i.e. (outer_dim, num_blocks, ...). + /// + /// # Returns + /// A new LayerSeparateLayout instance + pub fn new( + config: LayoutConfig, + memory: Vec>, + block_dim: BlockDimension, + ) -> Result { + config.validate()?; + + if memory.len() != config.num_layers { + return Err(anyhow!( + "Memory region count ({}) must match num_layers ({})", + memory.len(), + config.num_layers + )); + } + + // Calculate strides + let region_size = config.page_size * config.inner_dim * config.dtype_width_bytes; + + let (block_stride, outer_stride) = if block_dim == BlockDimension::BlockIsSecondDim { + // Layout: [outer_dim, num_blocks, page_size, inner_dim] + let block_stride = region_size; + let outer_stride = block_stride * config.num_blocks; + (block_stride, outer_stride) + } else { + // Layout: [num_blocks, outer_dim, page_size, inner_dim] + let outer_stride = region_size; + let block_stride = outer_stride * config.outer_dim; + (block_stride, outer_stride) + }; + + // Extract base addresses and validate sizes + let mut layer_base_addrs = Vec::with_capacity(config.num_layers); + let required_size = config.num_blocks * config.outer_dim * region_size; + + for (i, mem) in memory.iter().enumerate() { + if mem.size() < required_size { + return Err(anyhow!( + "Memory region {} too small for layout. Required: {} bytes, got: {} bytes", + i, + required_size, + mem.size() + )); + } + layer_base_addrs.push(mem.addr()); + } + + Ok(Self { + config, + layer_base_addrs, + block_dim, + block_stride, + outer_stride, + region_size, + memory_regions: memory, + }) + } + + /// Calculate the address of a specific memory region. 
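+    /// (Roughly `layer_base_addrs[layer_id] + block_id * block_stride + outer_id * outer_stride`,
+    /// after bounds-checking each index.)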
+ fn calculate_address( + &self, + block_id: usize, + layer_id: usize, + outer_id: usize, + ) -> Result { + if block_id >= self.config.num_blocks { + return Err(anyhow!( + "Block ID {} out of range (max: {})", + block_id, + self.config.num_blocks + )); + } + if layer_id >= self.config.num_layers { + return Err(anyhow!( + "Layer ID {} out of range (max: {})", + layer_id, + self.config.num_layers + )); + } + if outer_id >= self.config.outer_dim { + return Err(anyhow!( + "Outer ID {} out of range (max: {})", + outer_id, + self.config.outer_dim + )); + } + + let base_addr = self.layer_base_addrs[layer_id]; + let offset = block_id * self.block_stride + outer_id * self.outer_stride; + + Ok(base_addr + offset) + } + + pub fn block_dim(&self) -> BlockDimension { + self.block_dim + } + + /// Get mutable reference to the memory regions for NIXL registration. + pub fn memory_regions_mut(&mut self) -> &mut [Arc] { + &mut self.memory_regions + } +} + +impl Layout for LayerSeparateLayout { + fn config(&self) -> &LayoutConfig { + &self.config + } + + fn memory_regions(&self) -> &[OwnedMemoryRegion] { + &self.memory_regions + } + + fn memory_region( + &self, + block_id: usize, + layer_id: usize, + outer_id: usize, + ) -> Result { + let addr = self.calculate_address(block_id, layer_id, outer_id)?; + Ok(MemoryDescriptor::new(addr, self.region_size)) + } + + fn required_allocations(&self) -> Vec { + // One allocation per layer + let per_layer_size = self.config.num_blocks * self.config.outer_dim * self.region_size; + vec![per_layer_size; self.config.num_layers] + } + + fn is_fully_contiguous(&self) -> bool { + false + } + + fn num_blocks(&self) -> usize { + self.config.num_blocks + } + + fn num_layers(&self) -> usize { + self.config.num_layers + } + + fn outer_dim(&self) -> usize { + self.config.outer_dim + } + + fn page_size(&self) -> usize { + self.config.page_size + } + + fn inner_dim(&self) -> usize { + self.config.inner_dim + } + + fn dtype_width_bytes(&self) -> usize { + self.config.dtype_width_bytes + } + + fn serialization_details(&self) -> LayoutTypeDetails { + LayoutTypeDetails::LayerSeparate(LayerSeparateDetails { + block_dim: self.block_dim, + }) + } +} + +#[cfg(test)] +mod tests { + use super::super::tests::*; + use super::*; + + #[test] + fn test_layer_separate_block_contiguous() { + let config = LayoutConfig::builder() + .num_blocks(10) + .num_layers(4) + .outer_dim(2) + .page_size(16) + .inner_dim(128) + .dtype_width_bytes(2) + .build() + .unwrap(); + + let per_layer_size = 10 * 2 * 16 * 128 * 2; + let memory: Vec> = (0..4) + .map(|i| { + MockMemory::new(0x1000 + i * per_layer_size, per_layer_size) + as Arc + }) + .collect(); + + let layout = + LayerSeparateLayout::new(config, memory, BlockDimension::BlockIsFirstDim).unwrap(); + + assert_eq!(layout.num_blocks(), 10); + assert!(!layout.is_fully_contiguous()); + assert_eq!(layout.required_allocations().len(), 4); + } + + #[test] + fn test_layer_separate_outer_contiguous() { + let config = LayoutConfig::builder() + .num_blocks(10) + .num_layers(4) + .outer_dim(2) + .page_size(16) + .inner_dim(128) + .dtype_width_bytes(2) + .build() + .unwrap(); + + let per_layer_size = 10 * 2 * 16 * 128 * 2; + let memory: Vec> = (0..4) + .map(|i| { + MockMemory::new(0x1000 + i * per_layer_size, per_layer_size) + as Arc + }) + .collect(); + + let layout = + LayerSeparateLayout::new(config, memory, BlockDimension::BlockIsSecondDim).unwrap(); + assert_eq!(layout.num_blocks(), 10); + assert!(!layout.is_fully_contiguous()); + } + + #[test] + fn test_memory_region() { 
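+        // 2 blocks x 2 layers x 2 outer dims, block-first ordering: the layer index selects the
+        // backing region; block/outer offsets are applied within it.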
+ let config = LayoutConfig::builder() + .num_blocks(2) + .num_layers(2) + .outer_dim(2) + .page_size(16) + .inner_dim(128) + .dtype_width_bytes(2) + .build() + .unwrap(); + + let per_layer_size = 2 * 2 * 16 * 128 * 2; + let memory: Vec> = (0..2) + .map(|i| { + MockMemory::new(0x1000 + i * per_layer_size, per_layer_size) + as Arc + }) + .collect(); + + let layout = + LayerSeparateLayout::new(config, memory, BlockDimension::BlockIsFirstDim).unwrap(); + + // Test accessing specific memory regions + let region_size = 16 * 128 * 2; + + // Block 0, Layer 0, Outer 0 - should be at layer 0's base address + let region = layout.memory_region(0, 0, 0).unwrap(); + assert_eq!(region.addr, 0x1000); + assert_eq!(region.size, region_size); + + // Block 0, Layer 1, Outer 0 - should be at layer 1's base address + let region = layout.memory_region(0, 1, 0).unwrap(); + assert_eq!(region.addr, 0x1000 + per_layer_size); + assert_eq!(region.size, region_size); + + // Block 0, Layer 0, Outer 1 - should be offset within layer 0 + let region = layout.memory_region(0, 0, 1).unwrap(); + assert_eq!(region.addr, 0x1000 + region_size); + assert_eq!(region.size, region_size); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/layout/mod.rs b/lib/llm/src/block_manager/v2/physical/layout/mod.rs new file mode 100644 index 0000000000..1b125a41b0 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/layout/mod.rs @@ -0,0 +1,135 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Decoupled layout system for block management. +//! +//! This module provides a simplified layout abstraction that: +//! - Maps block IDs to physical memory regions (address + size) +//! - Decouples memory regions from storage type information +//! - Specifies allocation requirements without performing allocation +//! - Uses trait objects for memory ownership + +pub(crate) mod builder; + +mod config; +mod fully_contiguous; +mod layer_separate; +mod physical; +mod serialize; +mod validation; + +#[cfg(test)] +pub(super) mod tests; + +// #[cfg(test)] +// mod integration_tests; + +pub use builder::{LayoutKind, PhysicalLayoutBuilder}; +pub use config::{BlockDimension, LayoutConfig}; +pub use fully_contiguous::FullyContiguousLayout; +pub use layer_separate::LayerSeparateLayout; +pub use physical::{NixlMetadata, PhysicalLayout}; +pub use serialize::{ + BlockFormat, FullyContiguousDetails, LayerSeparateDetails, LayoutDescriptor, LayoutTypeDetails, +}; +pub use validation::{TensorFormat, validate_tensor_shapes, validate_tensor_strides}; + +// mod registration; +// pub use registration::{RegisteredLayout, RegisteredStorageMetadata, RegistrationManager}; + +use anyhow::Result; +use serde::{Deserialize, Serialize}; + +pub use crate::block_manager::v2::memory::{MemoryDescriptor, MemoryRegion, OwnedMemoryRegion}; + +/// Core layout trait for mapping block IDs to memory regions. +/// +/// Layouts specify how KV cache blocks are organized in memory without +/// performing allocation themselves. They provide: +/// - Memory region lookup for specific blocks +/// - Allocation requirements for external allocators +/// - Metadata about block organization +pub trait Layout: Send + Sync + std::fmt::Debug { + /// Get the configuration for this layout. + fn config(&self) -> &LayoutConfig; + + /// Get the root memory regions backing this layout. + /// + /// These regions correspond to the concrete allocations that store the layout's data. 
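The expectations in `test_memory_region` follow directly from the layer-separate addressing scheme. The sketch below spells out that arithmetic as plain functions; the stride derivation (`outer_stride = region_size`, `block_stride = outer_dim * region_size`) is an assumption that mirrors what `calculate_address` appears to do for `BlockIsFirstDim`, and the constants are the ones used in the test.

```rust
// Standalone sketch of the layer-separate ("block is first dim") address math the test
// above exercises. Stride names mirror the fields used by calculate_address; the
// concrete numbers come from test_memory_region and are assumptions here.
fn layer_separate_addr(
    layer_base: usize, // base address of this layer's allocation
    block_id: usize,
    outer_id: usize,
    outer_dim: usize,
    region_size: usize, // page_size * inner_dim * dtype_width_bytes
) -> usize {
    let outer_stride = region_size;              // one contiguous (page, inner) chunk
    let block_stride = outer_dim * region_size;  // one block spans all outer entries
    layer_base + block_id * block_stride + outer_id * outer_stride
}

fn main() {
    let region_size = 16 * 128 * 2;           // page_size * inner_dim * dtype bytes
    let per_layer_size = 2 * 2 * region_size;  // num_blocks * outer_dim * region_size
    let layer0 = 0x1000;
    let layer1 = 0x1000 + per_layer_size;

    assert_eq!(layer_separate_addr(layer0, 0, 0, 2, region_size), 0x1000);
    assert_eq!(layer_separate_addr(layer1, 0, 0, 2, region_size), 0x1000 + per_layer_size);
    assert_eq!(layer_separate_addr(layer0, 0, 1, 2, region_size), 0x1000 + region_size);
    println!("addresses match the expectations in test_memory_region");
}
```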
+ /// Implementations that derive memory procedurally can return an empty slice. + fn memory_regions(&self) -> &[OwnedMemoryRegion]; + + /// Get memory regions for a specific block_id, layer_id, outer_id. + /// + /// Returns a [MemoryRegion] for the continuous region specified by the given block_id, + /// layer_id, outer_id. + /// + /// # Arguments + /// * `block_id` - The ID of the block to query (0..num_blocks) + /// * `layer_id` - The ID of the layer to query (0..num_layers) + /// * `outer_id` - The ID of the outer dimension to query (0..outer_dim) + fn memory_region( + &self, + block_id: usize, + layer_id: usize, + outer_id: usize, + ) -> Result; + + /// Get the allocation requirements for this layout. + /// + /// Returns a vector of allocation sizes needed to back this layout. + /// For fully contiguous layouts, this will be a single size. + /// For layer-separate layouts, this will contain one size per layer. + /// + /// # Returns + /// Vector of allocation sizes in bytes. + fn required_allocations(&self) -> Vec; + + /// Check if this layout uses fully contiguous memory. + /// + /// Fully contiguous layouts have all blocks in a single allocation, + /// which enables certain optimizations. + fn is_fully_contiguous(&self) -> bool; + + /// Get the total number of blocks in this layout. + fn num_blocks(&self) -> usize; + + /// Get the number of layers per block. + fn num_layers(&self) -> usize; + + /// Get the outer dimension size. + /// + /// In typical KV cache layouts, this is often 2 (for K and V), + /// but can be 1 for architectures like MLA. + fn outer_dim(&self) -> usize; + + /// Get the page size (often corresponds to block size in tokens). + fn page_size(&self) -> usize; + + /// Get the inner dimension size. + /// + /// This is typically the hidden size divided by tensor parallel size. + fn inner_dim(&self) -> usize; + + /// Get the data type width in bytes. + fn dtype_width_bytes(&self) -> usize; + + /// Get serialization details for this layout type. + /// + /// This provides the layout-type-specific information needed to serialize + /// and reconstruct the layout on a remote node. + fn serialization_details(&self) -> serialize::LayoutTypeDetails; +} + +/// Inner shape format for tensor layout +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum InnerShape { + /// Unknown shape - fallback when we can't determine the format + Unknown, + /// NHD format: [block_size, num_heads, head_dim] + /// Common for attention layers where N=tokens, H=heads, D=dimension + NHD, + /// HND format: [num_heads, block_size, head_dim] + /// Alternative layout with heads first + HND, +} diff --git a/lib/llm/src/block_manager/v2/physical/layout/physical.rs b/lib/llm/src/block_manager/v2/physical/layout/physical.rs new file mode 100644 index 0000000000..886c791939 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/layout/physical.rs @@ -0,0 +1,290 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Physical layout types that combine abstract layouts with storage location metadata. 
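The `Layout` trait above is a pure lookup interface: given `(block_id, layer_id, outer_id)` it yields an address and size, and callers iterate those indices themselves. The self-contained sketch below shows that consumption pattern with local stand-ins (`Desc`, and a closure in place of a `&dyn Layout`); the fully contiguous ordering used in `main` is an assumption for illustration only.

```rust
use std::collections::BTreeMap;

// Minimal stand-ins for MemoryDescriptor and a Layout-like lookup, showing how a
// consumer enumerates every (block, layer, outer) region exposed by the trait.
#[derive(Debug, Clone, Copy)]
struct Desc {
    addr: usize,
    size: usize,
}

fn enumerate_regions(
    num_blocks: usize,
    num_layers: usize,
    outer_dim: usize,
    lookup: impl Fn(usize, usize, usize) -> Desc,
) -> BTreeMap<(usize, usize, usize), Desc> {
    let mut out = BTreeMap::new();
    for b in 0..num_blocks {
        for l in 0..num_layers {
            for o in 0..outer_dim {
                out.insert((b, l, o), lookup(b, l, o));
            }
        }
    }
    out
}

fn main() {
    // Assumed fully contiguous ordering: block -> layer -> outer, region_size bytes each.
    let region_size = 4 * 8 * 2;
    let (blocks, layers, outer) = (2, 2, 2);
    let regions = enumerate_regions(blocks, layers, outer, |b, l, o| Desc {
        addr: 0x1000 + ((b * layers + l) * outer + o) * region_size,
        size: region_size,
    });
    assert_eq!(regions.len(), blocks * layers * outer);
    println!("first region: {:?}", regions[&(0, 0, 0)]);
}
```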
+ +use super::{ + FullyContiguousLayout, LayerSeparateLayout, Layout, MemoryDescriptor, + builder::{PhysicalLayoutBuilder, PhysicalLayoutBuilderDefault}, + serialize::{LayoutDescriptor, LayoutTypeDetails}, +}; + +use crate::block_manager::v2::memory::{MemoryRegion, StorageKind}; +use anyhow::{Result, anyhow}; +use serde::{Deserialize, Serialize}; +use std::any::Any; +use std::sync::Arc; + +use crate::block_manager::v2::physical::transfer::nixl_agent::NixlAgent; + +/// Runtime representation of a layout with its physical storage location. +/// +/// A `PhysicalLayout` wraps an abstract [`Layout`] with information about where +/// its memory physically resides (GPU, host, disk) and whether it's local or remote. +/// This enables the transfer system to select appropriate copy strategies and build +/// NIXL transfer descriptors. +#[derive(Debug, Clone)] +pub struct PhysicalLayout { + /// The abstract layout defining memory organization + layout: Arc, + + /// Physical storage location (System, Device, Pinned, Disk) + location: StorageKind, + + /// NIXL registration metadata + nixl_metadata: NixlMetadata, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct NixlMetadata { + agent_name: String, + mem_type: nixl_sys::MemType, + device_id: u64, +} + +impl NixlMetadata { + pub fn new(agent_name: String, mem_type: nixl_sys::MemType, device_id: u64) -> Self { + Self { + agent_name, + mem_type, + device_id, + } + } + + pub fn agent_name(&self) -> &str { + &self.agent_name + } + + pub fn mem_type(&self) -> nixl_sys::MemType { + self.mem_type + } + + pub fn device_id(&self) -> u64 { + self.device_id + } +} + +impl PhysicalLayout { + /// Create a typed builder that enforces NIXL registration. + pub fn builder(agent: NixlAgent) -> PhysicalLayoutBuilderDefault { + PhysicalLayoutBuilder::new(agent) + } + + /// Create a new local physical layout. + /// + /// # Arguments + /// * `layout` - The abstract layout to wrap + /// * `location` - Where the layout's memory resides + pub(crate) fn new_local( + layout: Arc, + location: StorageKind, + nixl_metadata: NixlMetadata, + ) -> Self { + Self { + layout, + location, + nixl_metadata, + } + } + + // /// Create a new remote physical layout from a descriptor. + // /// + // /// # Arguments + // /// * `layout` - The abstract layout to wrap + // /// * `location` - Where the layout's memory resides (on remote node) + // /// * `remote_agent` - Name of the NIXL agent on the remote node + // pub fn new_remote( + // layout: Arc, + // location: StorageKind, + // remote_agent: String, + // ) -> Self { + // let metadata = NixlMetadata::new( + // remote_agent.clone(), + // location.to_nixl_mem_type(), + // location.device_id(), + // ); + // let registrations = vec![RegisteredStorageMetadata::new( + // metadata.agent_name().to_string(), + // location, + // )]; + // Self { + // layout, + // location, + // locality: Locality::Remote(remote_agent), + // nixl_metadata: Some(metadata), + // registered: registrations, + // } + // } + + /// Get the underlying layout. + pub fn layout(&self) -> &Arc { + &self.layout + } + + /// Get the storage location. + pub fn location(&self) -> StorageKind { + self.location + } + + /// Get the NIXL metadata. + pub fn nixl_metadata(&self) -> &NixlMetadata { + &self.nixl_metadata + } + + /// Get a memory region with location information. 
+ /// + /// # Arguments + /// * `block_id` - Block identifier + /// * `layer_id` - Layer identifier + /// * `outer_id` - Outer dimension identifier + pub fn memory_region( + &self, + block_id: usize, + layer_id: usize, + outer_id: usize, + ) -> Result { + self.layout.memory_region(block_id, layer_id, outer_id) + } + + /// Serialize this physical layout for transmission to remote nodes. + /// + /// This converts the runtime `PhysicalLayout` into a `LayoutDescriptor` that + /// contains all information needed to reconstruct the layout on a remote node, + /// including layout configuration, memory descriptors, NIXL metadata, and + /// layout-type-specific details. + /// + /// # Returns + /// A serializable representation of this layout + pub fn to_descriptor(&self) -> Result { + // Extract memory descriptors + let memory_descriptors = self + .layout + .memory_regions() + .iter() + .map(|region| MemoryDescriptor { + addr: region.addr(), + size: region.size(), + }) + .collect(); + + // Get layout type details from the layout itself + let layout_type_details = self.layout.serialization_details(); + + Ok(LayoutDescriptor { + version: LayoutDescriptor::CURRENT_VERSION, + layout_config: self.layout.config().clone(), + location: self.location, + nixl_metadata: self.nixl_metadata.clone(), + memory_descriptors, + layout_type_details, + }) + } + + /// Reconstruct a physical layout from serialized data received from a remote node. + /// + /// This creates a new `PhysicalLayout` from a `LayoutDescriptor`. The reconstructed + /// layout will have memory descriptors that point to the remote node's memory, + /// allowing NIXL to build RDMA descriptors for remote access. + /// + /// # Arguments + /// * `serialized` - Serialized layout data from a remote node + /// + /// # Returns + /// A new `PhysicalLayout` representing the remote layout + /// + /// # Note + /// The memory regions in the reconstructed layout are not valid for local access; + /// they represent remote memory addresses and are used to build NIXL transfer descriptors. + pub fn from_descriptor(serialized: LayoutDescriptor) -> Result { + // Validate version + if serialized.version > LayoutDescriptor::CURRENT_VERSION { + return Err(anyhow!( + "Unsupported serialization version: {}. 
Maximum supported: {}", + serialized.version, + LayoutDescriptor::CURRENT_VERSION + )); + } + + // Create remote memory regions from descriptors + let remote_regions: Vec> = serialized + .memory_descriptors + .iter() + .map(|desc| { + Arc::new(RemoteMemoryDescriptor { + addr: desc.addr, + size: desc.size, + storage_kind: serialized.location, + }) as Arc + }) + .collect(); + + // Reconstruct the layout based on type + let layout: Arc = match serialized.layout_type_details { + LayoutTypeDetails::FullyContiguous(details) => { + if remote_regions.len() != 1 { + return Err(anyhow!( + "FullyContiguous layout requires exactly 1 memory region, got {}", + remote_regions.len() + )); + } + let layout = FullyContiguousLayout::new_with_format( + serialized.layout_config.clone(), + remote_regions[0].clone(), + details.block_format, + )?; + Arc::new(layout) + } + LayoutTypeDetails::LayerSeparate(details) => { + if remote_regions.len() != serialized.layout_config.num_layers { + return Err(anyhow!( + "LayerSeparate layout requires {} memory regions (one per layer), got {}", + serialized.layout_config.num_layers, + remote_regions.len() + )); + } + let layout = LayerSeparateLayout::new( + serialized.layout_config.clone(), + remote_regions, + details.block_dim, + )?; + Arc::new(layout) + } + }; + + Ok(Self { + layout, + location: serialized.location, + nixl_metadata: serialized.nixl_metadata, + }) + } +} + +/// A memory region that represents remote memory addresses. +/// +/// This type is used when reconstructing layouts from serialized data. +/// The addresses are not valid for local access but can be used to +/// build NIXL transfer descriptors for remote memory access. +#[derive(Debug)] +struct RemoteMemoryDescriptor { + addr: usize, + size: usize, + storage_kind: StorageKind, +} + +impl MemoryRegion for RemoteMemoryDescriptor { + fn addr(&self) -> usize { + self.addr + } + + fn size(&self) -> usize { + self.size + } + + fn storage_kind(&self) -> StorageKind { + self.storage_kind + } + + fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/lib/llm/src/block_manager/v2/physical/layout/serialize.rs b/lib/llm/src/block_manager/v2/physical/layout/serialize.rs new file mode 100644 index 0000000000..997742a075 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/layout/serialize.rs @@ -0,0 +1,268 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Serialization types for physical layouts. +//! +//! This module provides types for serializing and deserializing physical layouts +//! so they can be transmitted to remote nodes and reconstructed there for RDMA operations. + +use super::physical::NixlMetadata; +use super::{BlockDimension, LayoutConfig}; +use crate::block_manager::v2::memory::{MemoryDescriptor, StorageKind}; +use anyhow::Result; +use serde::{Deserialize, Serialize}; + +/// Format of blocks in a fully contiguous layout. +/// +/// This enum describes how the blocks are organized and formatted in memory. +/// Currently only `Operational` is supported, but future variants may include +/// different compression schemes or memory layouts. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum BlockFormat { + /// Standard operational format - blocks are stored in their normal, uncompressed form. + Operational, +} + +impl Default for BlockFormat { + fn default() -> Self { + Self::Operational + } +} + +/// Details specific to fully contiguous layouts. 
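Reconstruction in `from_descriptor` is essentially version gating followed by a dispatch on the layout-type details, with a sanity check that the number of memory descriptors matches the layout type. The sketch below captures just that control flow with local stand-in types (`Descriptor`, `Details`); it is not the crate's API.

```rust
// Local stand-ins illustrating the from_descriptor control flow: reject unknown
// versions, then dispatch on the layout-type details and validate region counts.
#[derive(Debug)]
enum Details {
    FullyContiguous,
    LayerSeparate { num_layers: usize },
}

#[derive(Debug)]
struct Descriptor {
    version: u32,
    regions: usize,
    details: Details,
}

const CURRENT_VERSION: u32 = 1;

fn reconstruct(d: &Descriptor) -> Result<&'static str, String> {
    if d.version > CURRENT_VERSION {
        return Err(format!("unsupported serialization version {}", d.version));
    }
    match &d.details {
        Details::FullyContiguous if d.regions == 1 => Ok("fully contiguous"),
        Details::FullyContiguous => Err(format!("expected 1 region, got {}", d.regions)),
        Details::LayerSeparate { num_layers } if d.regions == *num_layers => Ok("layer separate"),
        Details::LayerSeparate { num_layers } => {
            Err(format!("expected {} regions, got {}", num_layers, d.regions))
        }
    }
}

fn main() {
    let ok = Descriptor { version: 1, regions: 4, details: Details::LayerSeparate { num_layers: 4 } };
    let bad = Descriptor { version: 2, regions: 1, details: Details::FullyContiguous };
    assert_eq!(reconstruct(&ok).unwrap(), "layer separate");
    assert!(reconstruct(&bad).is_err());
}
```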
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FullyContiguousDetails { + /// Format of the blocks in memory + pub block_format: BlockFormat, +} + +/// Details specific to layer-separate layouts. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LayerSeparateDetails { + /// Block dimension ordering (block-first or block-second) + pub block_dim: BlockDimension, +} + +/// Layout-type-specific details. +/// +/// This enum captures the information that differs between layout types +/// and is needed to reconstruct the layout on a remote node. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum LayoutTypeDetails { + /// Fully contiguous layout details + FullyContiguous(FullyContiguousDetails), + /// Layer-separate layout details + LayerSeparate(LayerSeparateDetails), +} + +/// Serializable representation of a physical layout. +/// +/// This structure contains all information needed to reconstruct a layout +/// on a remote node, including: +/// - Layout configuration (dimensions, sizes, etc.) +/// - Storage location and NIXL metadata +/// - Memory descriptors for all regions +/// - Layout-type-specific details +/// +/// The serialized form can be transmitted over the network and used to +/// build NIXL transfer descriptors for remote memory access. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LayoutDescriptor { + /// Serialization format version (for future compatibility) + pub version: u32, + + /// Layout configuration + pub layout_config: LayoutConfig, + + /// Storage location + pub location: StorageKind, + + /// NIXL metadata from the source node + pub nixl_metadata: NixlMetadata, + + /// Memory descriptors for all regions backing this layout + pub memory_descriptors: Vec, + + /// Layout-type-specific details + pub layout_type_details: LayoutTypeDetails, +} + +impl LayoutDescriptor { + /// Current serialization version + pub const CURRENT_VERSION: u32 = 1; + + /// Serialize this layout to a JSON string. + /// + /// # Returns + /// JSON string representation of the layout + pub fn to_json(&self) -> Result { + serde_json::to_string(self) + .map_err(|e| anyhow::anyhow!("failed to serialize layout to JSON: {}", e)) + } + + /// Serialize this layout to JSON bytes. + /// + /// # Returns + /// UTF-8 encoded JSON bytes + pub fn to_json_bytes(&self) -> Result> { + serde_json::to_vec(self) + .map_err(|e| anyhow::anyhow!("failed to serialize layout to JSON bytes: {}", e)) + } + + /// Deserialize a layout from a JSON string. + /// + /// # Arguments + /// * `json` - JSON string representation + /// + /// # Returns + /// Deserialized layout + pub fn from_json(json: &str) -> Result { + serde_json::from_str(json) + .map_err(|e| anyhow::anyhow!("failed to deserialize layout from JSON: {}", e)) + } + + /// Deserialize a layout from JSON bytes. + /// + /// # Arguments + /// * `bytes` - UTF-8 encoded JSON bytes + /// + /// # Returns + /// Deserialized layout + pub fn from_json_bytes(bytes: &[u8]) -> Result { + serde_json::from_slice(bytes) + .map_err(|e| anyhow::anyhow!("failed to deserialize layout from JSON bytes: {}", e)) + } + + /// Get the layout configuration. + pub fn layout_config(&self) -> &LayoutConfig { + &self.layout_config + } + + /// Get the storage location. + pub fn location(&self) -> StorageKind { + self.location + } + + /// Get the NIXL metadata from the source node. + pub fn nixl_metadata(&self) -> &NixlMetadata { + &self.nixl_metadata + } + + /// Get the memory descriptors. 
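The wire format used by the JSON helpers below is plain serde JSON. A minimal round trip with a stand-in struct looks like this; the field names are illustrative, not the real schema, and the sketch assumes `serde` and `serde_json`, which this module already depends on.

```rust
use serde::{Deserialize, Serialize};

// Tiny mirror of the JSON wire shape to show the round trip that to_json/from_json wrap.
#[derive(Debug, PartialEq, Serialize, Deserialize)]
struct MiniDescriptor {
    version: u32,
    num_blocks: usize,
    addr: usize,
    size: usize,
}

fn main() {
    let d = MiniDescriptor { version: 1, num_blocks: 10, addr: 0x1000, size: 4096 };
    let json = serde_json::to_string(&d).expect("serialize");
    let back: MiniDescriptor = serde_json::from_str(&json).expect("deserialize");
    assert_eq!(d, back);
    println!("{json}");
}
```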
+ pub fn memory_descriptors(&self) -> &[MemoryDescriptor] { + &self.memory_descriptors + } + + /// Get the layout type details. + pub fn layout_type_details(&self) -> &LayoutTypeDetails { + &self.layout_type_details + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_test_config() -> LayoutConfig { + LayoutConfig::builder() + .num_blocks(10) + .num_layers(4) + .outer_dim(2) + .page_size(16) + .inner_dim(128) + .dtype_width_bytes(2) + .build() + .unwrap() + } + + #[test] + fn test_block_format_default() { + assert_eq!(BlockFormat::default(), BlockFormat::Operational); + } + + #[test] + fn test_serialized_layout_json_roundtrip() { + let layout = LayoutDescriptor { + version: LayoutDescriptor::CURRENT_VERSION, + layout_config: make_test_config(), + location: StorageKind::System, + nixl_metadata: NixlMetadata::new("test_agent".to_string(), nixl_sys::MemType::Dram, 0), + memory_descriptors: vec![MemoryDescriptor::new(0x1000, 4096)], + layout_type_details: LayoutTypeDetails::FullyContiguous(FullyContiguousDetails { + block_format: BlockFormat::Operational, + }), + }; + + // Test to_json/from_json + let json = layout.to_json().unwrap(); + let deserialized = LayoutDescriptor::from_json(&json).unwrap(); + + assert_eq!(deserialized.version, layout.version); + assert_eq!(deserialized.layout_config, layout.layout_config); + assert_eq!(deserialized.location, layout.location); + assert_eq!( + deserialized.nixl_metadata.agent_name(), + layout.nixl_metadata.agent_name() + ); + assert_eq!(deserialized.memory_descriptors.len(), 1); + } + + #[test] + fn test_serialized_layout_json_bytes_roundtrip() { + let layout = LayoutDescriptor { + version: LayoutDescriptor::CURRENT_VERSION, + layout_config: make_test_config(), + location: StorageKind::System, + nixl_metadata: NixlMetadata::new("test_agent".to_string(), nixl_sys::MemType::Vram, 5), + memory_descriptors: vec![ + MemoryDescriptor::new(0x1000, 2048), + MemoryDescriptor::new(0x2000, 2048), + ], + layout_type_details: LayoutTypeDetails::LayerSeparate(LayerSeparateDetails { + block_dim: BlockDimension::BlockIsFirstDim, + }), + }; + + // Test to_json_bytes/from_json_bytes + let bytes = layout.to_json_bytes().unwrap(); + let deserialized = LayoutDescriptor::from_json_bytes(&bytes).unwrap(); + + assert_eq!(deserialized.version, layout.version); + assert_eq!(deserialized.nixl_metadata.device_id(), 5); + assert_eq!(deserialized.memory_descriptors.len(), 2); + } + + #[test] + fn test_fully_contiguous_details_serialization() { + let details = LayoutTypeDetails::FullyContiguous(FullyContiguousDetails { + block_format: BlockFormat::Operational, + }); + + let json = serde_json::to_string(&details).unwrap(); + let deserialized: LayoutTypeDetails = serde_json::from_str(&json).unwrap(); + + match deserialized { + LayoutTypeDetails::FullyContiguous(d) => { + assert_eq!(d.block_format, BlockFormat::Operational); + } + _ => panic!("Expected FullyContiguous variant"), + } + } + + #[test] + fn test_layer_separate_details_serialization() { + let details = LayoutTypeDetails::LayerSeparate(LayerSeparateDetails { + block_dim: BlockDimension::BlockIsSecondDim, + }); + + let json = serde_json::to_string(&details).unwrap(); + let deserialized: LayoutTypeDetails = serde_json::from_str(&json).unwrap(); + + match deserialized { + LayoutTypeDetails::LayerSeparate(d) => { + assert_eq!(d.block_dim, BlockDimension::BlockIsSecondDim); + } + _ => panic!("Expected LayerSeparate variant"), + } + } +} diff --git a/lib/llm/src/block_manager/v2/physical/layout/tests.rs 
b/lib/llm/src/block_manager/v2/physical/layout/tests.rs new file mode 100644 index 0000000000..f0c763a177 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/layout/tests.rs @@ -0,0 +1,367 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Integration tests for layout serialization. +//! +//! These tests verify the complete serialization and deserialization flow, +//! ensuring that layouts can be transmitted to remote nodes and reconstructed +//! with all necessary metadata intact. + +use crate::block_manager::v2::memory::{ + MemoryRegion, NixlDescriptor, OwnedMemoryRegion, StorageKind, +}; +use crate::block_manager::v2::physical::layout::physical::PhysicalLayout; +use crate::block_manager::v2::physical::layout::{BlockDimension, LayoutConfig, LayoutDescriptor}; +use crate::block_manager::v2::physical::transfer::nixl_agent::NixlAgent; +use std::any::Any; +use std::sync::Arc; + +// Simple mock implementation for testing +#[derive(Debug)] +pub struct MockMemory { + addr: usize, + size: usize, +} + +impl MockMemory { + pub fn new(addr: usize, size: usize) -> Arc { + Arc::new(Self { addr, size }) + } +} + +impl MemoryRegion for MockMemory { + fn addr(&self) -> usize { + self.addr + } + fn size(&self) -> usize { + self.size + } + fn storage_kind(&self) -> StorageKind { + StorageKind::System + } + fn as_any(&self) -> &dyn Any { + self + } +} + +/// Mock memory region for testing serialization +#[derive(Debug)] +struct TestMemoryRegion { + addr: usize, + size: usize, + kind: StorageKind, + descriptor: NixlDescriptor, +} + +impl TestMemoryRegion { + fn new(addr: usize, size: usize, kind: StorageKind) -> Arc { + Arc::new(Self { + addr, + size, + kind, + descriptor: NixlDescriptor { + addr: addr as u64, + size, + mem_type: nixl_sys::MemType::Dram, + device_id: 0, + }, + }) + } +} + +impl MemoryRegion for TestMemoryRegion { + fn addr(&self) -> usize { + self.addr + } + + fn size(&self) -> usize { + self.size + } + + fn storage_kind(&self) -> StorageKind { + self.kind + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn nixl_descriptor(&self) -> Option { + Some(self.descriptor.clone()) + } +} + +fn make_test_config() -> LayoutConfig { + LayoutConfig::builder() + .num_blocks(10) + .num_layers(4) + .outer_dim(2) + .page_size(16) + .inner_dim(128) + .dtype_width_bytes(2) + .build() + .unwrap() +} + +#[test] +fn test_fully_contiguous_layout_serialization_roundtrip() { + let agent = NixlAgent::require_backends("test-fc-serialize", &[]) + .expect("failed to create wrapped agent"); + let config = make_test_config(); + + // Calculate required size + let required_size = config.num_blocks + * config.num_layers + * config.outer_dim + * config.page_size + * config.inner_dim + * config.dtype_width_bytes; + + // Create test memory region + let memory = TestMemoryRegion::new(0x10000, required_size, StorageKind::System); + let regions = vec![memory as OwnedMemoryRegion]; + + // Build physical layout + let original_layout = PhysicalLayout::builder(agent) + .with_config(config.clone()) + .fully_contiguous() + .with_registered_regions(regions) + .expect("failed to provide regions") + .build() + .expect("failed to build layout"); + + // Serialize to LayoutDescriptor + let serialized = original_layout + .to_descriptor() + .expect("failed to serialize layout"); + + // Verify serialized data + assert_eq!(serialized.version, LayoutDescriptor::CURRENT_VERSION); + assert_eq!(serialized.layout_config, config); + 
assert_eq!(serialized.location, StorageKind::System); + assert_eq!(serialized.memory_descriptors.len(), 1); + assert_eq!(serialized.memory_descriptors[0].addr, 0x10000); + assert_eq!(serialized.memory_descriptors[0].size, required_size); + + // Serialize to JSON + let json = serialized.to_json().expect("failed to serialize to JSON"); + assert!(json.contains("\"version\":1")); + assert!(json.contains("\"num_blocks\":10")); + + // Deserialize from JSON + let deserialized = LayoutDescriptor::from_json(&json).expect("failed to deserialize from JSON"); + + // Verify deserialized matches original + assert_eq!(deserialized.version, serialized.version); + assert_eq!(deserialized.layout_config, serialized.layout_config); + assert_eq!(deserialized.location, serialized.location); + assert_eq!( + deserialized.memory_descriptors.len(), + serialized.memory_descriptors.len() + ); + + // Reconstruct layout from serialized data + let reconstructed = + PhysicalLayout::from_descriptor(deserialized).expect("failed to reconstruct layout"); + + // Verify reconstructed layout has same configuration + assert_eq!(reconstructed.layout().config(), &config); + assert_eq!(reconstructed.location(), StorageKind::System); + assert_eq!(reconstructed.layout().num_blocks(), 10); + assert_eq!(reconstructed.layout().num_layers(), 4); + assert!(reconstructed.layout().is_fully_contiguous()); +} + +#[test] +fn test_layer_separate_layout_serialization_roundtrip() { + let agent = NixlAgent::require_backends("test-ls-serialize", &[]) + .expect("failed to create wrapped agent"); + let config = make_test_config(); + + // Calculate per-layer size + let per_layer_size = config.num_blocks + * config.outer_dim + * config.page_size + * config.inner_dim + * config.dtype_width_bytes; + + // Create memory regions (one per layer) + let regions: Vec = (0..config.num_layers) + .map(|i| { + TestMemoryRegion::new( + 0x10000 + i * per_layer_size, + per_layer_size, + StorageKind::System, + ) as OwnedMemoryRegion + }) + .collect(); + + // Build physical layout + let original_layout = PhysicalLayout::builder(agent) + .with_config(config.clone()) + .layer_separate(BlockDimension::BlockIsFirstDim) + .with_registered_regions(regions) + .expect("failed to provide regions") + .build() + .expect("failed to build layout"); + + // Serialize to LayoutDescriptor + let serialized = original_layout + .to_descriptor() + .expect("failed to serialize layout"); + + // Verify serialized data + assert_eq!(serialized.version, LayoutDescriptor::CURRENT_VERSION); + assert_eq!(serialized.layout_config, config); + assert_eq!(serialized.memory_descriptors.len(), 4); // One per layer + + // Verify memory descriptors + for (i, desc) in serialized.memory_descriptors.iter().enumerate() { + assert_eq!(desc.addr, 0x10000 + i * per_layer_size); + assert_eq!(desc.size, per_layer_size); + } + + // Serialize to JSON bytes + let json_bytes = serialized + .to_json_bytes() + .expect("failed to serialize to JSON bytes"); + + // Deserialize from JSON bytes + let deserialized = LayoutDescriptor::from_json_bytes(&json_bytes) + .expect("failed to deserialize from JSON bytes"); + + // Verify deserialized matches original + assert_eq!(deserialized.version, serialized.version); + assert_eq!(deserialized.layout_config, serialized.layout_config); + assert_eq!( + deserialized.memory_descriptors.len(), + serialized.memory_descriptors.len() + ); + + // Reconstruct layout from serialized data + let reconstructed = + PhysicalLayout::from_descriptor(deserialized).expect("failed to reconstruct layout"); 
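The size arithmetic used throughout these tests comes straight from `required_allocations`: a fully contiguous layout needs one buffer covering the product of every dimension, while a layer-separate layout needs `num_layers` buffers that each drop the `num_layers` factor. A small stand-alone check of that relationship, using the same dimensions as `make_test_config`:

```rust
// Sketch of the allocation-size arithmetic the tests above rely on.
#[derive(Clone, Copy)]
struct Dims {
    num_blocks: usize,
    num_layers: usize,
    outer_dim: usize,
    page_size: usize,
    inner_dim: usize,
    dtype_width_bytes: usize,
}

fn fully_contiguous_bytes(d: Dims) -> usize {
    d.num_blocks * d.num_layers * d.outer_dim * d.page_size * d.inner_dim * d.dtype_width_bytes
}

fn layer_separate_bytes(d: Dims) -> Vec<usize> {
    let per_layer = d.num_blocks * d.outer_dim * d.page_size * d.inner_dim * d.dtype_width_bytes;
    vec![per_layer; d.num_layers]
}

fn main() {
    let d = Dims {
        num_blocks: 10,
        num_layers: 4,
        outer_dim: 2,
        page_size: 16,
        inner_dim: 128,
        dtype_width_bytes: 2,
    };
    let total = fully_contiguous_bytes(d);
    let per_layer = layer_separate_bytes(d);
    // Both strategies cover the same number of bytes overall.
    assert_eq!(total, per_layer.iter().sum::<usize>());
    println!("total = {total} bytes, per layer = {} bytes", per_layer[0]);
}
```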
+ + // Verify reconstructed layout has same configuration + assert_eq!(reconstructed.layout().config(), &config); + assert_eq!(reconstructed.location(), StorageKind::System); + assert_eq!(reconstructed.layout().num_blocks(), 10); + assert_eq!(reconstructed.layout().num_layers(), 4); + assert!(!reconstructed.layout().is_fully_contiguous()); +} + +#[test] +fn test_memory_region_calculation_after_deserialization() { + let agent = NixlAgent::require_backends("test-memory-calc", &[]) + .expect("failed to create wrapped agent"); + let config = LayoutConfig::builder() + .num_blocks(2) + .num_layers(2) + .outer_dim(2) + .page_size(4) + .inner_dim(8) + .dtype_width_bytes(2) + .build() + .unwrap(); + + let required_size = config.num_blocks + * config.num_layers + * config.outer_dim + * config.page_size + * config.inner_dim + * config.dtype_width_bytes; + + let memory = TestMemoryRegion::new(0x1000, required_size, StorageKind::System); + let regions = vec![memory as OwnedMemoryRegion]; + + let original_layout = PhysicalLayout::builder(agent) + .with_config(config.clone()) + .fully_contiguous() + .with_registered_regions(regions) + .expect("failed to provide regions") + .build() + .expect("failed to build layout"); + + // Serialize and deserialize + let serialized = original_layout + .to_descriptor() + .expect("failed to serialize"); + let reconstructed = PhysicalLayout::from_descriptor(serialized).expect("failed to reconstruct"); + + // Verify memory region calculations + let region = reconstructed + .memory_region(0, 0, 0) + .expect("failed to get memory region"); + assert_eq!(region.addr, 0x1000); + + let region_size = config.page_size * config.inner_dim * config.dtype_width_bytes; + assert_eq!(region.size, region_size); + + // Test different block/layer/outer indices + let region = reconstructed + .memory_region(1, 1, 1) + .expect("failed to get memory region"); + // Address should be: base + block_stride + layer_stride + outer_stride + let layer_stride = config.outer_dim * region_size; + let block_stride = config.num_layers * layer_stride; + let expected_addr = 0x1000 + block_stride + layer_stride + region_size; + assert_eq!(region.addr, expected_addr); +} + +#[test] +fn test_version_check_on_deserialization() { + let config = make_test_config(); + + // Calculate required size for fully contiguous layout + let required_size = config.num_blocks + * config.num_layers + * config.outer_dim + * config.page_size + * config.inner_dim + * config.dtype_width_bytes; + + let mut serialized = LayoutDescriptor { + version: 999, // Future version + layout_config: config.clone(), + location: StorageKind::System, + nixl_metadata: crate::block_manager::v2::physical::layout::physical::NixlMetadata::new( + "test".to_string(), + nixl_sys::MemType::Dram, + 0, + ), + memory_descriptors: vec![], + layout_type_details: + crate::block_manager::v2::physical::layout::LayoutTypeDetails::FullyContiguous( + crate::block_manager::v2::physical::layout::FullyContiguousDetails { + block_format: + crate::block_manager::v2::physical::layout::BlockFormat::Operational, + }, + ), + }; + + // Should fail with unsupported version + let result = PhysicalLayout::from_descriptor(serialized.clone()); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Unsupported serialization version") + ); + + // Should succeed with supported version + serialized.version = LayoutDescriptor::CURRENT_VERSION; + serialized.memory_descriptors = vec![crate::block_manager::v2::memory::MemoryDescriptor::new( + 0x1000, + 
required_size, + )]; + let result = PhysicalLayout::from_descriptor(serialized); + if let Err(ref e) = result { + eprintln!("Error during deserialization: {}", e); + } + assert!( + result.is_ok(), + "Expected successful deserialization, got error: {:?}", + result.err() + ); +} diff --git a/lib/llm/src/block_manager/v2/physical/layout/validation.rs b/lib/llm/src/block_manager/v2/physical/layout/validation.rs new file mode 100644 index 0000000000..126c7299f1 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/layout/validation.rs @@ -0,0 +1,122 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Tensor validation utilities for layout creation. + +use anyhow::{Result, anyhow}; +use std::sync::Arc; + +use crate::block_manager::v2::memory::TorchTensor; + +/// Format of tensor layout (for future TP translation). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TensorFormat { + /// NHD format: [N, H, D] where N=block_size, H=heads, D=hidden + NHD, + /// HND format: [H, N, D] where H=heads, N=block_size, D=hidden + HND, + /// Unknown or ambiguous format + Unknown, +} + +/// Validate tensor strides and detect format. +/// +/// This function checks that tensor strides are monotonically decreasing, +/// which ensures tensor-contiguous layout. The stride validation is flexible +/// at the inner dimension boundary to accommodate different layouts. +/// +/// Additionally, it attempts to detect whether the layout is NHD or HND format, +/// which is important for future tensor parallel (TP) translation. +/// +/// # Arguments +/// * `tensors` - Slice of tensors to validate +/// +/// # Returns +/// The detected tensor format (NHD, HND, or Unknown) +pub fn validate_tensor_strides(tensors: &[Arc]) -> Result { + if tensors.is_empty() { + return Err(anyhow!("Cannot validate empty tensor list")); + } + + let mut format = TensorFormat::Unknown; + + for tensor in tensors { + let stride = tensor.stride(); + let shape = tensor.shape(); + + if stride.len() < 2 { + return Err(anyhow!( + "Tensor must have at least 2 dimensions, got stride: {:?}", + stride + )); + } + + // Check monotonic decreasing stride + // Note: We're flexible at the combined inner dimension boundary as per requirements + let mut prev_stride = usize::MAX; + for (i, ¤t_stride) in stride.iter().enumerate() { + if current_stride > prev_stride { + return Err(anyhow!( + "Tensor strides must be monotonically decreasing (until inner dimension). \ + Got stride: {:?} at position {}", + stride, + i + )); + } + prev_stride = current_stride; + } + + // Attempt to detect NHD vs HND format based on shape and stride patterns + // This is a heuristic and may need refinement based on actual usage + if shape.len() >= 3 { + // If the first dimension stride is smaller than the second, likely HND + // If the first dimension stride is larger than the second, likely NHD + if stride[0] < stride[1] { + format = TensorFormat::HND; + } else if stride[0] > stride[1] { + format = TensorFormat::NHD; + } + } + } + + Ok(format) +} + +/// Validate that all tensors have consistent shapes. +/// +/// # Arguments +/// * `tensors` - Slice of tensors to validate +/// +/// # Returns +/// The common shape shared by all tensors +pub fn validate_tensor_shapes(tensors: &[Arc]) -> Result> { + if tensors.is_empty() { + return Err(anyhow!("Cannot validate empty tensor list")); + } + + let first_shape = tensors[0].shape(); + + for tensor in &tensors[1..] 
{ + if tensor.shape() != first_shape { + return Err(anyhow!( + "All tensors must have the same shape. Expected {:?}, got {:?}", + first_shape, + tensor.shape() + )); + } + } + + Ok(first_shape) +} + +#[allow(dead_code)] +pub fn determine_compressed_shape(shape: &[usize]) -> usize { + shape.iter().product() +} + +#[cfg(test)] +mod tests { + + // Note: These tests would require mock TorchTensor implementations + // which we can add if needed for testing infrastructure +} diff --git a/lib/llm/src/block_manager/v2/physical/manager/handle.rs b/lib/llm/src/block_manager/v2/physical/manager/handle.rs new file mode 100644 index 0000000000..25bd013227 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/manager/handle.rs @@ -0,0 +1,115 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Layout handle type encoding worker ID and layout ID. + +use bincode::{Decode, Encode}; + +/// Unique handle for a layout combining worker_id and layout_id. +/// +/// The handle encodes: +/// - Bits 0-63: worker_id (u64) +/// - Bits 64-79: layout_id (u16) +/// - Bits 80-127: Reserved (48 bits, currently unused) +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Encode, Decode)] +pub struct LayoutHandle(u128); + +impl LayoutHandle { + /// Create a new layout handle from worker_id and layout_id. + /// + /// # Arguments + /// * `worker_id` - Unique identifier for the worker (0-63 bits) + /// * `layout_id` - Layout identifier within the worker (64-79 bits) + pub fn new(worker_id: u64, layout_id: u16) -> Self { + let handle = (worker_id as u128) | ((layout_id as u128) << 64); + Self(handle) + } + + /// Extract the worker_id from this handle. + pub fn worker_id(&self) -> u64 { + (self.0 & 0xFFFF_FFFF_FFFF_FFFF) as u64 + } + + /// Extract the layout_id from this handle. + pub fn layout_id(&self) -> u16 { + ((self.0 >> 64) & 0xFFFF) as u16 + } + + /// Get the raw u128 value. + pub fn as_u128(&self) -> u128 { + self.0 + } + + /// Create a handle from a raw u128 value. 
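The stride rules in `validate_tensor_strides` reduce to two ideas: strides must not increase from the outermost dimension inwards, and the relative size of the first two strides is used as a heuristic for NHD versus HND. The stand-alone sketch below mirrors those checks with plain slices instead of the crate's `TorchTensor` trait; the example strides are assumptions.

```rust
// Pure-std stand-in for the stride validation: reject increasing strides, then guess
// the format from the first two strides, in the same order as the original checks.
#[derive(Debug, PartialEq)]
enum Format {
    Nhd,
    Hnd,
    Unknown,
}

fn check_strides(stride: &[usize]) -> Result<Format, String> {
    if stride.len() < 2 {
        return Err(format!("need at least 2 dimensions, got stride {stride:?}"));
    }
    if stride.windows(2).any(|w| w[1] > w[0]) {
        return Err(format!("strides must be non-increasing, got {stride:?}"));
    }
    Ok(if stride.len() >= 3 {
        match stride[0].cmp(&stride[1]) {
            std::cmp::Ordering::Greater => Format::Nhd,
            std::cmp::Ordering::Less => Format::Hnd,
            std::cmp::Ordering::Equal => Format::Unknown,
        }
    } else {
        Format::Unknown
    })
}

fn main() {
    // Row-major [N=16, H=8, D=128] tensor: stride[0] > stride[1], so the heuristic says NHD.
    assert_eq!(check_strides(&[8 * 128, 128, 1]).unwrap(), Format::Nhd);
    // Increasing strides are rejected before any format detection runs.
    assert!(check_strides(&[1, 128, 8 * 128]).is_err());
}
```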
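The bit layout documented on `LayoutHandle` (worker_id in bits 0-63, layout_id in bits 64-79, upper 48 bits reserved) can be checked with plain integer arithmetic. This sketch mirrors `new`, `worker_id`, and `layout_id` outside the type:

```rust
// Standalone check of the documented handle bit layout.
fn pack(worker_id: u64, layout_id: u16) -> u128 {
    (worker_id as u128) | ((layout_id as u128) << 64)
}

fn worker_id(handle: u128) -> u64 {
    (handle & 0xFFFF_FFFF_FFFF_FFFF) as u64
}

fn layout_id(handle: u128) -> u16 {
    ((handle >> 64) & 0xFFFF) as u16
}

fn main() {
    let h = pack(0x1234_5678_9ABC_DEF0, 0x4242);
    assert_eq!(worker_id(h), 0x1234_5678_9ABC_DEF0);
    assert_eq!(layout_id(h), 0x4242);
    assert_eq!(h >> 80, 0, "reserved bits stay zero");
    println!("handle = {h:#034x}");
}
```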
+ pub fn from_u128(value: u128) -> Self { + Self(value) + } +} + +impl std::fmt::Display for LayoutHandle { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "LayoutHandle(worker={}, layout={})", + self.worker_id(), + self.layout_id() + ) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_handle_encoding() { + let worker_id = 0x1234_5678_9ABC_DEF0u64; + let layout_id = 0x4242u16; + + let handle = LayoutHandle::new(worker_id, layout_id); + + assert_eq!(handle.worker_id(), worker_id); + assert_eq!(handle.layout_id(), layout_id); + } + + #[test] + fn test_handle_roundtrip() { + let handle = LayoutHandle::new(42, 100); + let raw = handle.as_u128(); + let restored = LayoutHandle::from_u128(raw); + + assert_eq!(handle, restored); + assert_eq!(restored.worker_id(), 42); + assert_eq!(restored.layout_id(), 100); + } + + #[test] + fn test_handle_max_values() { + let max_worker = u64::MAX; + let max_layout = u16::MAX; + + let handle = LayoutHandle::new(max_worker, max_layout); + + assert_eq!(handle.worker_id(), max_worker); + assert_eq!(handle.layout_id(), max_layout); + } + + #[test] + fn test_handle_bincode_roundtrip() { + let handle = LayoutHandle::new(999, 42); + + let encoded = bincode::encode_to_vec(handle, bincode::config::standard()).unwrap(); + let (decoded, _): (LayoutHandle, _) = + bincode::decode_from_slice(&encoded, bincode::config::standard()).unwrap(); + + assert_eq!(handle, decoded); + } + + #[test] + fn test_handle_display() { + let handle = LayoutHandle::new(123, 456); + let display = format!("{}", handle); + assert!(display.contains("123")); + assert!(display.contains("456")); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/manager/local.rs b/lib/llm/src/block_manager/v2/physical/manager/local.rs new file mode 100644 index 0000000000..8157b3671c --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/manager/local.rs @@ -0,0 +1,119 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Local layout wrapper with handle and metadata. + +use std::ops::Deref; + +use super::handle::LayoutHandle; +use crate::block_manager::v2::physical::layout::PhysicalLayout; + +/// A local physical layout with an assigned handle. +/// +/// This wraps a `PhysicalLayout` that exists on the local worker, +/// associating it with a unique handle that combines the worker_id +/// and a locally-assigned layout_id. +/// +/// This type is cheap to clone as `PhysicalLayout` contains `Arc` internally. +#[derive(Debug, Clone)] +pub struct LocalLayout { + handle: LayoutHandle, + layout: PhysicalLayout, +} + +#[allow(dead_code)] +impl LocalLayout { + /// Create a new local layout. + /// + /// # Arguments + /// * `handle` - Unique handle for this layout + /// * `layout` - The physical layout + pub fn new(handle: LayoutHandle, layout: PhysicalLayout) -> Self { + Self { handle, layout } + } + + /// Get the handle for this layout. + pub fn handle(&self) -> LayoutHandle { + self.handle + } + + /// Get a reference to the physical layout. + pub fn layout(&self) -> &PhysicalLayout { + &self.layout + } + + /// Get the worker_id from the handle. + pub fn worker_id(&self) -> u64 { + self.handle.worker_id() + } + + /// Get the layout_id from the handle. + pub fn layout_id(&self) -> u16 { + self.handle.layout_id() + } + + /// Consume this local layout and return the physical layout. 
+ pub fn into_layout(self) -> PhysicalLayout { + self.layout + } +} + +impl Deref for LocalLayout { + type Target = PhysicalLayout; + + fn deref(&self) -> &Self::Target { + &self.layout + } +} + +#[cfg(all(test, feature = "testing-nixl"))] +mod tests { + use super::*; + use crate::block_manager::v2::physical::layout::{LayoutConfig, PhysicalLayout}; + use crate::block_manager::v2::physical::transfer::nixl_agent::NixlAgent; + + fn create_test_agent(name: &str) -> NixlAgent { + NixlAgent::require_backends(name, &[]).expect("failed to create wrapped agent") + } + + fn make_test_layout() -> PhysicalLayout { + let agent = create_test_agent("test-local"); + let config = LayoutConfig::builder() + .num_blocks(2) + .num_layers(2) + .outer_dim(2) + .page_size(4) + .inner_dim(8) + .dtype_width_bytes(2) + .build() + .unwrap(); + + PhysicalLayout::builder(agent) + .with_config(config) + .fully_contiguous() + .allocate_system() + .build() + .unwrap() + } + + #[test] + fn test_local_layout_creation() { + let handle = LayoutHandle::new(42, 100); + let layout = make_test_layout(); + let local = LocalLayout::new(handle, layout); + + assert_eq!(local.handle(), handle); + assert_eq!(local.worker_id(), 42); + assert_eq!(local.layout_id(), 100); + } + + #[test] + fn test_local_layout_into_layout() { + let handle = LayoutHandle::new(1, 2); + let layout = make_test_layout(); + let local = LocalLayout::new(handle, layout); + + let _recovered = local.into_layout(); + // Successfully consumed and returned the layout + } +} diff --git a/lib/llm/src/block_manager/v2/physical/manager/metadata.rs b/lib/llm/src/block_manager/v2/physical/manager/metadata.rs new file mode 100644 index 0000000000..a64144e71f --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/manager/metadata.rs @@ -0,0 +1,239 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Serialization types for exporting/importing layout metadata with NIXL integration. + +use super::handle::LayoutHandle; +use crate::block_manager::v2::physical::layout::LayoutDescriptor; +use anyhow::Result; +use bincode::{Decode, Encode}; +use bytes::Bytes; + +/// Worker identification combining worker_id and NIXL agent name. +#[derive(Debug, Clone, Encode, Decode, PartialEq, Eq)] +pub struct WorkerAddress { + /// Unique identifier for this worker + pub worker_id: u64, + /// NIXL agent name on this worker + pub nixl_agent_name: String, +} + +impl WorkerAddress { + /// Create a new worker address. + pub fn new(worker_id: u64, nixl_agent_name: String) -> Self { + Self { + worker_id, + nixl_agent_name, + } + } +} + +/// Local layout descriptor with its assigned handle from the TransportManager. +#[derive(Debug, Clone, Encode, Decode)] +pub struct LocalLayoutDescriptor { + /// Unique handle for this layout + pub handle: LayoutHandle, + /// Serialized layout data (uses Serde, bridged via bincode) + #[bincode(with_serde)] + pub layout: LayoutDescriptor, +} + +impl LocalLayoutDescriptor { + /// Create a new serialized layout with handle. + pub fn new(handle: LayoutHandle, layout: LayoutDescriptor) -> Self { + Self { handle, layout } + } +} + +/// The set of [`LocalLayoutDescriptor`] that are RDMA enabled. This object packages the detail +/// about the layouts and the NIXL RDMA metadata required to reconstruct the layouts and access +/// the memory via NIXL RDMA. 
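The metadata types that follow mix the two serialization worlds: the outer structs derive bincode's `Encode`/`Decode`, while the serde-only `LayoutDescriptor` field rides along via `#[bincode(with_serde)]`. A minimal, self-contained version of that bridging pattern, assuming bincode 2.x with its `serde` and `derive` features enabled (which the lockfile suggests):

```rust
use bincode::{Decode, Encode};
use serde::{Deserialize, Serialize};

// A serde-only payload, standing in for a type like LayoutDescriptor.
#[derive(Debug, PartialEq, Serialize, Deserialize)]
struct SerdeOnlyPayload {
    name: String,
    size: usize,
}

// The outer type uses bincode derives and bridges the serde field with with_serde.
#[derive(Debug, PartialEq, Encode, Decode)]
struct Wrapper {
    id: u64,
    #[bincode(with_serde)]
    payload: SerdeOnlyPayload,
}

fn main() {
    let original = Wrapper {
        id: 7,
        payload: SerdeOnlyPayload { name: "layer".into(), size: 4096 },
    };
    let bytes = bincode::encode_to_vec(&original, bincode::config::standard())
        .expect("encode should succeed");
    let (decoded, _len): (Wrapper, usize) =
        bincode::decode_from_slice(&bytes, bincode::config::standard())
            .expect("decode should succeed");
    assert_eq!(original, decoded);
}
```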
+#[derive(Debug, Encode, Decode)] +pub struct RdmaLayoutDescriptors { + /// Worker identification + pub worker_address: WorkerAddress, + /// Exported NIXL metadata from nixl_sys::Agent::get_local_md() + pub nixl_metadata: Vec, + /// Serialized layouts (handle + layout data) + pub layouts: Vec, +} + +/// Managed memory metadata package for export/import. +/// +/// This is the wire format for transmitting layout metadata between workers. +/// It contains everything needed to reconstruct remote layouts and load their +/// NIXL registration data. +pub struct SerializedLayout(Bytes); + +impl SerializedLayout { + /// Pack metadata into a serialized form. + /// + /// # Arguments + /// * `worker_address` - Worker identification + /// * `nixl_metadata` - NIXL metadata blob from get_local_md() + /// * `layouts` - Vector of layouts with handles to export + /// + /// # Returns + /// Packed metadata ready for transmission + pub fn pack( + worker_address: WorkerAddress, + nixl_metadata: Vec, + layouts: Vec, + ) -> Result { + let inner = RdmaLayoutDescriptors { + worker_address, + nixl_metadata, + layouts, + }; + let bytes = bincode::encode_to_vec(&inner, bincode::config::standard()) + .map_err(|e| anyhow::anyhow!("failed to encode managed memory metadata: {}", e))?; + Ok(Self(Bytes::from(bytes))) + } + + /// Unpack metadata from serialized form. + /// + /// # Returns + /// Unpacked metadata structure + pub fn unpack(&self) -> Result { + let (inner, _) = bincode::decode_from_slice(&self.0, bincode::config::standard()) + .map_err(|e| anyhow::anyhow!("failed to decode managed memory metadata: {}", e))?; + Ok(inner) + } + + /// Get the raw bytes. + pub fn as_bytes(&self) -> &Bytes { + &self.0 + } + + /// Create from raw bytes. + pub fn from_bytes(bytes: Bytes) -> Self { + Self(bytes) + } + + /// Get the size in bytes. + pub fn len(&self) -> usize { + self.0.len() + } + + /// Check if empty. 
+ pub fn is_empty(&self) -> bool { + self.0.is_empty() + } +} + +impl std::fmt::Debug for SerializedLayout { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SerializedLayout") + .field("size_bytes", &self.len()) + .finish() + } +} + +#[cfg(all(test, feature = "testing-nixl"))] +mod tests { + use super::*; + use crate::block_manager::v2::memory::{MemoryDescriptor, StorageKind}; + use crate::block_manager::v2::physical::layout::{ + BlockFormat, FullyContiguousDetails, LayoutConfig, LayoutDescriptor, LayoutTypeDetails, + NixlMetadata, + }; + + fn make_test_serialized_layout() -> LayoutDescriptor { + let config = LayoutConfig::builder() + .num_blocks(2) + .num_layers(2) + .outer_dim(2) + .page_size(4) + .inner_dim(8) + .dtype_width_bytes(2) + .build() + .unwrap(); + + LayoutDescriptor { + version: 1, + layout_config: config, + location: StorageKind::System, + nixl_metadata: NixlMetadata::new("test".to_string(), nixl_sys::MemType::Dram, 0), + memory_descriptors: vec![MemoryDescriptor::new(0x1000, 4096)], + layout_type_details: LayoutTypeDetails::FullyContiguous(FullyContiguousDetails { + block_format: BlockFormat::Operational, + }), + } + } + + #[test] + fn test_worker_address() { + let addr = WorkerAddress::new(42, "test_agent".to_string()); + assert_eq!(addr.worker_id, 42); + assert_eq!(addr.nixl_agent_name, "test_agent"); + } + + #[test] + fn test_serialized_layout_with_handle() { + let handle = LayoutHandle::new(1, 2); + let layout = make_test_serialized_layout(); + let with_handle = LocalLayoutDescriptor::new(handle, layout); + + assert_eq!(with_handle.handle, handle); + } + + #[test] + fn test_metadata_pack_unpack() { + let worker_address = WorkerAddress::new(100, "worker_100".to_string()); + let nixl_metadata = vec![1, 2, 3, 4, 5]; + let layouts = vec![LocalLayoutDescriptor::new( + LayoutHandle::new(100, 1), + make_test_serialized_layout(), + )]; + + let packed = + SerializedLayout::pack(worker_address.clone(), nixl_metadata.clone(), layouts).unwrap(); + + assert!(!packed.is_empty()); + assert!(!packed.is_empty()); + + let unpacked = packed.unpack().unwrap(); + + assert_eq!(unpacked.worker_address, worker_address); + assert_eq!(unpacked.nixl_metadata, nixl_metadata); + assert_eq!(unpacked.layouts.len(), 1); + assert_eq!(unpacked.layouts[0].handle.worker_id(), 100); + assert_eq!(unpacked.layouts[0].handle.layout_id(), 1); + } + + #[test] + fn test_metadata_multiple_layouts() { + let worker_address = WorkerAddress::new(200, "worker_200".to_string()); + let nixl_metadata = vec![10, 20, 30]; + let layouts = vec![ + LocalLayoutDescriptor::new(LayoutHandle::new(200, 1), make_test_serialized_layout()), + LocalLayoutDescriptor::new(LayoutHandle::new(200, 2), make_test_serialized_layout()), + LocalLayoutDescriptor::new(LayoutHandle::new(200, 3), make_test_serialized_layout()), + ]; + + let packed = + SerializedLayout::pack(worker_address, nixl_metadata, layouts.clone()).unwrap(); + let unpacked = packed.unpack().unwrap(); + + assert_eq!(unpacked.layouts.len(), 3); + for (i, layout) in unpacked.layouts.iter().enumerate() { + assert_eq!(layout.handle.worker_id(), 200); + assert_eq!(layout.handle.layout_id(), (i + 1) as u16); + } + } + + #[test] + fn test_metadata_from_bytes() { + let worker_address = WorkerAddress::new(42, "test".to_string()); + let nixl_metadata = vec![1, 2, 3]; + let layouts = vec![]; + + let packed = SerializedLayout::pack(worker_address, nixl_metadata, layouts).unwrap(); + let bytes = packed.as_bytes().clone(); + + let restored = 
SerializedLayout::from_bytes(bytes); + let unpacked = restored.unpack().unwrap(); + + assert_eq!(unpacked.worker_address.worker_id, 42); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/manager/mod.rs b/lib/llm/src/block_manager/v2/physical/manager/mod.rs new file mode 100644 index 0000000000..b5e8cdd6c5 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/manager/mod.rs @@ -0,0 +1,627 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Transport manager for local and remote physical layouts with transfer execution. + +mod handle; +mod local; +mod metadata; +mod remote; + +pub use handle::LayoutHandle; +pub use metadata::{SerializedLayout, WorkerAddress}; + +pub(crate) use local::LocalLayout; +pub(crate) use metadata::LocalLayoutDescriptor; +pub(crate) use remote::RemoteLayout; + +use crate::block_manager::v2::memory::StorageKind; +use crate::block_manager::v2::physical::layout::PhysicalLayout; +use crate::block_manager::v2::physical::transfer::TransferContext; +use crate::block_manager::v2::physical::transfer::context::TransferCompleteNotification; +use crate::block_manager::v2::physical::transfer::nixl_agent::NixlAgent; +use crate::block_manager::v2::physical::transfer::options::TransferOptions; +use anyhow::{Result, anyhow, bail}; +use std::collections::{HashMap, HashSet}; +use std::sync::atomic::{AtomicU16, Ordering}; +use std::sync::{Arc, RwLock}; + +/// Public entry point for layout and transfer management. +/// +/// TransportManager combines layout registration/metadata management with +/// transfer execution capabilities, providing a unified API for: +/// - Registering local layouts and obtaining handles +/// - Exporting/importing layout metadata for remote workers +/// - Executing transfers between layouts using handles +/// - Managing CUDA, NIXL, and other execution resources +#[derive(Clone)] +pub struct TransportManager { + registry: Arc>, + context: Arc, +} + +impl TransportManager { + /// Create a new TransportManager builder. + /// + /// The builder configures the worker ID, NIXL agent, CUDA device, + /// and other execution parameters before creating the manager. + /// + /// # Example + /// ```ignore + /// let manager = TransportManager::builder() + /// .worker_id(0) // NIXL agent name defaults to "worker-0" + /// .nixl_backend("ucx") // Optional: defaults to UCX from env + /// .cuda_device_id(0) + /// .build()?; + /// + /// // Or with custom agent name: + /// let manager = TransportManager::builder() + /// .worker_id(0) + /// .nixl_agent_name("custom-agent") + /// .build()?; + /// ``` + pub fn builder() -> crate::block_manager::v2::physical::transfer::context::TransferConfigBuilder + { + TransferContext::builder() + } + + /// Create a TransportManager from a built TransferContext. + /// + /// This is used internally by the builder to wrap the context + /// and create the associated registry. + pub(crate) fn from_context(context: TransferContext) -> Self { + let worker_id = context.worker_id(); + let nixl_agent = context.nixl_agent().clone(); + let registry = Arc::new(RwLock::new(LayoutRegistry::new(nixl_agent, worker_id))); + + Self { + registry, + context: Arc::new(context), + } + } + + // ===== Layout Registration and Metadata Management ===== + + /// Register a local physical layout and return a unique handle. + /// + /// This registers the layout with the embedded memory manager, assigning + /// it a unique handle that can be used for handle-based transfers. 
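The `TransportManager` described above is designed to be cheap to clone: the registry sits behind `Arc<RwLock<..>>` and the execution context behind a plain `Arc`, so clones share state. The sketch below reproduces just that shape with toy `Registry` and `Context` types; it illustrates the design choice, not the real API.

```rust
use std::collections::HashMap;
use std::sync::{Arc, RwLock};

// Toy stand-ins: shared mutable registry plus an immutable, shared execution context.
#[derive(Default)]
struct Registry {
    layouts: HashMap<u64, String>,
}

struct Context {
    worker_id: u64,
}

#[derive(Clone)]
struct Manager {
    registry: Arc<RwLock<Registry>>,
    context: Arc<Context>,
}

impl Manager {
    fn new(worker_id: u64) -> Self {
        Self {
            registry: Arc::new(RwLock::new(Registry::default())),
            context: Arc::new(Context { worker_id }),
        }
    }

    fn register(&self, id: u64, name: &str) {
        self.registry.write().unwrap().layouts.insert(id, name.to_string());
    }

    fn worker_id(&self) -> u64 {
        self.context.worker_id
    }
}

fn main() {
    let manager = Manager::new(0);
    let clone = manager.clone(); // clones share the same registry
    clone.register(1, "kv-cache");
    assert_eq!(manager.registry.read().unwrap().layouts.len(), 1);
    assert_eq!(manager.worker_id(), 0);
}
```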
+ /// + /// # Arguments + /// * `layout` - Physical layout to register + /// + /// # Returns + /// Unique handle for the registered layout + /// + /// # Errors + /// Returns an error if layout IDs are exhausted (u16::MAX reached) + pub fn register_layout(&self, layout: PhysicalLayout) -> Result { + self.registry.write().unwrap().register_local(layout) + } + + /// Export layout metadata for transmission to remote workers. + /// + /// This exports all registered local layouts along with NIXL metadata + /// needed for remote memory registration. + /// + /// # Returns + /// Packed metadata ready for transmission to remote workers + pub fn export_metadata(&self) -> Result { + self.registry.read().unwrap().export_metadata() + } + + /// Import remote layout metadata. + /// + /// This loads NIXL metadata and reconstructs physical layouts from a remote + /// worker's exported metadata. + /// + /// # Arguments + /// * `metadata` - Packed metadata from remote worker + /// + /// # Returns + /// Vector of handles for the imported remote layouts + /// + /// # Errors + /// Returns an error if the remote worker was already loaded or if metadata + /// loading/reconstruction fails + pub fn import_metadata(&self, metadata: SerializedLayout) -> Result> { + self.registry.write().unwrap().import_metadata(metadata) + } + + // ===== Handle-Based Transfer API ===== + + /// Transfer complete blocks between layouts using handles. + /// + /// This function copies entire blocks (all layers and outer dimensions) between + /// the source and destination layouts identified by their handles. The transfer + /// strategy (memcpy, CUDA, NIXL) is automatically selected based on storage locations. + /// + /// The lock on the registry is held only briefly during layout lookup, + /// then released before executing the actual transfer. + /// + /// # Arguments + /// * `src_handle` - Handle to source layout + /// * `src_blocks` - Source block IDs to transfer + /// * `dst_handle` - Handle to destination layout + /// * `dst_blocks` - Destination block IDs to transfer + /// + /// # Returns + /// A notification handle that can be awaited for transfer completion + /// + /// # Errors + /// Returns an error if: + /// - Either handle is invalid + /// - Block IDs are out of bounds + /// - Transfer execution fails + pub fn execute_transfer( + &self, + src_handle: LayoutHandle, + src_blocks: &[usize], + dst_handle: LayoutHandle, + dst_blocks: &[usize], + options: TransferOptions, + ) -> Result { + // Clone layouts inside the lock, then drop lock before transfer + let (src_layout, dst_layout) = { + let registry = self.registry.read().unwrap(); + let src = registry + .get_layout(src_handle) + .ok_or_else(|| anyhow!("invalid source handle: {}", src_handle))? + .clone(); // Cheap: just Arc refcount bump + let dst = registry + .get_layout(dst_handle) + .ok_or_else(|| anyhow!("invalid destination handle: {}", dst_handle))? + .clone(); + (src, dst) + }; // Lock released here + + // Execute transfer with no lock held + super::transfer::executor::execute_transfer( + &src_layout, + &dst_layout, + src_blocks, + dst_blocks, + options, + &self.context, + ) + } + + // ===== Query Methods ===== + + /// Get the worker ID for this manager. + pub fn worker_id(&self) -> u64 { + self.context.worker_id() + } + + /// Get handles for all locally registered layouts. + pub fn get_local_handles(&self) -> Vec { + self.registry.read().unwrap().local_handles() + } + + /// Get handles for all imported remote layouts. 
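The transfer path below deliberately scopes its registry lock: it clones the Arc-backed layouts inside a short read-lock block and releases the lock before any transfer work runs, so slow copies never block new registrations. A stand-alone sketch of that pattern, with a `Vec<u8>`-backed stand-in for the layout type:

```rust
use std::collections::HashMap;
use std::sync::{Arc, RwLock};

// Stand-in for the Arc-backed layout type; clones only bump a refcount.
type Layout = Arc<Vec<u8>>;

fn lookup_pair(
    registry: &RwLock<HashMap<u64, Layout>>,
    src: u64,
    dst: u64,
) -> Result<(Layout, Layout), String> {
    let guard = registry.read().unwrap();
    let src = guard
        .get(&src)
        .cloned()
        .ok_or_else(|| "invalid source handle".to_string())?;
    let dst = guard
        .get(&dst)
        .cloned()
        .ok_or_else(|| "invalid destination handle".to_string())?;
    Ok((src, dst))
} // read lock released here, before any transfer work starts

fn main() {
    let registry = RwLock::new(HashMap::from([
        (1u64, Arc::new(vec![0u8; 4])),
        (2u64, Arc::new(vec![0u8; 4])),
    ]));
    let (src, dst) = lookup_pair(&registry, 1, 2).unwrap();
    // ... a long-running copy between src and dst would happen here, lock-free ...
    assert_eq!(src.len(), dst.len());
}
```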
+    pub fn get_remote_handles(&self) -> Vec<LayoutHandle> {
+        self.registry.read().unwrap().remote_handles()
+    }
+
+    // ===== Internal Methods for Testing =====
+
+    /// Get the internal transfer context (for testing only).
+    pub fn context(&self) -> &Arc<TransferContext> {
+        &self.context
+    }
+
+    /// Get the H2D stream (for testing only).
+    #[cfg(all(test, feature = "testing-cuda"))]
+    pub(crate) fn h2d_stream(&self) -> &std::sync::Arc<cudarc::driver::CudaStream> {
+        self.context.h2d_stream()
+    }
+
+    /// Get the D2H stream (for testing only).
+    #[cfg(all(test, feature = "testing-cuda"))]
+    #[allow(dead_code)]
+    pub(crate) fn d2h_stream(&self) -> &std::sync::Arc<cudarc::driver::CudaStream> {
+        self.context.d2h_stream()
+    }
+
+    /// Get the CUDA context (for testing only).
+    #[cfg(all(test, feature = "testing-cuda"))]
+    pub(crate) fn cuda_context(&self) -> &std::sync::Arc<cudarc::driver::CudaContext> {
+        self.context.cuda_context()
+    }
+
+    /// Register a CUDA event for completion (for testing only).
+    #[cfg(all(test, feature = "testing-cuda"))]
+    pub(crate) fn register_cuda_event(
+        &self,
+        event: cudarc::driver::CudaEvent,
+    ) -> TransferCompleteNotification {
+        self.context.register_cuda_event(event)
+    }
+}
+
+/// Internal registry for local and remote physical layouts with NIXL integration.
+///
+/// The LayoutRegistry handles:
+/// - Registering local layouts with unique handles
+/// - Exporting local layout metadata for remote access
+/// - Importing remote layout metadata and reconstructing layouts
+/// - Managing NIXL metadata for RDMA operations
+#[derive(Debug)]
+pub(crate) struct LayoutRegistry {
+    /// NIXL agent for memory registration
+    nixl_agent: NixlAgent,
+    /// Worker ID for this manager
+    worker_id: u64,
+    /// Next layout ID to assign (monotonically increasing)
+    next_layout_id: AtomicU16,
+    /// Local layouts registered on this worker
+    local_layouts: HashMap<LayoutHandle, LocalLayout>,
+    /// Remote layouts imported from other workers
+    remote_layouts: HashMap<LayoutHandle, RemoteLayout>,
+    /// Set of loaded remote workers (agent_name, worker_id) to prevent duplicates
+    loaded_remotes: HashSet<(String, u64)>,
+}
+
+#[expect(dead_code)]
+impl LayoutRegistry {
+    /// Create a new layout registry.
+    ///
+    /// # Arguments
+    /// * `nixl_agent` - NIXL agent for memory registration
+    /// * `worker_id` - Unique identifier for this worker
+    pub(crate) fn new(nixl_agent: NixlAgent, worker_id: u64) -> Self {
+        Self {
+            nixl_agent,
+            worker_id,
+            next_layout_id: AtomicU16::new(0),
+            local_layouts: HashMap::new(),
+            remote_layouts: HashMap::new(),
+            loaded_remotes: HashSet::new(),
+        }
+    }
+
+    /// Register a local physical layout.
+    ///
+    /// # Arguments
+    /// * `layout` - Physical layout to register
+    ///
+    /// # Returns
+    /// Unique handle for the registered layout
+    ///
+    /// # Errors
+    /// Returns an error if layout IDs are exhausted (u16::MAX reached)
+    pub(crate) fn register_local(&mut self, layout: PhysicalLayout) -> Result<LayoutHandle> {
+        // Get next layout ID
+        let layout_id = self.next_layout_id.fetch_add(1, Ordering::SeqCst);
+        if layout_id == u16::MAX {
+            bail!("Layout ID overflow: maximum number of layouts (65535) reached");
+        }
+
+        // Create handle
+        let handle = LayoutHandle::new(self.worker_id, layout_id);
+
+        // Wrap in LocalLayout
+        let local_layout = LocalLayout::new(handle, layout);
+
+        // Store
+        self.local_layouts.insert(handle, local_layout);
+
+        Ok(handle)
+    }
+
+    /// Export local layout metadata for transmission to remote workers.
+ /// + /// This exports: + /// - NIXL agent metadata for remote memory registration + /// - All host and device layouts (disk layouts are excluded) + /// - Worker address information + /// + /// # Returns + /// Packed metadata ready for transmission + pub(crate) fn export_metadata(&self) -> Result { + // Get NIXL metadata from agent + let nixl_metadata = self + .nixl_agent + .get_local_md() + .map_err(|e| anyhow!("failed to get NIXL local metadata: {:?}", e))?; + + // Create worker address + let worker_address = WorkerAddress::new(self.worker_id, self.nixl_agent.name().to_string()); + + // Filter and serialize layouts (only host and device, skip disk) + let mut serialized_layouts = Vec::new(); + for (handle, local_layout) in &self.local_layouts { + let location = local_layout.layout().location(); + + // Only export host and device layouts + if matches!( + location, + StorageKind::System | StorageKind::Device(_) | StorageKind::Pinned + ) { + let serialized = local_layout + .layout() + .to_descriptor() + .map_err(|e| anyhow!("failed to serialize layout {}: {}", handle, e))?; + + serialized_layouts.push(LocalLayoutDescriptor::new(*handle, serialized)); + } + } + + // Pack into managed metadata + SerializedLayout::pack(worker_address, nixl_metadata, serialized_layouts) + } + + /// Import remote layout metadata. + /// + /// This: + /// - Validates the remote worker hasn't been loaded already + /// - Loads NIXL metadata into the agent + /// - Reconstructs physical layouts from serialized data + /// - Stores them as remote layouts + /// + /// # Arguments + /// * `metadata` - Packed metadata from remote worker + /// + /// # Returns + /// Vector of handles for the imported layouts + /// + /// # Errors + /// Returns an error if: + /// - The remote worker was already loaded + /// - NIXL metadata loading fails + /// - Agent name mismatch after loading + /// - Layout reconstruction fails + pub(crate) fn import_metadata( + &mut self, + metadata: SerializedLayout, + ) -> Result> { + // Unpack metadata + let inner = metadata.unpack()?; + + // Validate not already loaded + let remote_key = ( + inner.worker_address.nixl_agent_name.clone(), + inner.worker_address.worker_id, + ); + if self.loaded_remotes.contains(&remote_key) { + bail!( + "Remote worker already loaded: {} (worker_id={})", + remote_key.0, + remote_key.1 + ); + } + + // Load NIXL metadata + let returned_agent_name = self + .nixl_agent + .load_remote_md(&inner.nixl_metadata) + .map_err(|e| anyhow!("failed to load remote NIXL metadata: {:?}", e))?; + + // Verify agent name matches + if returned_agent_name != inner.worker_address.nixl_agent_name { + bail!( + "Agent name mismatch: expected '{}', got '{}'", + inner.worker_address.nixl_agent_name, + returned_agent_name + ); + } + + // Reconstruct layouts + let mut imported_handles = Vec::new(); + for serialized_with_handle in inner.layouts { + let handle = serialized_with_handle.handle; + let layout = PhysicalLayout::from_descriptor(serialized_with_handle.layout) + .map_err(|e| anyhow!("failed to reconstruct layout {}: {}", handle, e))?; + + let remote_layout = RemoteLayout::new(handle, layout); + self.remote_layouts.insert(handle, remote_layout); + imported_handles.push(handle); + } + + // Mark remote as loaded + self.loaded_remotes.insert(remote_key); + + Ok(imported_handles) + } + + /// Get a local layout by handle. + pub(crate) fn get_local(&self, handle: LayoutHandle) -> Option<&LocalLayout> { + self.local_layouts.get(&handle) + } + + /// Get a remote layout by handle. 
+ pub(crate) fn get_remote(&self, handle: LayoutHandle) -> Option<&RemoteLayout> { + self.remote_layouts.get(&handle) + } + + /// Get a layout by handle (either local or remote). + /// + /// # Returns + /// Returns a reference to the PhysicalLayout if found + pub(crate) fn get_layout(&self, handle: LayoutHandle) -> Option<&PhysicalLayout> { + self.local_layouts + .get(&handle) + .map(|l| l.layout()) + .or_else(|| self.remote_layouts.get(&handle).map(|r| r.layout())) + } + + /// Check if a handle refers to a local layout. + pub(crate) fn is_local(&self, handle: LayoutHandle) -> bool { + self.local_layouts.contains_key(&handle) + } + + /// Check if a handle refers to a remote layout. + pub(crate) fn is_remote(&self, handle: LayoutHandle) -> bool { + self.remote_layouts.contains_key(&handle) + } + + /// Get the number of local layouts. + pub(crate) fn local_count(&self) -> usize { + self.local_layouts.len() + } + + /// Get the number of remote layouts. + pub(crate) fn remote_count(&self) -> usize { + self.remote_layouts.len() + } + + /// Get the worker ID for this manager. + pub(crate) fn worker_id(&self) -> u64 { + self.worker_id + } + + /// Get all local layout handles. + pub(crate) fn local_handles(&self) -> Vec { + self.local_layouts.keys().copied().collect() + } + + /// Get all remote layout handles. + pub(crate) fn remote_handles(&self) -> Vec { + self.remote_layouts.keys().copied().collect() + } +} + +#[cfg(all(test, feature = "testing-nixl"))] +mod tests { + use super::*; + use crate::block_manager::v2::physical::layout::LayoutConfig; + use crate::block_manager::v2::physical::transfer::nixl_agent::NixlAgent; + + fn make_test_agent(name: &str) -> NixlAgent { + NixlAgent::require_backends(name, &[]).expect("failed to create wrapped agent") + } + + fn make_test_layout(agent: &NixlAgent) -> PhysicalLayout { + let config = LayoutConfig::builder() + .num_blocks(2) + .num_layers(2) + .outer_dim(2) + .page_size(4) + .inner_dim(8) + .dtype_width_bytes(2) + .build() + .unwrap(); + + PhysicalLayout::builder(agent.clone()) + .with_config(config) + .fully_contiguous() + .allocate_system() + .build() + .unwrap() + } + + #[test] + fn test_manager_creation() { + let agent = make_test_agent("test-manager"); + let manager = LayoutRegistry::new(agent, 42); + + assert_eq!(manager.worker_id(), 42); + assert_eq!(manager.local_count(), 0); + assert_eq!(manager.remote_count(), 0); + } + + #[test] + fn test_register_local() { + let agent = make_test_agent("test-register"); + let mut manager = LayoutRegistry::new(agent.clone(), 100); + + let layout = make_test_layout(&agent); + let handle = manager.register_local(layout).unwrap(); + + assert_eq!(handle.worker_id(), 100); + assert_eq!(handle.layout_id(), 0); + assert_eq!(manager.local_count(), 1); + assert!(manager.is_local(handle)); + assert!(!manager.is_remote(handle)); + } + + #[test] + fn test_register_multiple_locals() { + let agent = make_test_agent("test-multiple"); + let mut manager = LayoutRegistry::new(agent.clone(), 1); + + let handle1 = manager.register_local(make_test_layout(&agent)).unwrap(); + let handle2 = manager.register_local(make_test_layout(&agent)).unwrap(); + let handle3 = manager.register_local(make_test_layout(&agent)).unwrap(); + + assert_eq!(handle1.layout_id(), 0); + assert_eq!(handle2.layout_id(), 1); + assert_eq!(handle3.layout_id(), 2); + assert_eq!(manager.local_count(), 3); + } + + #[test] + #[ignore] // Requires actual NIXL memory registration + fn test_export_import_roundtrip() { + // Create source manager and register layouts 
+ let source_agent = make_test_agent("source"); + let mut source_manager = LayoutRegistry::new(source_agent.clone(), 1); + + let handle1 = source_manager + .register_local(make_test_layout(&source_agent)) + .unwrap(); + let handle2 = source_manager + .register_local(make_test_layout(&source_agent)) + .unwrap(); + + // Export metadata + let metadata = source_manager.export_metadata().unwrap(); + assert!(!metadata.is_empty()); + + // Create destination manager and import + let dest_agent = make_test_agent("dest"); + let mut dest_manager = LayoutRegistry::new(dest_agent, 2); + + let imported_handles = dest_manager.import_metadata(metadata).unwrap(); + + // Verify + assert_eq!(imported_handles.len(), 2); + assert_eq!(dest_manager.remote_count(), 2); + assert!(dest_manager.is_remote(handle1)); + assert!(dest_manager.is_remote(handle2)); + + // Can get layouts + assert!(dest_manager.get_remote(handle1).is_some()); + assert!(dest_manager.get_remote(handle2).is_some()); + assert!(dest_manager.get_layout(handle1).is_some()); + } + + #[test] + #[ignore] // Requires actual NIXL memory registration + fn test_import_duplicate_remote_fails() { + let source_agent = make_test_agent("source2"); + let mut source_manager = LayoutRegistry::new(source_agent.clone(), 10); + + source_manager + .register_local(make_test_layout(&source_agent)) + .unwrap(); + + let metadata = source_manager.export_metadata().unwrap(); + + let dest_agent = make_test_agent("dest2"); + let mut dest_manager = LayoutRegistry::new(dest_agent, 20); + + // First import succeeds + let metadata_clone = SerializedLayout::from_bytes(metadata.as_bytes().clone()); + dest_manager.import_metadata(metadata).unwrap(); + + // Second import should fail + let result = dest_manager.import_metadata(metadata_clone); + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("already loaded")); + } + + #[test] + fn test_get_layout_handles() { + let agent = make_test_agent("test-handles"); + let mut manager = LayoutRegistry::new(agent.clone(), 5); + + let h1 = manager.register_local(make_test_layout(&agent)).unwrap(); + let h2 = manager.register_local(make_test_layout(&agent)).unwrap(); + + let handles = manager.local_handles(); + assert_eq!(handles.len(), 2); + assert!(handles.contains(&h1)); + assert!(handles.contains(&h2)); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/manager/remote.rs b/lib/llm/src/block_manager/v2/physical/manager/remote.rs new file mode 100644 index 0000000000..f80defcbce --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/manager/remote.rs @@ -0,0 +1,127 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Remote layout wrapper reconstructed from imported metadata. + +use super::handle::LayoutHandle; +use crate::block_manager::v2::physical::layout::PhysicalLayout; + +/// A remote physical layout reconstructed from imported metadata. +/// +/// This wraps a `PhysicalLayout` that was deserialized from another worker's +/// exported metadata. The layout's memory regions point to addresses on the +/// remote worker and are used for building NIXL RDMA transfer descriptors. +/// +/// This type is cheap to clone as `PhysicalLayout` contains `Arc` internally. +#[derive(Debug, Clone)] +pub struct RemoteLayout { + handle: LayoutHandle, + layout: PhysicalLayout, +} + +#[allow(dead_code)] +impl RemoteLayout { + /// Create a new remote layout. 
+ /// + /// # Arguments + /// * `handle` - Unique handle for this layout (from remote worker) + /// * `layout` - The reconstructed physical layout + pub fn new(handle: LayoutHandle, layout: PhysicalLayout) -> Self { + Self { handle, layout } + } + + /// Get the handle for this layout. + pub fn handle(&self) -> LayoutHandle { + self.handle + } + + /// Get a reference to the physical layout. + pub fn layout(&self) -> &PhysicalLayout { + &self.layout + } + + /// Get the worker_id from the handle (identifies the remote worker). + pub fn worker_id(&self) -> u64 { + self.handle.worker_id() + } + + /// Get the layout_id from the handle. + pub fn layout_id(&self) -> u16 { + self.handle.layout_id() + } + + /// Consume this remote layout and return the physical layout. + pub fn into_layout(self) -> PhysicalLayout { + self.layout + } +} + +#[cfg(all(test, feature = "testing-nixl"))] +mod tests { + use super::*; + use crate::block_manager::v2::physical::layout::{ + LayoutConfig, LayoutDescriptor, PhysicalLayout, + }; + + fn make_serialized_layout() -> LayoutDescriptor { + use crate::block_manager::v2::memory::{MemoryDescriptor, StorageKind}; + use crate::block_manager::v2::physical::layout::{ + BlockFormat, FullyContiguousDetails, LayoutTypeDetails, NixlMetadata, + }; + + let config = LayoutConfig::builder() + .num_blocks(2) + .num_layers(2) + .outer_dim(2) + .page_size(4) + .inner_dim(8) + .dtype_width_bytes(2) + .build() + .unwrap(); + + let required_size = config.num_blocks + * config.num_layers + * config.outer_dim + * config.page_size + * config.inner_dim + * config.dtype_width_bytes; + + LayoutDescriptor { + version: 1, + layout_config: config, + location: StorageKind::System, + nixl_metadata: NixlMetadata::new( + "remote_agent".to_string(), + nixl_sys::MemType::Dram, + 0, + ), + memory_descriptors: vec![MemoryDescriptor::new(0x1000, required_size)], + layout_type_details: LayoutTypeDetails::FullyContiguous(FullyContiguousDetails { + block_format: BlockFormat::Operational, + }), + } + } + + #[test] + fn test_remote_layout_creation() { + let handle = LayoutHandle::new(999, 42); + let serialized = make_serialized_layout(); + let layout = PhysicalLayout::from_descriptor(serialized).unwrap(); + let remote = RemoteLayout::new(handle, layout); + + assert_eq!(remote.handle(), handle); + assert_eq!(remote.worker_id(), 999); + assert_eq!(remote.layout_id(), 42); + } + + #[test] + fn test_remote_layout_into_layout() { + let handle = LayoutHandle::new(100, 200); + let serialized = make_serialized_layout(); + let layout = PhysicalLayout::from_descriptor(serialized).unwrap(); + let remote = RemoteLayout::new(handle, layout); + + let _recovered = remote.into_layout(); + // Successfully consumed and returned the layout + } +} diff --git a/lib/llm/src/block_manager/v2/physical/mod.rs b/lib/llm/src/block_manager/v2/physical/mod.rs new file mode 100644 index 0000000000..38be109612 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/mod.rs @@ -0,0 +1,6 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +pub mod layout; +pub mod manager; +pub mod transfer; diff --git a/lib/llm/src/block_manager/v2/physical/transfer/capabilities.rs b/lib/llm/src/block_manager/v2/physical/transfer/capabilities.rs new file mode 100644 index 0000000000..065b38e092 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/capabilities.rs @@ -0,0 +1,209 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Transfer capability flags for controlling direct path enablement. +//! +//! By default, the transfer system uses a conservative staging policy where: +//! - Device can only transfer to/from Host +//! - Disk can only transfer to/from Host +//! - Host can transfer to Device, Disk, or Remote +//! - Device ↔ Device is allowed (native CUDA) +//! +//! These capability flags enable optional direct paths that bypass host staging. + +use serde::{Deserialize, Serialize}; +use std::sync::OnceLock; + +use crate::block_manager::v2::physical::{ + layout::LayoutConfig, + transfer::{ + PhysicalLayout, TransferOptions, TransportManager, executor::execute_transfer, + nixl_agent::NixlAgent, + }, +}; + +/// Transfer capability flags controlling which direct paths are enabled. +/// +/// # Default Policy (Conservative) +/// +/// With all flags disabled (default), the system uses host staging: +/// - **Device → Remote**: Device → Host → Remote (2 hops) +/// - **Disk → Remote**: Disk → Host → Remote (2 hops) +/// - **Device ↔ Disk**: Device → Host → Disk (2 hops) +/// +/// # Optional Direct Paths +/// +/// - `allow_gds`: Enables GPU Direct Storage (Disk ↔ Device without host) +/// - `allow_gpu_rdma`: Enables GPU RDMA (Device → Remote without host) +/// +/// # Example +/// +/// ``` +/// # use dynamo_kvbm::v2::physical::transfer::TransferCapabilities; +/// // Default conservative policy +/// let caps = TransferCapabilities::default(); +/// assert!(!caps.allow_gds); +/// assert!(!caps.allow_gpu_rdma); +/// +/// // Enable GDS for high-performance disk I/O +/// let caps = TransferCapabilities::default().with_gds(true); +/// ``` +static GDS_SUPPORTED: OnceLock = OnceLock::new(); + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] +pub struct TransferCapabilities { + /// Enable GPU Direct Storage (Disk ↔ Device without host staging). + /// + /// When enabled: + /// - Disk → Device: Direct transfer (requires GDS support) + /// - Device → Disk: Direct transfer (requires GDS support) + /// + /// When disabled (default): + /// - Disk → Device: Disk → Host → Device (2 hops) + /// - Device → Disk: Device → Host → Disk (2 hops) + pub allow_gds: bool, + + /// Enable GPU RDMA (Device → Remote without host staging). + /// + /// When enabled: + /// - Device → Remote: Direct NIXL transfer + /// + /// When disabled (default): + /// - Device → Remote: Device → Host → Remote (2 hops) + /// + /// Note: This only affects Device → Remote. Host → Remote is always direct. + pub allow_gpu_rdma: bool, +} + +impl TransferCapabilities { + /// Create capabilities with default conservative policy (all direct paths disabled). + pub fn new() -> Self { + Self::default() + } + + /// Create capabilities with all direct paths enabled (high performance mode). + pub fn all_enabled() -> Self { + Self { + allow_gds: true, + allow_gpu_rdma: true, + } + } + + /// Set the GDS (GPU Direct Storage) capability. 
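+    ///
+    /// For example, enabling only GDS (this mirrors `test_selective_enablement`
+    /// at the bottom of this file):
+    /// ```ignore
+    /// let caps = TransferCapabilities::new().with_gds(true);
+    /// assert!(caps.allows_device_disk_direct());
+    /// assert!(!caps.allows_device_remote_direct());
+    /// ```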
+ pub fn with_gds(mut self, enabled: bool) -> Self { + self.allow_gds = enabled; + self + } + + fn test_gds_transfer(&self) -> anyhow::Result<()> { + let agent = NixlAgent::require_backends("agent", &["GDS_MT"])?; + + // Try a little test transfer and see if it works. + let config = LayoutConfig::builder() + .num_blocks(1) + .num_layers(1) + .outer_dim(1) + .page_size(1) + .inner_dim(4096) + .build()?; + + let src = PhysicalLayout::builder(agent.clone()) + .with_config(config.clone()) + .fully_contiguous() + .allocate_device(0) + .build()?; + let dst = PhysicalLayout::builder(agent.clone()) + .with_config(config) + .fully_contiguous() + .allocate_disk(None) + .build()?; + + let src_blocks = vec![0]; + let dst_blocks = vec![0]; + + let ctx = TransportManager::builder() + .worker_id(0) + .nixl_agent(agent) + .cuda_device_id(0) + .build()?; + + execute_transfer( + &src, + &dst, + &src_blocks, + &dst_blocks, + TransferOptions::default(), + ctx.context(), + )?; + + Ok(()) + } + + pub fn with_gds_if_supported(mut self) -> Self { + self.allow_gds = *GDS_SUPPORTED.get_or_init(|| self.test_gds_transfer().is_ok()); + + self + } + + /// Set the GPU RDMA capability. + pub fn with_gpu_rdma(mut self, enabled: bool) -> Self { + self.allow_gpu_rdma = enabled; + self + } + + /// Check if a direct path from Device to Disk is allowed. + pub fn allows_device_disk_direct(&self) -> bool { + self.allow_gds + } + + /// Check if a direct path from Device to Remote is allowed. + pub fn allows_device_remote_direct(&self) -> bool { + self.allow_gpu_rdma + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_default_capabilities() { + let caps = TransferCapabilities::default(); + assert!(!caps.allow_gds); + assert!(!caps.allow_gpu_rdma); + assert!(!caps.allows_device_disk_direct()); + assert!(!caps.allows_device_remote_direct()); + } + + #[test] + fn test_all_enabled() { + let caps = TransferCapabilities::all_enabled(); + assert!(caps.allow_gds); + assert!(caps.allow_gpu_rdma); + assert!(caps.allows_device_disk_direct()); + assert!(caps.allows_device_remote_direct()); + } + + #[test] + fn test_builder_pattern() { + let caps = TransferCapabilities::new() + .with_gds(true) + .with_gpu_rdma(false); + + assert!(caps.allow_gds); + assert!(!caps.allow_gpu_rdma); + } + + #[test] + fn test_selective_enablement() { + // Enable only GDS + let caps = TransferCapabilities::new().with_gds(true); + assert!(caps.allows_device_disk_direct()); + assert!(!caps.allows_device_remote_direct()); + + // Enable only GPU RDMA + let caps = TransferCapabilities::new().with_gpu_rdma(true); + assert!(!caps.allows_device_disk_direct()); + assert!(caps.allows_device_remote_direct()); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/checksum.rs b/lib/llm/src/block_manager/v2/physical/transfer/checksum.rs new file mode 100644 index 0000000000..85afadd3f5 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/checksum.rs @@ -0,0 +1,264 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Block checksum computation for verification. +//! +//! This module provides utilities to compute checksums of blocks for +//! round-trip test verification. 
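+//!
+//! A rough usage sketch (illustrative; assumes `layout` is a host-resident
+//! `PhysicalLayout` built elsewhere and already filled with data):
+//!
+//! ```ignore
+//! let checksums = compute_block_checksums(&layout, &[0, 1])?;
+//! assert_eq!(checksums.len(), 2);
+//! ```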
+
+use crate::block_manager::v2::memory::StorageKind;
+
+use super::PhysicalLayout;
+
+use aligned_vec::{AVec, avec};
+use anyhow::{Result, anyhow};
+use blake3::Hasher;
+
+use std::{
+    collections::HashMap,
+    fs::File,
+    io::{Read, Seek},
+    mem::ManuallyDrop,
+    ops::Range,
+    os::fd::FromRawFd,
+};
+
+use cudarc::runtime::sys::{cudaMemcpy, cudaMemcpyKind};
+
+pub type BlockChecksum = String;
+
+/// Compute checksums for a list of blocks.
+///
+/// # Arguments
+/// * `layout` - The physical layout containing the blocks
+/// * `block_ids` - List of block IDs to checksum
+///
+/// # Returns
+/// A map from block ID to its checksum
+///
+/// # Errors
+/// Returns an error if:
+/// - Layout is remote (cannot checksum remote memory directly)
+/// - Block IDs are out of range
+pub fn compute_block_checksums(
+    layout: &PhysicalLayout,
+    block_ids: &[usize],
+) -> Result<HashMap<usize, BlockChecksum>> {
+    let mut checksums = HashMap::new();
+
+    for &block_id in block_ids {
+        let checksum = compute_single_block_checksum(layout, block_id, None)?;
+        checksums.insert(block_id, checksum);
+    }
+
+    Ok(checksums)
+}
+
+/// Compute checksums for specific layers in blocks.
+///
+/// # Arguments
+/// * `layout` - The physical layout containing the blocks
+/// * `block_ids` - List of block IDs to checksum
+/// * `layer_range` - Range of layers to include in checksum
+///
+/// # Returns
+/// A map from block ID to its checksum (for the specified layers only)
+pub fn compute_layer_checksums(
+    layout: &PhysicalLayout,
+    block_ids: &[usize],
+    layer_range: Range<usize>,
+) -> Result<HashMap<usize, BlockChecksum>> {
+    let config = layout.layout().config();
+    if layer_range.end > config.num_layers {
+        return Err(anyhow!(
+            "Layer range {:?} exceeds num_layers {}",
+            layer_range,
+            config.num_layers
+        ));
+    }
+
+    let mut checksums = HashMap::new();
+
+    for &block_id in block_ids {
+        let checksum = compute_single_block_checksum(layout, block_id, Some(layer_range.clone()))?;
+        checksums.insert(block_id, checksum);
+    }
+
+    Ok(checksums)
+}
+
+/// Compute checksum for a single block.
+fn compute_single_block_checksum( + layout: &PhysicalLayout, + block_id: usize, + layer_range: Option>, +) -> Result { + let config = layout.layout().config(); + + if block_id >= config.num_blocks { + return Err(anyhow!("Block ID {} out of range", block_id)); + } + + let num_layers = config.num_layers; + let outer_dim = config.outer_dim; + + let layers = layer_range.unwrap_or(0..num_layers); + + // validate layer range + if layers.end > config.num_layers { + return Err(anyhow!( + "Layer range {:?} exceeds num_layers {}", + layers, + config.num_layers + )); + } + + let mut hasher = Hasher::new(); + + // Iterate over all layers and outer dimensions + for layer_id in layers { + for outer_id in 0..outer_dim { + let region = layout.memory_region(block_id, layer_id, outer_id)?; + + match layout.location() { + StorageKind::System | StorageKind::Pinned => { + let slice = unsafe { + std::slice::from_raw_parts(region.addr() as *const u8, region.size()) + }; + hasher.update(slice); + } + StorageKind::Device(_) => { + let system_region: Vec = vec![0; region.size()]; + unsafe { + cudaMemcpy( + system_region.as_ptr() as *mut std::ffi::c_void, + region.addr() as *const std::ffi::c_void, + region.size(), + cudaMemcpyKind::cudaMemcpyDeviceToHost, + ); + } + hasher.update(system_region.as_slice()); + } + StorageKind::Disk(fd) => { + let mut system_region: AVec = avec![[4096]| 0; region.size()]; + + let mut file = ManuallyDrop::new(unsafe { File::from_raw_fd(fd as i32) }); + file.seek(std::io::SeekFrom::Start(region.addr() as u64))?; + file.read_exact(&mut system_region)?; + hasher.update(system_region.as_slice()); + } + } + } + } + + Ok(hasher.finalize().to_string()) +} + +#[cfg(test)] +mod tests { + use super::super::tests::*; + use super::*; + use crate::block_manager::v2::physical::transfer::{FillPattern, fill_blocks}; + + #[test] + fn test_checksum_constant_pattern() { + let physical = builder(2) + .fully_contiguous() + .allocate_system() + .build() + .unwrap(); + + fill_blocks(&physical, &[0, 1], FillPattern::Constant(42)).unwrap(); + + let checksums = compute_block_checksums(&physical, &[0, 1]).unwrap(); + + // Both blocks should have the same checksum values (same pattern) + assert_eq!(checksums[&0], checksums[&1]); + + let memory_region = physical.memory_region(0, 0, 0).unwrap(); + let slice = unsafe { + std::slice::from_raw_parts(memory_region.addr() as *const u8, memory_region.size()) + }; + assert!(slice.iter().all(|&b| b == 42)); + + let mut hasher = Hasher::new(); + hasher.update(slice); + let checksum_mr_slice = hasher.finalize().to_string(); + + let vec = vec![42; memory_region.size()]; + let mut hasher = Hasher::new(); + hasher.update(&vec); + let checksum_vec = hasher.finalize().to_string(); + + assert_eq!(checksum_mr_slice, checksum_vec); + } + + // #[test] + // fn test_checksum_different_patterns() { + // let (layout, _memory) = create_test_layout(2); + // let physical = PhysicalLayout::new_local(layout, StorageLocation::System); + + // // Fill blocks with different patterns + // fill_blocks(&physical, &[0], FillPattern::Constant(42)).unwrap(); + // fill_blocks(&physical, &[1], FillPattern::Constant(100)).unwrap(); + + // let checksums = compute_block_checksums(&physical, &[0, 1]).unwrap(); + + // // Blocks should have different checksums + // assert_ne!(checksums[&0], checksums[&1]); + // } + + // #[test] + // fn test_checksum_matches() { + // let (layout1, _memory1) = create_test_layout(1); + // let (layout2, _memory2) = create_test_layout(1); + + // let physical1 = 
PhysicalLayout::new_local(layout1, StorageLocation::System); + // let physical2 = PhysicalLayout::new_local(layout2, StorageLocation::System); + + // // Fill both with same pattern + // fill_blocks(&physical1, &[0], FillPattern::Sequential).unwrap(); + // fill_blocks(&physical2, &[0], FillPattern::Sequential).unwrap(); + + // let checksum1 = compute_block_checksums(&physical1, &[0]).unwrap(); + // let checksum2 = compute_block_checksums(&physical2, &[0]).unwrap(); + + // // Checksums should match (ignoring block_id) + // assert!(checksum1[&0].matches(&checksum2[&0])); + // } + + // #[test] + // fn test_layer_checksums() { + // let (layout, _memory) = create_test_layout(1); + // let physical = PhysicalLayout::new_local(layout, StorageLocation::System); + + // // Fill entire block + // fill_blocks(&physical, &[0], FillPattern::Sequential).unwrap(); + + // // Compute checksums for different layer ranges + // let full_checksum = compute_block_checksums(&physical, &[0]).unwrap(); + // let layer0_checksum = compute_layer_checksums(&physical, &[0], 0..1).unwrap(); + // let layer1_checksum = compute_layer_checksums(&physical, &[0], 1..2).unwrap(); + + // // Layer checksums should be different from full checksum + // assert_ne!(full_checksum[&0].byte_count, layer0_checksum[&0].byte_count); + // assert_ne!(full_checksum[&0].byte_count, layer1_checksum[&0].byte_count); + + // // Layer 0 and Layer 1 should have same byte count (same size) + // assert_eq!( + // layer0_checksum[&0].byte_count, + // layer1_checksum[&0].byte_count + // ); + // } + + // #[test] + // fn test_checksum_remote_layout_fails() { + // let (layout, _memory) = create_test_layout(1); + // let physical = + // PhysicalLayout::new_remote(layout, StorageLocation::System, "remote".to_string()); + + // let result = compute_block_checksums(&physical, &[0]); + // assert!(result.is_err()); + // assert!(result.unwrap_err().to_string().contains("remote")); + // } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/context.rs b/lib/llm/src/block_manager/v2/physical/transfer/context.rs new file mode 100644 index 0000000000..9da1963af2 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/context.rs @@ -0,0 +1,372 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Transfer context. + +use std::sync::Arc; + +use crate::block_manager::v2::kernels::OperationalCopyBackend; +use anyhow::Result; +use cudarc::driver::{CudaContext, CudaEvent, CudaStream}; +use derive_builder::Builder; +use nixl_sys::XferRequest; +use tokio::sync::{mpsc, oneshot}; +use uuid::Uuid; + +use super::nixl_agent::{NixlAgent, NixlBackendConfig}; + +use crate::block_manager::v2::physical::manager::TransportManager; + +// Notifications module is declared in ../mod.rs +// Re-export for convenience +use super::TransferCapabilities; +pub use super::notifications; +pub use super::notifications::TransferCompleteNotification; + +#[derive(Debug, Clone, Builder)] +#[builder(pattern = "owned", build_fn(private, name = "build_internal"), public)] +#[allow(dead_code)] // Fields are used in build() but derive macros confuse dead code analysis +pub(crate) struct TransferConfig { + worker_id: u64, + + /// Optional custom name for the NIXL agent. 
If not provided, defaults to "worker-{worker_id}" + #[builder(default = "None", setter(strip_option))] + nixl_agent_name: Option, + + /// Backend configuration for NIXL backends to enable + #[builder(default = "NixlBackendConfig::new()")] + nixl_backend_config: NixlBackendConfig, + + #[builder(default = "0")] + cuda_device_id: usize, + + #[builder(default = "get_tokio_runtime()")] + tokio_runtime: TokioRuntime, + + #[builder(default = "TransferCapabilities::default()")] + capabilities: TransferCapabilities, + + #[builder(default = "OperationalCopyBackend::Auto")] + operational_backend: OperationalCopyBackend, +} + +impl TransferConfigBuilder { + /// Directly provide a pre-configured wrapped NIXL agent (mainly for testing). + /// + /// This bypasses the agent creation and backend initialization logic, + /// using the provided agent directly. Useful for tests that need full + /// control over agent configuration. + pub fn nixl_agent(self, agent: NixlAgent) -> TransferConfigBuilderWithAgent { + TransferConfigBuilderWithAgent { + builder: self, + agent, + } + } + + /// Add a NIXL backend to enable (uses default plugin parameters). + pub fn nixl_backend(mut self, backend: impl Into) -> Self { + let config = self + .nixl_backend_config + .get_or_insert_with(NixlBackendConfig::new); + *config = config.clone().with_backend(backend); + self + } + + /// Load NIXL backend configuration from environment variables. + /// + /// This merges environment-based configuration with any backends already + /// configured via the builder. + pub fn with_env_backends(mut self) -> Result { + let env_config = NixlBackendConfig::from_env()?; + let config = self + .nixl_backend_config + .get_or_insert_with(NixlBackendConfig::new); + *config = config.clone().merge(env_config); + Ok(self) + } + + pub fn build(self) -> Result { + let mut config = self.build_internal()?; + + // Merge environment backends if not explicitly configured + if config.nixl_backend_config.backends().is_empty() { + config.nixl_backend_config = NixlBackendConfig::from_env()?; + } + + // Derive agent name from worker_id if not provided + let agent_name = config + .nixl_agent_name + .unwrap_or_else(|| format!("worker-{}", config.worker_id)); + + // Create wrapped NIXL agent with configured backends + let backend_names: Vec<&str> = config + .nixl_backend_config + .backends() + .iter() + .map(|s| s.as_str()) + .collect(); + + let nixl_agent = if backend_names.is_empty() { + // No backends configured - create agent without backends + NixlAgent::new_with_backends(&agent_name, &[])? + } else { + // Create agent with requested backends + NixlAgent::new_with_backends(&agent_name, &backend_names)? + }; + + let cuda_context = CudaContext::new(config.cuda_device_id)?; + let context = TransferContext::new( + config.worker_id, + nixl_agent, + cuda_context, + config.tokio_runtime, + config.capabilities, + config.operational_backend, + )?; + Ok(TransportManager::from_context(context)) + } +} + +/// Builder that already has a pre-configured NIXL agent. +/// +/// This is generally used for testing when you want to pass in an agent directly +/// rather than having it created by the builder. +pub struct TransferConfigBuilderWithAgent { + builder: TransferConfigBuilder, + agent: NixlAgent, +} + +impl TransferConfigBuilderWithAgent { + /// Build the TransportManager using the pre-configured agent. 
+ pub fn build(self) -> Result { + let config = self.builder.build_internal()?; + let cuda_context = CudaContext::new(config.cuda_device_id)?; + let context = TransferContext::new( + config.worker_id, + self.agent, + cuda_context, + config.tokio_runtime, + config.capabilities, + config.operational_backend, + )?; + Ok(TransportManager::from_context(context)) + } + + // Proxy methods to allow configuring other builder fields + pub fn worker_id(mut self, worker_id: u64) -> Self { + self.builder = self.builder.worker_id(worker_id); + self + } + + pub fn cuda_device_id(mut self, cuda_device_id: usize) -> Self { + self.builder = self.builder.cuda_device_id(cuda_device_id); + self + } +} + +fn get_tokio_runtime() -> TokioRuntime { + match tokio::runtime::Handle::try_current() { + Ok(handle) => TokioRuntime::Handle(handle), + Err(_) => { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .max_blocking_threads(4) + .worker_threads(2) + .build() + .expect("failed to build tokio runtime"); + + TokioRuntime::Shared(Arc::new(rt)) + } + } +} + +#[derive(Debug, Clone)] +pub(crate) enum TokioRuntime { + Handle(tokio::runtime::Handle), + Shared(Arc), +} + +impl TokioRuntime { + pub fn handle(&self) -> &tokio::runtime::Handle { + match self { + TokioRuntime::Handle(handle) => handle, + TokioRuntime::Shared(runtime) => runtime.handle(), + } + } +} + +#[derive(Debug, Clone)] +pub struct TransferContext { + worker_id: u64, + nixl_agent: NixlAgent, + #[allow(dead_code)] + cuda_context: Arc, + d2h_stream: Arc, + h2d_stream: Arc, + #[allow(dead_code)] + tokio_runtime: TokioRuntime, + capabilities: TransferCapabilities, + operational_backend: OperationalCopyBackend, + // Channels for background notification handlers + tx_nixl_status: + mpsc::Sender>, + tx_cuda_event: + mpsc::Sender>, + #[allow(dead_code)] + tx_nixl_events: mpsc::Sender, +} + +impl TransferContext { + pub fn builder() -> TransferConfigBuilder { + TransferConfigBuilder::default() + } + + pub(crate) fn new( + worker_id: u64, + nixl_agent: NixlAgent, + cuda_context: Arc, + tokio_runtime: TokioRuntime, + capabilities: TransferCapabilities, + operational_backend: OperationalCopyBackend, + ) -> Result { + unsafe { cuda_context.disable_event_tracking() }; + + // Create channels for background notification handlers + let (tx_nixl_status, rx_nixl_status) = mpsc::channel(64); + let (tx_cuda_event, rx_cuda_event) = mpsc::channel(64); + let (tx_nixl_events, rx_nixl_events) = mpsc::channel(64); + + // Spawn background handlers + let handle = tokio_runtime.handle(); + + // Spawn NIXL status polling handler + handle.spawn(notifications::process_polling_notifications(rx_nixl_status)); + + // Spawn CUDA event polling handler + handle.spawn(notifications::process_polling_notifications(rx_cuda_event)); + + // Spawn NIXL notification events handler + handle.spawn(notifications::process_nixl_notification_events( + nixl_agent.raw_agent().clone(), + rx_nixl_events, + )); + + Ok(Self { + worker_id, + nixl_agent, + cuda_context: cuda_context.clone(), + d2h_stream: cuda_context.new_stream()?, + h2d_stream: cuda_context.new_stream()?, + tokio_runtime, + capabilities, + operational_backend, + tx_nixl_status, + tx_cuda_event, + tx_nixl_events, + }) + } + + pub(crate) fn nixl_agent(&self) -> &NixlAgent { + &self.nixl_agent + } + + #[allow(dead_code)] + pub(crate) fn cuda_context(&self) -> &Arc { + &self.cuda_context + } + + pub(crate) fn d2h_stream(&self) -> &Arc { + &self.d2h_stream + } + + pub(crate) fn h2d_stream(&self) -> &Arc { + &self.h2d_stream + } 
+ + #[allow(dead_code)] + pub(crate) fn tokio(&self) -> &tokio::runtime::Handle { + self.tokio_runtime.handle() + } + + pub(crate) fn capabilities(&self) -> &TransferCapabilities { + &self.capabilities + } + + pub(crate) fn operational_backend(&self) -> OperationalCopyBackend { + self.operational_backend + } + + /// Register a NIXL transfer request for status polling completion. + /// + /// This method enqueues the transfer request to be polled for completion + /// using `agent.get_xfer_status()`. Returns a notification object that + /// can be awaited for completion. + pub(crate) fn register_nixl_status( + &self, + xfer_req: XferRequest, + ) -> TransferCompleteNotification { + let (done_tx, done_rx) = oneshot::channel(); + + let notification = notifications::RegisterPollingNotification { + uuid: Uuid::new_v4(), + checker: notifications::NixlStatusChecker::new( + self.nixl_agent.raw_agent().clone(), + xfer_req, + ), + done: done_tx, + }; + + // Send to background handler (ignore error if receiver dropped) + let _ = self.tx_nixl_status.try_send(notification); + + TransferCompleteNotification { status: done_rx } + } + + /// Register a CUDA event for polling completion. + /// + /// This method enqueues the CUDA event to be polled for completion. + /// Returns a notification object that can be awaited for completion. + pub(crate) fn register_cuda_event(&self, event: CudaEvent) -> TransferCompleteNotification { + let (done_tx, done_rx) = oneshot::channel(); + + let notification = notifications::RegisterPollingNotification { + uuid: Uuid::new_v4(), + checker: notifications::CudaEventChecker::new(event), + done: done_tx, + }; + + // Send to background handler (ignore error if receiver dropped) + let _ = self.tx_cuda_event.try_send(notification); + + TransferCompleteNotification { status: done_rx } + } + + /// Register a NIXL transfer request for notification-based completion. + /// + /// This method enqueues the transfer request to be completed via NIXL + /// notification events. Returns a notification object that can be awaited + /// for completion. + #[allow(dead_code)] + pub(crate) fn register_nixl_event( + &self, + xfer_req: XferRequest, + ) -> TransferCompleteNotification { + let (done_tx, done_rx) = oneshot::channel(); + + let notification = notifications::RegisterNixlNotification { + uuid: Uuid::new_v4(), + xfer_req, + done: done_tx, + }; + + // Send to background handler (ignore error if receiver dropped) + let _ = self.tx_nixl_events.try_send(notification); + + TransferCompleteNotification { status: done_rx } + } + + /// Get the worker ID for this context. + pub(crate) fn worker_id(&self) -> u64 { + self.worker_id + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/executor/cuda.rs b/lib/llm/src/block_manager/v2/physical/transfer/executor/cuda.rs new file mode 100644 index 0000000000..5f6ef5764d --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/executor/cuda.rs @@ -0,0 +1,318 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! CUDA executor for GPU memory transfers. 
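+//!
+//! This module is normally reached through `executor::execute_transfer`, which picks a
+//! CUDA strategy from the source/destination storage kinds. A simplified sketch of that
+//! dispatch (illustrative; condensed from `executor/mod.rs` in this change):
+//!
+//! ```ignore
+//! match strategy {
+//!     TransferStrategy::CudaAsyncH2D
+//!     | TransferStrategy::CudaAsyncD2H
+//!     | TransferStrategy::CudaAsyncD2D
+//!     | TransferStrategy::CudaBlockingH2D
+//!     | TransferStrategy::CudaBlockingD2H => {
+//!         cuda::execute_cuda_transfer(src, dst, src_ids, dst_ids, layer_range, strategy, ctx)?
+//!     }
+//!     _ => { /* memcpy or NIXL paths */ }
+//! }
+//! ```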
+ +use super::TransferContext; +use super::{PhysicalLayout, TransferStrategy}; +use crate::block_manager::v2::kernels::OperationalCopyBackend; +use crate::block_manager::v2::physical::transfer::context::TransferCompleteNotification; +use anyhow::{Result, anyhow}; +use cudarc::driver::result as cuda_result; +use std::ops::Range; + +// #[cfg(test)] +// mod cuda_kernel_tests; + +/// Execute a CUDA transfer between host and device memory. +/// +/// This executor handles transfers involving GPU memory using CUDA APIs. +/// Supports async and blocking transfers depending on the strategy. +/// +/// # Arguments +/// * `src` - Source physical layout +/// * `dst` - Destination physical layout +/// * `src_block_ids` - Source block IDs to transfer +/// * `dst_block_ids` - Destination block IDs to transfer +/// * `layer_range` - Optional range of layers to transfer (None = all layers) +/// * `strategy` - CUDA transfer strategy (H2D, D2H, D2D, async or blocking) +/// * `ctx` - Transfer context with CUDA stream +pub fn execute_cuda_transfer( + src: &PhysicalLayout, + dst: &PhysicalLayout, + src_block_ids: &[usize], + dst_block_ids: &[usize], + layer_range: Option>, + strategy: TransferStrategy, + ctx: &TransferContext, +) -> Result { + // Validate layouts + let src_layout = src.layout(); + let dst_layout = dst.layout(); + + if src_layout.num_layers() != dst_layout.num_layers() { + return Err(anyhow!( + "Layouts have incompatible layer counts: src={}, dst={}", + src_layout.num_layers(), + dst_layout.num_layers() + )); + } + + if src_layout.outer_dim() != dst_layout.outer_dim() { + return Err(anyhow!( + "Layouts have incompatible outer dimensions: src={}, dst={}", + src_layout.outer_dim(), + dst_layout.outer_dim() + )); + } + + // Determine layer range + let layers = layer_range.unwrap_or(0..src_layout.num_layers()); + + // Get appropriate CUDA stream based on transfer direction + let stream = match strategy { + TransferStrategy::CudaAsyncD2H | TransferStrategy::CudaBlockingD2H => ctx.d2h_stream(), + _ => ctx.h2d_stream(), // H2D and D2D use h2d_stream + }; + + // Perform CUDA transfers based on strategy + match strategy { + TransferStrategy::CudaAsyncH2D => { + let backend = ctx.operational_backend(); + if let Err(e) = try_execute_operational_kernel( + src, + dst, + src_block_ids, + dst_block_ids, + layers.clone(), + stream.as_ref(), + backend, + ) { + // Fallback to memcpy-based path + tracing::debug!("Kernel-based H2D failed ({}), falling back to memcpy", e); + execute_h2d( + src, + dst, + src_block_ids, + dst_block_ids, + layers, + stream.as_ref(), + )?; + } + } + TransferStrategy::CudaAsyncD2H => { + let backend = ctx.operational_backend(); + if let Err(e) = try_execute_operational_kernel( + src, + dst, + src_block_ids, + dst_block_ids, + layers.clone(), + stream.as_ref(), + backend, + ) { + // Fallback to memcpy-based path + tracing::debug!("Kernel-based D2H failed ({}), falling back to memcpy", e); + execute_d2h( + src, + dst, + src_block_ids, + dst_block_ids, + layers, + stream.as_ref(), + )?; + } + } + TransferStrategy::CudaAsyncD2D => { + // Try kernel-based path first, fall back to memcpy on error + let backend = ctx.operational_backend(); + if let Err(e) = try_execute_operational_kernel( + src, + dst, + src_block_ids, + dst_block_ids, + layers.clone(), + stream.as_ref(), + backend, + ) { + // Fallback to memcpy-based path + tracing::debug!("Kernel-based D2D failed ({}), falling back to memcpy", e); + execute_d2d( + src, + dst, + src_block_ids, + dst_block_ids, + layers, + stream.as_ref(), + )?; 
+ } + } + TransferStrategy::CudaBlockingH2D => { + execute_h2d( + src, + dst, + src_block_ids, + dst_block_ids, + layers, + stream.as_ref(), + )?; + // Synchronize immediately for blocking transfer + stream.synchronize()?; + } + TransferStrategy::CudaBlockingD2H => { + execute_d2h( + src, + dst, + src_block_ids, + dst_block_ids, + layers, + stream.as_ref(), + )?; + // Synchronize immediately for blocking transfer + stream.synchronize()?; + } + _ => { + return Err(anyhow!("Invalid CUDA transfer strategy: {:?}", strategy)); + } + } + + // For async transfers, record an event and register it for completion tracking + if matches!( + strategy, + TransferStrategy::CudaAsyncH2D + | TransferStrategy::CudaAsyncD2H + | TransferStrategy::CudaAsyncD2D + ) { + let event = stream.record_event(None)?; + Ok(ctx.register_cuda_event(event)) + } else { + // Blocking transfers are already synchronized + Ok(TransferCompleteNotification::completed()) + } +} + +/// Execute host-to-device transfer. +fn execute_h2d( + src: &PhysicalLayout, + dst: &PhysicalLayout, + src_block_ids: &[usize], + dst_block_ids: &[usize], + layers: Range, + stream: &cudarc::driver::CudaStream, +) -> Result<()> { + for (&src_block_id, &dst_block_id) in src_block_ids.iter().zip(dst_block_ids.iter()) { + for layer_id in layers.clone() { + for outer_id in 0..src.layout().outer_dim() { + let src_region = src.memory_region(src_block_id, layer_id, outer_id)?; + let dst_region = dst.memory_region(dst_block_id, layer_id, outer_id)?; + + if src_region.size() != dst_region.size() { + return Err(anyhow!( + "Size mismatch at block=({},{}), layer={}, outer={}: src={}, dst={}", + src_block_id, + dst_block_id, + layer_id, + outer_id, + src_region.size(), + dst_region.size() + )); + } + + unsafe { + let src_ptr = src_region.addr() as *const u8; + let dst_ptr = dst_region.addr() as u64; + let src_slice = std::slice::from_raw_parts(src_ptr, src_region.size()); + cuda_result::memcpy_htod_async(dst_ptr, src_slice, stream.cu_stream())?; + } + } + } + } + Ok(()) +} + +/// Execute device-to-host transfer. +fn execute_d2h( + src: &PhysicalLayout, + dst: &PhysicalLayout, + src_block_ids: &[usize], + dst_block_ids: &[usize], + layers: Range, + stream: &cudarc::driver::CudaStream, +) -> Result<()> { + for (&src_block_id, &dst_block_id) in src_block_ids.iter().zip(dst_block_ids.iter()) { + for layer_id in layers.clone() { + for outer_id in 0..src.layout().outer_dim() { + let src_region = src.memory_region(src_block_id, layer_id, outer_id)?; + let dst_region = dst.memory_region(dst_block_id, layer_id, outer_id)?; + + if src_region.size() != dst_region.size() { + return Err(anyhow!( + "Size mismatch at block=({},{}), layer={}, outer={}: src={}, dst={}", + src_block_id, + dst_block_id, + layer_id, + outer_id, + src_region.size(), + dst_region.size() + )); + } + + unsafe { + let src_ptr = src_region.addr() as u64; + let dst_ptr = dst_region.addr() as *mut u8; + let dst_slice = std::slice::from_raw_parts_mut(dst_ptr, dst_region.size()); + cuda_result::memcpy_dtoh_async(dst_slice, src_ptr, stream.cu_stream())?; + } + } + } + } + Ok(()) +} + +/// Execute device-to-device transfer. 
+fn execute_d2d( + src: &PhysicalLayout, + dst: &PhysicalLayout, + src_block_ids: &[usize], + dst_block_ids: &[usize], + layers: Range, + stream: &cudarc::driver::CudaStream, +) -> Result<()> { + for (&src_block_id, &dst_block_id) in src_block_ids.iter().zip(dst_block_ids.iter()) { + for layer_id in layers.clone() { + for outer_id in 0..src.layout().outer_dim() { + let src_region = src.memory_region(src_block_id, layer_id, outer_id)?; + let dst_region = dst.memory_region(dst_block_id, layer_id, outer_id)?; + + if src_region.size() != dst_region.size() { + return Err(anyhow!( + "Size mismatch at block=({},{}), layer={}, outer={}: src={}, dst={}", + src_block_id, + dst_block_id, + layer_id, + outer_id, + src_region.size(), + dst_region.size() + )); + } + + unsafe { + let src_ptr = src_region.addr() as u64; + let dst_ptr = dst_region.addr() as u64; + cuda_result::memcpy_dtod_async( + dst_ptr, + src_ptr, + src_region.size(), + stream.cu_stream(), + )?; + } + } + } + } + Ok(()) +} + +/// TODO: For now, we've stubbed this out just so we can merge. +/// For now, we'll always just fall back to memcpy. +#[cfg_attr(test, allow(dead_code))] +pub(crate) fn try_execute_operational_kernel( + _src: &PhysicalLayout, + _dst: &PhysicalLayout, + _src_block_ids: &[usize], + _dst_block_ids: &[usize], + _layers: Range, + _stream: &cudarc::driver::CudaStream, + _backend: OperationalCopyBackend, +) -> Result<()> { + anyhow::bail!("Not implemented."); +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/executor/memcpy.rs b/lib/llm/src/block_manager/v2/physical/transfer/executor/memcpy.rs new file mode 100644 index 0000000000..52d27cc476 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/executor/memcpy.rs @@ -0,0 +1,84 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Memcpy executor for host-to-host transfers. + +use crate::block_manager::v2::physical::transfer::PhysicalLayout; +use crate::block_manager::v2::physical::transfer::context::TransferCompleteNotification; +use anyhow::Result; +use std::ops::Range; + +/// Execute a memcpy transfer between host memory locations. +/// +/// This executor handles transfers between System and Pinned memory using +/// standard CPU memcpy operations. The transfer is synchronous and blocking. 
+///
+/// # Arguments
+/// * `src` - Source physical layout
+/// * `dst` - Destination physical layout
+/// * `src_block_ids` - Source block IDs to transfer
+/// * `dst_block_ids` - Destination block IDs to transfer (paired with `src_block_ids` by index)
+/// * `layer_range` - Optional range of layers to transfer (None = all layers)
+pub fn execute_memcpy_transfer(
+    src: &PhysicalLayout,
+    dst: &PhysicalLayout,
+    src_block_ids: &[usize],
+    dst_block_ids: &[usize],
+    layer_range: Option<Range<usize>>,
+) -> Result<TransferCompleteNotification> {
+    // Validate layouts have compatible structure
+    let src_layout = src.layout();
+    let dst_layout = dst.layout();
+
+    if src_layout.num_layers() != dst_layout.num_layers() {
+        return Err(anyhow::anyhow!(
+            "Layouts have incompatible layer counts: src={}, dst={}",
+            src_layout.num_layers(),
+            dst_layout.num_layers()
+        ));
+    }
+
+    if src_layout.outer_dim() != dst_layout.outer_dim() {
+        return Err(anyhow::anyhow!(
+            "Layouts have incompatible outer dimensions: src={}, dst={}",
+            src_layout.outer_dim(),
+            dst_layout.outer_dim()
+        ));
+    }
+
+    // Determine layer range
+    let layers = layer_range.unwrap_or(0..src_layout.num_layers());
+
+    // Perform synchronous copies
+    for (&src_block_id, &dst_block_id) in src_block_ids.iter().zip(dst_block_ids.iter()) {
+        for layer_id in layers.clone() {
+            for outer_id in 0..src_layout.outer_dim() {
+                // Get source and destination memory regions
+                let src_region = src.memory_region(src_block_id, layer_id, outer_id)?;
+                let dst_region = dst.memory_region(dst_block_id, layer_id, outer_id)?;
+
+                // Validate sizes match
+                if src_region.size() != dst_region.size() {
+                    return Err(anyhow::anyhow!(
+                        "Memory region size mismatch at block=({},{}), layer={}, outer={}: src={}, dst={}",
+                        src_block_id,
+                        dst_block_id,
+                        layer_id,
+                        outer_id,
+                        src_region.size(),
+                        dst_region.size()
+                    ));
+                }
+
+                // Perform memcpy
+                unsafe {
+                    let src_ptr = src_region.addr() as *const u8;
+                    let dst_ptr = dst_region.addr() as *mut u8;
+                    std::ptr::copy_nonoverlapping(src_ptr, dst_ptr, src_region.size());
+                }
+            }
+        }
+    }
+
+    // Memcpy is synchronous, so return already-completed notification
+    Ok(TransferCompleteNotification::completed())
+}
diff --git a/lib/llm/src/block_manager/v2/physical/transfer/executor/mod.rs b/lib/llm/src/block_manager/v2/physical/transfer/executor/mod.rs
new file mode 100644
index 0000000000..a3eeb36379
--- /dev/null
+++ b/lib/llm/src/block_manager/v2/physical/transfer/executor/mod.rs
@@ -0,0 +1,303 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Transfer executors for different copy strategies.
+
+pub(super) mod cuda;
+mod memcpy;
+mod nixl;
+
+use super::strategy::select_strategy;
+use super::validation::validate_block_transfer;
+use super::{PhysicalLayout, TransferContext, TransferOptions, TransferPlan, TransferStrategy};
+use crate::block_manager::v2::physical::transfer::{
+    StorageKind, context::TransferCompleteNotification,
+};
+use anyhow::Result;
+use std::ops::Range;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, Ordering};
+
+// Re-export the NIXL transfer builder for public use
+pub use nixl::NixlTransferBuilder;
+
+/// Execute a transfer between two physical layouts.
+///
+/// This is an internal entry point for all transfer operations called by TransportManager.
+/// It selects the appropriate strategy and dispatches to the corresponding executor.
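+///
+/// Callers normally go through `TransportManager::execute_transfer`, which resolves
+/// handles to layouts and forwards here. A hedged sketch of that outer call (an async
+/// context and valid handles are assumed):
+///
+/// ```ignore
+/// let notification = manager.execute_transfer(
+///     src_handle,
+///     &[0, 1],
+///     dst_handle,
+///     &[0, 1],
+///     TransferOptions::default(),
+/// )?;
+/// notification.await?;
+/// ```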
+/// +/// # Arguments +/// * `src` - Source physical layout +/// * `dst` - Destination physical layout +/// * `src_block_ids` - Source block IDs to transfer +/// * `dst_block_ids` - Destination block IDs to transfer +/// * `layer_range` - Optional range of layers to transfer (None = all layers) +/// * `ctx` - Transfer context with CUDA stream and NIXL agent +pub fn execute_transfer( + src: &PhysicalLayout, + dst: &PhysicalLayout, + src_block_ids: &[usize], + dst_block_ids: &[usize], + options: TransferOptions, + ctx: &TransferContext, +) -> Result { + // Validate block IDs + validate_block_transfer(src_block_ids, dst_block_ids, None, src, dst, None)?; + + // Select transfer plan based on locations and capabilities + let plan = select_strategy(src, dst, ctx)?; + + // Dispatch based on plan type + match plan { + TransferPlan::Direct(strategy) => execute_direct_transfer( + src, + dst, + src_block_ids, + dst_block_ids, + options.layer_range, + strategy, + ctx, + ), + TransferPlan::TwoHop { + first, + bounce_location, + second, + } => execute_two_hop_transfer(TwoHopTransferParams { + src, + dst, + src_block_ids, + dst_block_ids, + first_strategy: first, + bounce_location, + second_strategy: second, + options, + ctx, + }), + } +} + +/// Execute a direct single-hop transfer. +fn execute_direct_transfer( + src: &PhysicalLayout, + dst: &PhysicalLayout, + src_block_ids: &[usize], + dst_block_ids: &[usize], + layer_range: Option>, + strategy: TransferStrategy, + ctx: &TransferContext, +) -> Result { + match strategy { + TransferStrategy::Memcpy => { + memcpy::execute_memcpy_transfer(src, dst, src_block_ids, dst_block_ids, layer_range) + } + TransferStrategy::CudaAsyncH2D + | TransferStrategy::CudaAsyncD2H + | TransferStrategy::CudaAsyncD2D + | TransferStrategy::CudaBlockingH2D + | TransferStrategy::CudaBlockingD2H => Ok(cuda::execute_cuda_transfer( + src, + dst, + src_block_ids, + dst_block_ids, + layer_range, + strategy, + ctx, + )?), + TransferStrategy::NixlRead + | TransferStrategy::NixlWrite + | TransferStrategy::NixlReadFlipped + | TransferStrategy::NixlWriteFlipped => { + let mut builder = NixlTransferBuilder::new() + .src(src) + .dst(dst) + .src_blocks(src_block_ids) + .dst_blocks(dst_block_ids) + .strategy(strategy); + + if let Some(range) = layer_range { + builder = builder.layer_range(range); + } + + builder.execute(ctx) + } + TransferStrategy::Invalid => Err(anyhow::anyhow!( + "Invalid transfer strategy for src={:?}, dst={:?}", + src.location(), + dst.location() + )), + } +} + +#[allow(clippy::too_many_arguments)] +async fn execute_two_hop_transfer_chunk( + src: &PhysicalLayout, + bounce_layout: &PhysicalLayout, + dst: &PhysicalLayout, + src_block_ids: &[usize], + bounce_block_ids: &[usize], + dst_block_ids: &[usize], + first_strategy: TransferStrategy, + second_strategy: TransferStrategy, + layer_range: &Option>, + ctx: &TransferContext, +) -> Result<()> { + let bounce_ids_to_use = &bounce_block_ids[..src_block_ids.len()]; + + execute_direct_transfer( + src, + bounce_layout, + src_block_ids, + bounce_ids_to_use, + layer_range.clone(), + first_strategy, + ctx, + )? + .await?; + + execute_direct_transfer( + bounce_layout, + dst, + bounce_ids_to_use, + dst_block_ids, + layer_range.clone(), + second_strategy, + ctx, + )? 
+ .await?; + + Ok(()) +} + +/// Parameters for two-hop transfer execution +struct TwoHopTransferParams<'a> { + src: &'a PhysicalLayout, + dst: &'a PhysicalLayout, + src_block_ids: &'a [usize], + dst_block_ids: &'a [usize], + first_strategy: TransferStrategy, + bounce_location: StorageKind, + second_strategy: TransferStrategy, + options: TransferOptions, + ctx: &'a TransferContext, +} + +fn execute_two_hop_transfer(params: TwoHopTransferParams) -> Result { + let TwoHopTransferParams { + src, + dst, + src_block_ids, + dst_block_ids, + first_strategy, + bounce_location, + second_strategy, + options, + ctx, + } = params; + let (tx, rx) = tokio::sync::oneshot::channel(); + + // TODO: Cloning all this stuff is not ideal. + let src_clone = src.clone(); + let dst_clone = dst.clone(); + + let src_block_ids = src_block_ids.to_vec(); + let dst_block_ids = dst_block_ids.to_vec(); + + let options_clone = options.clone(); + + let handle = ctx.tokio(); + let ctx_clone = ctx.clone(); + handle.spawn(async move { + let Some(ref bounce_buffer_spec) = options_clone.bounce_buffer else { + tx.send(Err(anyhow::anyhow!( + "Two-hop transfers require a bounce buffer." + ))) + .unwrap(); + return; + }; + + if bounce_buffer_spec.layout().location() != bounce_location { + tx.send(Err(anyhow::anyhow!( + "Bounce buffer layout does not match bounce location." + ))) + .unwrap(); + return; + } + + let num_bounce_blocks = bounce_buffer_spec.block_ids().len(); + + if num_bounce_blocks < src_block_ids.len() { + for (src_block_ids, dst_block_ids) in src_block_ids + .chunks(num_bounce_blocks) + .zip(dst_block_ids.chunks(num_bounce_blocks)) + { + let bounce_block_ids_to_use = + &bounce_buffer_spec.block_ids()[..src_block_ids.len()]; + if let Err(e) = execute_two_hop_transfer_chunk( + &src_clone, + bounce_buffer_spec.layout(), + &dst_clone, + src_block_ids, + bounce_block_ids_to_use, + dst_block_ids, + first_strategy, + second_strategy, + &options_clone.layer_range, + &ctx_clone, + ) + .await + { + tx.send(Err(e)).unwrap(); + return; + } + } + tx.send(Ok(())).unwrap(); + } else { + let bounce_block_ids_to_use = &bounce_buffer_spec.block_ids()[..src_block_ids.len()]; + let result = execute_two_hop_transfer_chunk( + &src_clone, + bounce_buffer_spec.layout(), + &dst_clone, + src_block_ids.as_slice(), + bounce_block_ids_to_use, + dst_block_ids.as_slice(), + first_strategy, + second_strategy, + &options_clone.layer_range, + &ctx_clone, + ) + .await; + + tx.send(result).unwrap(); + } + }); + + Ok(TransferCompleteNotification { status: rx }) +} + +pub struct TransferNotification { + status: Arc, +} + +impl Default for TransferNotification { + fn default() -> Self { + Self::new() + } +} + +impl TransferNotification { + pub fn new() -> Self { + Self { + status: Arc::new(AtomicBool::new(false)), + } + } + + pub fn done() -> Self { + Self { + status: Arc::new(AtomicBool::new(true)), + } + } + + pub fn is_complete(&self) -> bool { + self.status.load(Ordering::Relaxed) + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/executor/nixl.rs b/lib/llm/src/block_manager/v2/physical/transfer/executor/nixl.rs new file mode 100644 index 0000000000..2fa37f4b38 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/executor/nixl.rs @@ -0,0 +1,320 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Typestate builder for NIXL transfers. +//! +//! 
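`TransferNotification` above tracks completion with a shared `AtomicBool`, in contrast to the oneshot-backed `TransferCompleteNotification` used elsewhere. A small standalone sketch (not part of the patch) of how such a flag is shared between the worker that finishes the copy and the caller that polls it, using the same `Relaxed` ordering as the patch:

```rust
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::thread;
use std::time::Duration;

/// Simplified stand-in for `TransferNotification`: a cloneable handle whose
/// `is_complete()` flips to true once the worker sets the shared flag.
#[derive(Clone)]
struct Notification {
    status: Arc<AtomicBool>,
}

impl Notification {
    fn new() -> Self {
        Self { status: Arc::new(AtomicBool::new(false)) }
    }

    fn mark_done(&self) {
        self.status.store(true, Ordering::Relaxed);
    }

    fn is_complete(&self) -> bool {
        self.status.load(Ordering::Relaxed)
    }
}

fn main() {
    let notif = Notification::new();
    let worker_handle = notif.clone();

    // The "transfer" completes on another thread.
    thread::spawn(move || {
        thread::sleep(Duration::from_millis(10));
        worker_handle.mark_done();
    });

    // Poll until the flag flips (a real caller would await a future instead).
    while !notif.is_complete() {
        thread::yield_now();
    }
}
```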
This module provides a compile-time safe builder for NIXL transfers that ensures +//! all required parameters are set before execution. + +use super::{PhysicalLayout, TransferContext, TransferStrategy}; +use crate::block_manager::v2::physical::transfer::context::TransferCompleteNotification; +use anyhow::{Result, anyhow}; +use nixl_sys::{XferDescList, XferOp}; +use std::marker::PhantomData; +use std::ops::Range; + +/// Marker type for unset builder fields. +pub struct Unset; + +/// Marker type for set builder fields. +pub struct Set; + +/// Typestate builder for NIXL transfers. +/// +/// This builder uses the typestate pattern to ensure all required parameters are set +/// at compile time. The type parameters track which fields have been set: +/// - `TSrc`: Source layout state +/// - `TDst`: Destination layout state +/// - `TSrcBlocks`: Source block IDs state +/// - `TDstBlocks`: Destination block IDs state +/// - `TStrategy`: Transfer strategy state +pub struct NixlTransferBuilder<'a, TSrc, TDst, TSrcBlocks, TDstBlocks, TStrategy> { + src: Option<&'a PhysicalLayout>, + dst: Option<&'a PhysicalLayout>, + src_block_ids: Option<&'a [usize]>, + dst_block_ids: Option<&'a [usize]>, + strategy: Option, + layer_range: Option>, + write_notif: Option, + _phantom: PhantomData<(TSrc, TDst, TSrcBlocks, TDstBlocks, TStrategy)>, +} + +impl<'a> NixlTransferBuilder<'a, Unset, Unset, Unset, Unset, Unset> { + /// Creates a new NIXL transfer builder with all fields unset. + pub fn new() -> Self { + Self { + src: None, + dst: None, + src_block_ids: None, + dst_block_ids: None, + strategy: None, + layer_range: None, + write_notif: None, + _phantom: PhantomData, + } + } +} + +impl<'a> Default for NixlTransferBuilder<'a, Unset, Unset, Unset, Unset, Unset> { + fn default() -> Self { + Self::new() + } +} + +// Required field setters - these consume self and return a new builder with the field marked as Set + +impl<'a, TDst, TSrcBlocks, TDstBlocks, TStrategy> + NixlTransferBuilder<'a, Unset, TDst, TSrcBlocks, TDstBlocks, TStrategy> +{ + /// Sets the source physical layout. + pub fn src( + self, + src: &'a PhysicalLayout, + ) -> NixlTransferBuilder<'a, Set, TDst, TSrcBlocks, TDstBlocks, TStrategy> { + NixlTransferBuilder { + src: Some(src), + dst: self.dst, + src_block_ids: self.src_block_ids, + dst_block_ids: self.dst_block_ids, + strategy: self.strategy, + layer_range: self.layer_range, + write_notif: self.write_notif, + _phantom: PhantomData, + } + } +} + +impl<'a, TSrc, TSrcBlocks, TDstBlocks, TStrategy> + NixlTransferBuilder<'a, TSrc, Unset, TSrcBlocks, TDstBlocks, TStrategy> +{ + /// Sets the destination physical layout. + pub fn dst( + self, + dst: &'a PhysicalLayout, + ) -> NixlTransferBuilder<'a, TSrc, Set, TSrcBlocks, TDstBlocks, TStrategy> { + NixlTransferBuilder { + src: self.src, + dst: Some(dst), + src_block_ids: self.src_block_ids, + dst_block_ids: self.dst_block_ids, + strategy: self.strategy, + layer_range: self.layer_range, + write_notif: self.write_notif, + _phantom: PhantomData, + } + } +} + +impl<'a, TSrc, TDst, TDstBlocks, TStrategy> + NixlTransferBuilder<'a, TSrc, TDst, Unset, TDstBlocks, TStrategy> +{ + /// Sets the source block IDs to transfer. 
+ pub fn src_blocks( + self, + src_block_ids: &'a [usize], + ) -> NixlTransferBuilder<'a, TSrc, TDst, Set, TDstBlocks, TStrategy> { + NixlTransferBuilder { + src: self.src, + dst: self.dst, + src_block_ids: Some(src_block_ids), + dst_block_ids: self.dst_block_ids, + strategy: self.strategy, + layer_range: self.layer_range, + write_notif: self.write_notif, + _phantom: PhantomData, + } + } +} + +impl<'a, TSrc, TDst, TSrcBlocks, TStrategy> + NixlTransferBuilder<'a, TSrc, TDst, TSrcBlocks, Unset, TStrategy> +{ + /// Sets the destination block IDs to transfer. + pub fn dst_blocks( + self, + dst_block_ids: &'a [usize], + ) -> NixlTransferBuilder<'a, TSrc, TDst, TSrcBlocks, Set, TStrategy> { + NixlTransferBuilder { + src: self.src, + dst: self.dst, + src_block_ids: self.src_block_ids, + dst_block_ids: Some(dst_block_ids), + strategy: self.strategy, + layer_range: self.layer_range, + write_notif: self.write_notif, + _phantom: PhantomData, + } + } +} + +impl<'a, TSrc, TDst, TSrcBlocks, TDstBlocks> + NixlTransferBuilder<'a, TSrc, TDst, TSrcBlocks, TDstBlocks, Unset> +{ + /// Sets the NIXL transfer strategy (Read or Write). + pub fn strategy( + self, + strategy: TransferStrategy, + ) -> NixlTransferBuilder<'a, TSrc, TDst, TSrcBlocks, TDstBlocks, Set> { + NixlTransferBuilder { + src: self.src, + dst: self.dst, + src_block_ids: self.src_block_ids, + dst_block_ids: self.dst_block_ids, + strategy: Some(strategy), + layer_range: self.layer_range, + write_notif: self.write_notif, + _phantom: PhantomData, + } + } +} + +// Optional field setters - these can be called at any point in the builder chain + +impl<'a, TSrc, TDst, TSrcBlocks, TDstBlocks, TStrategy> + NixlTransferBuilder<'a, TSrc, TDst, TSrcBlocks, TDstBlocks, TStrategy> +{ + /// Sets an optional range of layers to transfer. + /// If not called, all layers will be transferred. + pub fn layer_range(mut self, layer_range: Range) -> Self { + self.layer_range = Some(layer_range); + self + } + + /// Sets an optional write notification UUID. + pub fn write_notif(mut self, write_notif: uuid::Uuid) -> Self { + self.write_notif = Some(write_notif); + self + } +} + +// Execute method - only available when all required fields are Set + +impl<'a> NixlTransferBuilder<'a, Set, Set, Set, Set, Set> { + /// Executes the NIXL transfer with the configured parameters. + /// + /// This method is only available when all required fields have been set, + /// enforced at compile time by the typestate pattern. 
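Each of the required setters above flips one marker from `Unset` to `Set`. A toy builder (not the patch's `NixlTransferBuilder`, just the pattern it uses) showing how `PhantomData` marker types make `execute()` available only after the required setter has been called:

```rust
use std::marker::PhantomData;

/// Marker types, mirroring the `Unset`/`Set` markers used by the NIXL builder.
struct Unset;
struct Set;

/// Toy builder with one required field (`value`) tracked in the type parameter.
struct Builder<TValue> {
    value: Option<u32>,
    _state: PhantomData<TValue>,
}

impl Builder<Unset> {
    fn new() -> Self {
        Self { value: None, _state: PhantomData }
    }

    /// Setting the required field moves the builder into the `Set` state.
    fn value(self, value: u32) -> Builder<Set> {
        Builder { value: Some(value), _state: PhantomData }
    }
}

impl Builder<Set> {
    /// Only callable once the required field is set; the unwrap is safe
    /// because the `Set` state guarantees `value` is `Some`.
    fn execute(self) -> u32 {
        self.value.unwrap()
    }
}

fn main() {
    let result = Builder::new().value(7).execute();
    assert_eq!(result, 7);
    // Builder::new().execute(); // does not compile: `execute` requires the Set state
}
```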
+ pub(crate) fn execute(self, ctx: &TransferContext) -> Result { + // Unwrap all required fields (safe because typestate guarantees they're set) + let src = self.src.unwrap(); + let dst = self.dst.unwrap(); + let src_block_ids = self.src_block_ids.unwrap(); + let dst_block_ids = self.dst_block_ids.unwrap(); + let strategy = self.strategy.unwrap(); + let layer_range = self.layer_range; + let _write_notif = self.write_notif; + + // Validate layouts + let src_layout = src.layout(); + let dst_layout = dst.layout(); + + if src_layout.num_layers() != dst_layout.num_layers() { + return Err(anyhow!( + "Layouts have incompatible layer counts: src={}, dst={}", + src_layout.num_layers(), + dst_layout.num_layers() + )); + } + + if src_layout.outer_dim() != dst_layout.outer_dim() { + return Err(anyhow!( + "Layouts have incompatible outer dimensions: src={}, dst={}", + src_layout.outer_dim(), + dst_layout.outer_dim() + )); + } + + // Get NIXL agent + let nixl_agent = ctx.nixl_agent(); + + // Determine layer range + let layers = layer_range.unwrap_or(0..src_layout.num_layers()); + + // Determine NIXL operation type + let xfer_op = match strategy { + TransferStrategy::NixlRead | TransferStrategy::NixlReadFlipped => XferOp::Read, + TransferStrategy::NixlWrite | TransferStrategy::NixlWriteFlipped => XferOp::Write, + _ => { + return Err(anyhow!("Invalid NIXL transfer strategy: {:?}", strategy)); + } + }; + + assert!( + nixl_agent.name() == src.nixl_metadata().agent_name(), + "the source must be local" + ); + + // Capture NIXL metadata for both layouts + let src_metadata = src.nixl_metadata(); + let dst_metadata = dst.nixl_metadata(); + + let src_mem_type = src_metadata.mem_type(); + let dst_mem_type = dst_metadata.mem_type(); + + let src_device_id = src_metadata.device_id(); + let dst_device_id = dst_metadata.device_id(); + + // Build XferDescLists for source and destination + let mut src_dl = XferDescList::new(src_mem_type)?; + let mut dst_dl = XferDescList::new(dst_mem_type)?; + + // Add memory regions to descriptor lists + for (&src_block_id, &dst_block_id) in src_block_ids.iter().zip(dst_block_ids.iter()) { + for layer_id in layers.clone() { + for outer_id in 0..src_layout.outer_dim() { + let src_region = src.memory_region(src_block_id, layer_id, outer_id)?; + let dst_region = dst.memory_region(dst_block_id, layer_id, outer_id)?; + + if src_region.size() != dst_region.size() { + return Err(anyhow!( + "Size mismatch at block=({},{}), layer={}, outer={}: src={}, dst={}", + src_block_id, + dst_block_id, + layer_id, + outer_id, + src_region.size(), + dst_region.size() + )); + } + + // Add to source descriptor list + src_dl.add_desc(src_region.addr(), src_region.size(), src_device_id)?; + + // Add to destination descriptor list + dst_dl.add_desc(dst_region.addr(), dst_region.size(), dst_device_id)?; + } + } + } + + // Note: Overlap detection was removed from nixl-sys 0.6.1 + // The NIXL library now handles overlap detection internally + + if matches!( + strategy, + TransferStrategy::NixlReadFlipped | TransferStrategy::NixlWriteFlipped + ) { + std::mem::swap(&mut src_dl, &mut dst_dl); + } + + // Create transfer request + let xfer_req = nixl_agent.create_xfer_req( + xfer_op, + &src_dl, + &dst_dl, + dst_metadata.agent_name(), + None, // opt_args + )?; + + // Post transfer request + // Note: Notification handling via OptArgs can be added later if needed + let still_pending = nixl_agent.post_xfer_req(&xfer_req, None)?; + + if still_pending { + // Register for async completion via status polling + 
Ok(ctx.register_nixl_status(xfer_req)) + } else { + // Transfer completed synchronously + Ok(TransferCompleteNotification::completed()) + } + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/fill.rs b/lib/llm/src/block_manager/v2/physical/transfer/fill.rs new file mode 100644 index 0000000000..d24b6824ea --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/fill.rs @@ -0,0 +1,273 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Block filling operations for testing. +//! +//! This module provides utilities to populate blocks with specific patterns +//! for verification in round-trip tests. + +use super::PhysicalLayout; + +use crate::block_manager::v2::memory::StorageKind; +use aligned_vec::{AVec, avec}; +use anyhow::{Result, anyhow}; +use cudarc::runtime::sys::{cudaMemcpy, cudaMemcpyKind}; + +use std::{ + fs::File, + io::{Seek, Write}, + mem::ManuallyDrop, + ops::Range, + os::fd::FromRawFd, +}; + +/// Fill strategy for block memory. +#[derive(Debug, Clone, Copy)] +pub enum FillPattern { + /// Fill with a constant byte value + Constant(u8), + + /// Fill with a sequential pattern: block_id + layer_id + offset % 256 + Sequential, +} + +/// Fill blocks in a physical layout with a specific pattern. +/// +/// This operation directly writes to memory and should only be used on +/// local layouts. Remote layouts cannot be filled directly. +/// +/// # Arguments +/// * `layout` - The physical layout containing the blocks +/// * `block_ids` - List of block IDs to fill +/// * `pattern` - Fill pattern to use +/// +/// # Errors +/// Returns an error if: +/// - Layout is remote (cannot fill remote memory directly) +/// - Block IDs are out of range +/// - Memory access fails +pub fn fill_blocks( + layout: &PhysicalLayout, + block_ids: &[usize], + pattern: FillPattern, +) -> Result<()> { + // Can only fill local layouts + let config = layout.layout().config(); + let num_layers = config.num_layers; + let outer_dim = config.outer_dim; + + for &block_id in block_ids { + if block_id >= config.num_blocks { + return Err(anyhow!("Block ID {} out of range", block_id)); + } + + // Fill all layers and outer dimensions for this block + for layer_id in 0..num_layers { + for outer_id in 0..outer_dim { + let region = layout.memory_region(block_id, layer_id, outer_id)?; + + match layout.location() { + StorageKind::System | StorageKind::Pinned => { + fill_memory_region( + region.addr(), + region.size(), + block_id, + layer_id, + pattern, + )?; + } + StorageKind::Device(_) => { + let system_region: Vec = vec![0; region.size()]; + fill_memory_region( + system_region.as_ptr() as usize, + system_region.len(), + block_id, + layer_id, + pattern, + )?; + unsafe { + cudaMemcpy( + region.addr() as *mut std::ffi::c_void, + system_region.as_ptr() as *const std::ffi::c_void, + region.size(), + cudaMemcpyKind::cudaMemcpyHostToDevice, + ); + } + } + StorageKind::Disk(fd) => { + let system_region: AVec = avec![[4096]| 0; region.size()]; + fill_memory_region( + system_region.as_ptr() as usize, + system_region.len(), + block_id, + layer_id, + pattern, + )?; + + let mut file = ManuallyDrop::new(unsafe { File::from_raw_fd(fd as i32) }); + + file.seek(std::io::SeekFrom::Start(region.addr() as u64))?; + file.write_all(&system_region)?; + file.sync_all()?; + file.flush()?; + } + } + } + } + } + + Ok(()) +} + +/// Fill a subset of layers in blocks with a specific pattern. 
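`FillPattern::Sequential` writes the byte `((block_id + layer_id + offset) % 256) as u8` at each offset, which is what the fill unit tests further below rely on. A standalone sketch of the formula (not part of the patch):

```rust
/// Byte value written by `FillPattern::Sequential` at a given offset.
fn sequential_byte(block_id: usize, layer_id: usize, offset: usize) -> u8 {
    ((block_id + layer_id + offset) % 256) as u8
}

fn main() {
    // Block 0, layer 0: bytes 0, 1, 2, ...
    assert_eq!(sequential_byte(0, 0, 0), 0);
    assert_eq!(sequential_byte(0, 0, 1), 1);

    // Block 1, layer 1: bytes 2, 3, 4, ... (matches the fill.rs unit test)
    assert_eq!(sequential_byte(1, 1, 0), 2);
    assert_eq!(sequential_byte(1, 1, 1), 3);

    // The pattern wraps at 256 so every value fits in a u8.
    assert_eq!(sequential_byte(0, 0, 256), 0);
}
```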
+/// +/// # Arguments +/// * `layout` - The physical layout containing the blocks +/// * `block_ids` - List of block IDs to fill +/// * `layer_range` - Range of layers to fill +/// * `pattern` - Fill pattern to use +pub fn fill_layers( + layout: &PhysicalLayout, + block_ids: &[usize], + layer_range: Range, + pattern: FillPattern, +) -> Result<()> { + let config = layout.layout().config(); + let num_layers = config.num_layers; + let outer_dim = config.outer_dim; + + if layer_range.end > num_layers { + return Err(anyhow!( + "Layer range {:?} exceeds num_layers {}", + layer_range, + num_layers + )); + } + + for &block_id in block_ids { + if block_id >= config.num_blocks { + return Err(anyhow!("Block ID {} out of range", block_id)); + } + + // Fill specified layers and all outer dimensions + for layer_id in layer_range.clone() { + for outer_id in 0..outer_dim { + let region = layout.memory_region(block_id, layer_id, outer_id)?; + fill_memory_region(region.addr(), region.size(), block_id, layer_id, pattern)?; + } + } + } + + Ok(()) +} + +/// Fill a memory region with the specified pattern. +/// +/// # Safety +/// This function performs unsafe memory writes. The caller must ensure: +/// - The memory region is valid and accessible +/// - No other references exist to this memory +fn fill_memory_region( + addr: usize, + size: usize, + block_id: usize, + layer_id: usize, + pattern: FillPattern, +) -> Result<()> { + unsafe { + let ptr = addr as *mut u8; + match pattern { + FillPattern::Constant(value) => { + std::ptr::write_bytes(ptr, value, size); + } + FillPattern::Sequential => { + for offset in 0..size { + let value = ((block_id + layer_id + offset) % 256) as u8; + ptr.add(offset).write(value); + } + } + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::super::tests::*; + use super::*; + use crate::block_manager::v2::memory::actions::Slice; + + #[test] + fn test_fill_blocks_constant() { + let physical = builder(2) + .fully_contiguous() + .allocate_system() + .build() + .unwrap(); + + fill_blocks(&physical, &[0, 1], FillPattern::Constant(42)).unwrap(); + + // Verify all bytes are set to 42 + assert!( + physical + .memory_region(0, 0, 0) + .unwrap() + .as_slice() + .unwrap() + .iter() + .all(|&b| b == 42) + ); + } + + #[test] + fn test_fill_blocks_sequential() { + let physical = builder(2) + .fully_contiguous() + .allocate_system() + .build() + .unwrap(); + + fill_blocks(&physical, &[0, 1], FillPattern::Sequential).unwrap(); + + let mr = physical.memory_region(0, 0, 0).unwrap(); + let mr_slice = mr.as_slice().unwrap(); + + // Verify pattern is applied (spot check a few bytes) + let first_byte = mr_slice[0]; + let second_byte = mr_slice[1]; + assert_eq!(first_byte, 0); + assert_eq!(second_byte, first_byte.wrapping_add(1)); + + let mr = physical.memory_region(1, 1, 0).unwrap(); + let mr_slice = mr.as_slice().unwrap(); + + let first_byte = mr_slice[0]; + let second_byte = mr_slice[1]; + assert_eq!(first_byte, 2); + assert_eq!(second_byte, first_byte.wrapping_add(1)); + } + + #[test] + fn test_fill_layers() { + let physical = builder(2) + .fully_contiguous() + .allocate_system() + .build() + .unwrap(); + + // Fill only layer 0 + fill_layers(&physical, &[0], 0..1, FillPattern::Constant(0)).unwrap(); + fill_layers(&physical, &[0], 1..2, FillPattern::Constant(1)).unwrap(); + fill_layers(&physical, &[1], 0..1, FillPattern::Constant(100)).unwrap(); + fill_layers(&physical, &[1], 1..2, FillPattern::Constant(101)).unwrap(); + + let mr_00 = physical.memory_region(0, 0, 
0).unwrap().as_slice().unwrap()[0]; + let mr_01 = physical.memory_region(0, 1, 0).unwrap().as_slice().unwrap()[0]; + let mr_10 = physical.memory_region(1, 0, 0).unwrap().as_slice().unwrap()[0]; + let mr_11 = physical.memory_region(1, 1, 0).unwrap().as_slice().unwrap()[0]; + assert_eq!(mr_00, 0); + assert_eq!(mr_01, 1); + assert_eq!(mr_10, 100); + assert_eq!(mr_11, 101); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/mod.rs b/lib/llm/src/block_manager/v2/physical/transfer/mod.rs new file mode 100644 index 0000000000..18935809f2 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/mod.rs @@ -0,0 +1,120 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Transfer module for copying blocks between layouts with different storage locations. +//! +//! This module provides functionality for transferring KV cache blocks between layouts +//! that may be backed by different storage types (GPU memory, pinned host memory, disk, etc.) +//! and potentially across NIXL-connected remote nodes. +//! +//! # Core Concepts +//! +//! - [`PhysicalLayout`]: Wraps a layout with its physical storage location and NIXL metadata +//! - [`LayoutDescriptor`]: Serializable representation for cross-node communication +//! - Transfer strategies: memcpy, CUDA, NIXL based on source/destination locations +//! - Block-wise and layer-wise transfer operations +//! +//! # Usage +//! +//! ```rust,ignore +//! use dynamo_kvbm::v2::transfer::{PhysicalLayout, transfer_blocks}; +//! +//! // Create local physical layout with NIXL registration +//! let src = PhysicalLayout::new_local(src_layout, StorageKind::Device(0)) +//! .with_nixl_registration("local_agent".to_string())?; +//! +//! // Create remote physical layout +//! let dst = PhysicalLayout::new_remote( +//! dst_layout, +//! StorageKind::Pinned, +//! "remote_agent".to_string() +//! ); +//! +//! // Transfer blocks from local to remote +//! let src_block_ids = [0, 1, 2]; +//! let dst_block_ids = [0, 1, 2]; +//! let future = transfer_blocks(&src, &dst, &src_block_ids, &dst_block_ids, &ctx)?; +//! future.await?; +//! ``` + +pub mod capabilities; +pub mod checksum; +pub mod context; +pub mod executor; +pub mod fill; +pub mod nixl_agent; +pub mod notifications; +pub mod options; +pub mod preferences; +pub mod strategy; +pub mod validation; + +#[cfg(test)] +mod tests; + +// Re-export StorageKind +pub use crate::block_manager::v2::memory::StorageKind; + +pub use capabilities::TransferCapabilities; +pub use checksum::{BlockChecksum, compute_block_checksums, compute_layer_checksums}; +pub use fill::{FillPattern, fill_blocks, fill_layers}; +pub use nixl_agent::{NixlAgent, NixlBackendConfig}; +pub use options::{TransferOptions, TransferOptionsBuilder}; +pub use preferences::{NativeVsNixlPolicy, TransferPreferences}; +pub use strategy::{TransferPlan, TransferStrategy}; +pub use validation::BlockValidationError; + +// Internal - TransferContext is now managed by TransportManager +pub(crate) use context::TransferContext; + +pub use super::layout::PhysicalLayout; + +// Re-export manager types - TransportManager is the primary public API +pub use super::manager::{LayoutHandle, SerializedLayout, TransportManager, WorkerAddress}; + +// #[cfg(test)] +// pub use testing::{RoundTripTest, RoundTripTestResult}; + +use anyhow::Result; + +/// Future representing an in-progress transfer operation. +/// +/// The transfer completes when this future resolves. 
+pub type TransferFuture = std::pin::Pin> + Send>>; + +/// Specification for bounce buffer in multi-hop transfers. +/// +/// This structure provides the layout and block IDs to use as an intermediate +/// staging area when direct transfers are not allowed. +pub trait BounceBufferSpec: Send + Sync { + fn layout(&self) -> &PhysicalLayout; + fn block_ids(&self) -> &[usize]; +} + +// #[cfg(all(test, feature = "testing-cuda"))] +// mod cuda_integration_tests { +// use super::*; +// use crate::block_manager::v2::layout::{ +// FullyContiguousLayout, Layout, LayoutConfig, MemoryRegion, OwnedMemoryRegion, +// }; +// use cudarc::driver::CudaContext; +// use std::sync::Arc; + +// // TODO: Add CUDA-specific integration tests +// // These would test: +// // - H2D transfers +// // - D2H transfers +// // - D2D transfers +// // - Async completion via event synchronization +// } + +// #[cfg(all(test, feature = "testing-nixl"))] +// mod nixl_integration_tests { +// use super::*; + +// // TODO: Add NIXL-specific integration tests +// // These would test: +// // - Remote memory access via NIXL Read +// // - Disk-backed transfers via NIXL Write +// // - Cross-node serialization with LayoutDescriptor +// } diff --git a/lib/llm/src/block_manager/v2/physical/transfer/nixl_agent/config.rs b/lib/llm/src/block_manager/v2/physical/transfer/nixl_agent/config.rs new file mode 100644 index 0000000000..b25680fe3b --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/nixl_agent/config.rs @@ -0,0 +1,170 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! NIXL backend configuration with Figment support. +//! +//! This module provides configuration extraction for NIXL backends from +//! environment variables with the pattern: `DYN_KVBM_NIXL_BACKEND__=` + +use anyhow::{Result, bail}; +use dynamo_runtime::config::parse_bool; +use std::collections::HashSet; + +/// Configuration for NIXL backends. +/// +/// Supports extracting backend configurations from environment variables: +/// - `DYN_KVBM_NIXL_BACKEND_UCX=true` - Enable UCX backend with default params +/// - `DYN_KVBM_NIXL_BACKEND_GDS=false` - Explicitly disable GDS backend +/// - Valid values: true/false, 1/0, on/off, yes/no (case-insensitive) +/// - Invalid values (e.g., "maybe", "random") will cause an error +/// - Custom params (e.g., `DYN_KVBM_NIXL_BACKEND_UCX_PARAM1=value`) will cause an error +/// +/// # Examples +/// +/// ```rust,ignore +/// // Extract from environment +/// let config = NixlBackendConfig::from_env()?; +/// +/// // Or combine with builder overrides +/// let config = NixlBackendConfig::from_env()? +/// .with_backend("ucx") +/// .with_backend("gds"); +/// ``` +#[derive(Debug, Clone, Default)] +pub struct NixlBackendConfig { + /// Set of enabled backends (just backend names, no custom params yet) + backends: HashSet, +} + +impl NixlBackendConfig { + /// Create a new empty configuration. + pub fn new() -> Self { + Self::default() + } + + /// Create configuration from environment variables. + /// + /// Extracts backends from `DYN_KVBM_NIXL_BACKEND_=` variables. 
+ /// + /// # Errors + /// Returns an error if: + /// - Custom parameters are detected (not yet supported) + /// - Invalid boolean values are provided (must be truthy or falsey) + pub fn from_env() -> Result { + let mut backends = HashSet::new(); + + // Extract all environment variables that match our pattern + for (key, value) in std::env::vars() { + if let Some(remainder) = key.strip_prefix("DYN_KVBM_NIXL_BACKEND_") { + // Check if there's an underscore (indicating custom params) + if remainder.contains('_') { + bail!( + "Custom NIXL backend parameters are not yet supported. \ + Found: {}. Please use only DYN_KVBM_NIXL_BACKEND_=true \ + to enable backends with default parameters.", + key + ); + } + + // Simple backend enablement (e.g., DYN_KVBM_NIXL_BACKEND_UCX=true) + let backend_name = remainder.to_uppercase(); + match parse_bool(&value) { + Ok(true) => { + backends.insert(backend_name); + } + Ok(false) => { + // Explicitly disabled, don't add to backends + continue; + } + Err(e) => bail!("Invalid value for {}: {}", key, e), + } + } + } + + // Default to UCX if no backends specified + if backends.is_empty() { + backends.insert("UCX".to_string()); + } + + Ok(Self { backends }) + } + + /// Add a backend to the configuration. + /// + /// Backend names will be converted to uppercase for consistency. + pub fn with_backend(mut self, backend: impl Into) -> Self { + self.backends.insert(backend.into().to_uppercase()); + self + } + + /// Get the set of enabled backends. + pub fn backends(&self) -> &HashSet { + &self.backends + } + + /// Check if a specific backend is enabled. + pub fn has_backend(&self, backend: &str) -> bool { + self.backends.contains(&backend.to_uppercase()) + } + + /// Merge another configuration into this one. + /// + /// Backends from the other configuration will be added to this one. + pub fn merge(mut self, other: NixlBackendConfig) -> Self { + self.backends.extend(other.backends); + self + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_new_config_is_empty() { + let config = NixlBackendConfig::new(); + assert!(config.backends().is_empty()); + } + + #[test] + fn test_with_backend() { + let config = NixlBackendConfig::new() + .with_backend("ucx") + .with_backend("gds_mt"); + + assert!(config.has_backend("ucx")); + assert!(config.has_backend("UCX")); + assert!(config.has_backend("gds_mt")); + assert!(config.has_backend("GDS_MT")); + assert!(!config.has_backend("other")); + } + + #[test] + fn test_merge_configs() { + let config1 = NixlBackendConfig::new().with_backend("ucx"); + let config2 = NixlBackendConfig::new().with_backend("gds"); + + let merged = config1.merge(config2); + + assert!(merged.has_backend("ucx")); + assert!(merged.has_backend("gds")); + } + + #[test] + fn test_backend_name_case_insensitive() { + let config = NixlBackendConfig::new() + .with_backend("ucx") + .with_backend("Gds_mt") + .with_backend("OTHER"); + + assert!(config.has_backend("UCX")); + assert!(config.has_backend("ucx")); + assert!(config.has_backend("GDS_MT")); + assert!(config.has_backend("gds_mt")); + assert!(config.has_backend("OTHER")); + assert!(config.has_backend("other")); + } + + // Note: Testing from_env() would require setting environment variables, + // which is challenging in unit tests. This is better tested with integration tests. 
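The extraction rules in `from_env` are: strip the `DYN_KVBM_NIXL_BACKEND_` prefix, reject keys whose remainder still contains an underscore (per-backend parameters are unsupported, which also means names such as `GDS_MT` cannot currently be enabled through this path), uppercase the backend name, parse the value as a boolean, and default to UCX when nothing is enabled. A standalone sketch of those rules over an explicit list of key/value pairs, with a local stand-in for `dynamo_runtime::config::parse_bool` (not the patch's implementation):

```rust
use std::collections::HashSet;

/// Local stand-in for `dynamo_runtime::config::parse_bool`; accepts the
/// documented truthy/falsey spellings, case-insensitively.
fn parse_bool(value: &str) -> Result<bool, String> {
    match value.to_ascii_lowercase().as_str() {
        "true" | "1" | "on" | "yes" => Ok(true),
        "false" | "0" | "off" | "no" => Ok(false),
        other => Err(format!("not a boolean: {other}")),
    }
}

/// Extract enabled backends from (key, value) pairs, mirroring the rules
/// in `NixlBackendConfig::from_env`.
fn backends_from_vars<I>(vars: I) -> Result<HashSet<String>, String>
where
    I: IntoIterator<Item = (String, String)>,
{
    let mut backends = HashSet::new();
    for (key, value) in vars {
        if let Some(remainder) = key.strip_prefix("DYN_KVBM_NIXL_BACKEND_") {
            // An extra underscore would indicate per-backend parameters,
            // which are rejected for now.
            if remainder.contains('_') {
                return Err(format!("custom backend parameters not supported: {key}"));
            }
            if parse_bool(&value)? {
                backends.insert(remainder.to_uppercase());
            }
        }
    }
    // Default to UCX when nothing is enabled explicitly.
    if backends.is_empty() {
        backends.insert("UCX".to_string());
    }
    Ok(backends)
}

fn main() {
    let vars = vec![
        ("DYN_KVBM_NIXL_BACKEND_ucx".to_string(), "on".to_string()),
        ("DYN_KVBM_NIXL_BACKEND_GDS".to_string(), "false".to_string()),
        ("UNRELATED".to_string(), "1".to_string()),
    ];
    let backends = backends_from_vars(vars).unwrap();
    assert!(backends.contains("UCX"));
    assert!(!backends.contains("GDS"));
}
```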
+} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/nixl_agent/mod.rs b/lib/llm/src/block_manager/v2/physical/transfer/nixl_agent/mod.rs new file mode 100644 index 0000000000..99280ba8e8 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/nixl_agent/mod.rs @@ -0,0 +1,258 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! NIXL agent wrapper and configuration. +//! +//! This module provides: +//! - `NixlAgent`: Wrapper around nixl_sys::Agent that tracks initialized backends +//! - `NixlBackendConfig`: Configuration for NIXL backends from environment variables + +mod config; + +pub use config::NixlBackendConfig; + +use anyhow::Result; +use nixl_sys::Agent as RawNixlAgent; +use std::collections::HashSet; + +/// A NIXL agent wrapper that tracks which backends were successfully initialized. +/// +/// This wrapper provides: +/// - Runtime validation of backend availability +/// - Clear error messages when operations need unavailable backends +/// - Single source of truth for backend state in tests and production +/// +/// # Backend Tracking +/// +/// Since `nixl_sys::Agent` doesn't provide a method to query active backends, +/// we track them during initialization. The `available_backends` set is populated +/// based on successful `create_backend()` calls. +#[derive(Clone, Debug)] +pub struct NixlAgent { + agent: RawNixlAgent, + available_backends: HashSet, +} + +impl NixlAgent { + /// Create a new NIXL agent with the specified backends. + /// + /// Attempts to initialize all requested backends. If a backend fails, it logs + /// a warning but continues with remaining backends. At least one backend must + /// succeed or this returns an error. + /// + /// # Arguments + /// * `name` - Agent name + /// * `backends` - List of backend names to try (e.g., `&["UCX", "GDS_MT, "POSIX"]`) + /// + /// # Returns + /// A `NixlAgent` that tracks which backends were successfully initialized. + /// + /// # Errors + /// Returns an error if: + /// - Agent creation fails + /// - All backend initialization attempts fail + pub fn new_with_backends(name: &str, backends: &[&str]) -> Result { + let agent = RawNixlAgent::new(name)?; + let mut available_backends = HashSet::new(); + + for backend in backends { + let backend_upper = backend.to_uppercase(); + match agent.get_plugin_params(&backend_upper) { + Ok((_, params)) => match agent.create_backend(&backend_upper, ¶ms) { + Ok(_) => { + available_backends.insert(backend_upper); + } + Err(e) => { + eprintln!( + "✗ Failed to create {} backend: {}. Operations requiring this backend will fail.", + backend_upper, e + ); + } + }, + Err(_) => { + eprintln!( + "✗ No {} plugin found. Operations requiring this backend will fail.", + backend_upper + ); + } + } + } + + if available_backends.is_empty() { + anyhow::bail!("Failed to initialize any NIXL backends from {:?}", backends); + } + + Ok(Self { + agent, + available_backends, + }) + } + + /// Create a NIXL agent requiring ALL specified backends to be available. + /// + /// Unlike `new_with_backends()` which continues if some backends fail, this method + /// will return an error if ANY backend fails to initialize. Use this in production + /// when specific backends are mandatory. + /// + /// # Arguments + /// * `name` - Agent name + /// * `backends` - List of backend names that MUST be available + /// + /// # Returns + /// A `NixlAgent` with all requested backends initialized. 
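A short usage sketch of the tolerant constructor plus the per-operation guard described above (imports elided; marked `ignore` because it needs NIXL plugins at runtime, matching the crate's other `ignore` doc examples):

```rust,ignore
// Imports elided; `NixlAgent` is re-exported from the transfer module.
fn open_storage_agent() -> anyhow::Result<()> {
    // Tolerant construction: failed backends are logged and skipped,
    // but at least one must initialize successfully.
    let agent = NixlAgent::new_with_backends("worker-0", &["UCX", "GDS_MT", "POSIX"])?;

    if agent.has_backend("GDS_MT") {
        // GDS-specific fast path.
    }

    // Make a specific backend mandatory for a given operation.
    agent.require_backend("UCX")?;
    Ok(())
}
```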
+ /// + /// # Errors + /// Returns an error if: + /// - Agent creation fails + /// - Any backend fails to initialize + /// + /// # Example + /// ```ignore + /// // In production: require both UCX and GDS, fail if either is missing + /// let agent = NixlAgent::require_backends("worker-0", &["UCX", "GDS_MT])?; + /// ``` + pub fn require_backends(name: &str, backends: &[&str]) -> Result { + let agent = RawNixlAgent::new(name)?; + let mut available_backends = HashSet::new(); + let mut failed_backends = Vec::new(); + + for backend in backends { + let backend_upper = backend.to_uppercase(); + match agent.get_plugin_params(&backend_upper) { + Ok((_, params)) => match agent.create_backend(&backend_upper, ¶ms) { + Ok(_) => { + available_backends.insert(backend_upper); + } + Err(e) => { + eprintln!("✗ Failed to create {} backend: {}", backend_upper, e); + failed_backends + .push((backend_upper.clone(), format!("create failed: {}", e))); + } + }, + Err(e) => { + eprintln!("✗ No {} plugin found", backend_upper); + failed_backends + .push((backend_upper.clone(), format!("plugin not found: {}", e))); + } + } + } + + if !failed_backends.is_empty() { + let error_details: Vec = failed_backends + .iter() + .map(|(name, reason)| format!("{}: {}", name, reason)) + .collect(); + anyhow::bail!( + "Failed to initialize required backends: [{}]", + error_details.join(", ") + ); + } + + Ok(Self { + agent, + available_backends, + }) + } + + /// Create a NIXL agent with default backends for testing/development. + /// + /// Attempts to initialize UCX, GDS, and POSIX backends. If some are unavailable, + /// continues with whatever succeeds. This ensures code works in various environments. + pub fn new_default(name: &str) -> Result { + Self::new_with_backends(name, &["UCX", "GDS_MT", "POSIX"]) + } + + /// Get a reference to the underlying raw NIXL agent. + pub fn raw_agent(&self) -> &RawNixlAgent { + &self.agent + } + + /// Consume and return the underlying raw NIXL agent. + /// + /// **Warning**: Once consumed, backend tracking is lost. Use this only when + /// interfacing with code that requires `nixl_sys::Agent` directly. + pub fn into_raw_agent(self) -> RawNixlAgent { + self.agent + } + + /// Check if a specific backend is available. + pub fn has_backend(&self, backend: &str) -> bool { + self.available_backends.contains(&backend.to_uppercase()) + } + + /// Get all available backends. + pub fn backends(&self) -> &HashSet { + &self.available_backends + } + + /// Require a specific backend, returning an error if unavailable. + /// + /// Use this at the start of operations that need specific backends. + /// + /// # Example + /// ```ignore + /// agent.require_backend("GDS_MT)?; + /// // Proceed with GDS-specific operations + /// ``` + pub fn require_backend(&self, backend: &str) -> Result<()> { + let backend_upper = backend.to_uppercase(); + if self.has_backend(&backend_upper) { + Ok(()) + } else { + anyhow::bail!( + "Operation requires {} backend, but it was not initialized. 
Available backends: {:?}", + backend_upper, + self.available_backends + ) + } + } +} + +// Delegate common methods to the underlying agent +impl std::ops::Deref for NixlAgent { + type Target = RawNixlAgent; + + fn deref(&self) -> &Self::Target { + &self.agent + } +} + +#[cfg(all(test, feature = "testing-nixl"))] +mod tests { + use super::*; + + #[test] + fn test_agent_backend_tracking() { + // Try to create agent with UCX + let agent = NixlAgent::new_with_backends("test", &["UCX"]); + + // Should succeed if UCX is available + if let Ok(agent) = agent { + assert!(agent.has_backend("UCX")); + assert!(agent.has_backend("ucx")); // Case insensitive + } + } + + #[test] + fn test_require_backend() { + let agent = NixlAgent::new_with_backends("test", &["UCX"]).expect("Need UCX for test"); + + // Should succeed for available backend + assert!(agent.require_backend("UCX").is_ok()); + + // Should fail for unavailable backend + assert!(agent.require_backend("GDS_MT").is_err()); + } + + #[test] + fn test_require_backends_strict() { + // Should succeed if UCX is available + let agent = NixlAgent::require_backends("test_strict", &["UCX"]) + .expect("Failed to require backends"); + assert!(agent.has_backend("UCX")); + + // Should fail if any backend is missing (GDS likely not available) + let result = NixlAgent::require_backends("test_strict_fail", &["UCX", "DUDE"]); + assert!(result.is_err()); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/notifications/cuda_event.rs b/lib/llm/src/block_manager/v2/physical/transfer/notifications/cuda_event.rs new file mode 100644 index 0000000000..dd4f30e38c --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/notifications/cuda_event.rs @@ -0,0 +1,88 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! CUDA event polling-based completion checker. + +use anyhow::Result; +use cudarc::driver::{CudaEvent, DriverError, result as cuda_result, sys::CUresult}; + +use super::CompletionChecker; + +/// Completion checker that polls CUDA event status. 
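The `Deref` implementation above lets callers invoke the raw agent's methods directly on the wrapper while the wrapper keeps its own bookkeeping. A standalone sketch of that delegation pattern with toy types (not the real agent):

```rust
use std::ops::Deref;

/// Stand-in for the underlying `nixl_sys::Agent`.
struct RawAgent {
    name: String,
}

impl RawAgent {
    fn name(&self) -> &str {
        &self.name
    }
}

/// Wrapper that adds backend tracking but still exposes the inner API,
/// mirroring how `NixlAgent` derefs to the raw agent.
struct Agent {
    inner: RawAgent,
    initialized_backends: Vec<String>,
}

impl Deref for Agent {
    type Target = RawAgent;

    fn deref(&self) -> &Self::Target {
        &self.inner
    }
}

fn main() {
    let agent = Agent {
        inner: RawAgent { name: "worker-0".into() },
        initialized_backends: vec!["UCX".into()],
    };
    // `name()` is resolved on the inner RawAgent through Deref.
    assert_eq!(agent.name(), "worker-0");
    assert_eq!(agent.initialized_backends.len(), 1);
}
```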
+pub struct CudaEventChecker { + event: CudaEvent, +} + +impl CudaEventChecker { + pub fn new(event: CudaEvent) -> Self { + Self { event } + } +} + +impl CompletionChecker for CudaEventChecker { + fn is_complete(&self) -> Result { + // Query the CUDA event to check if it's complete + // cudaEventQuery returns cudaSuccess if complete, cudaErrorNotReady if still pending + unsafe { + match cuda_result::event::query(self.event.cu_event()) { + Ok(()) => Ok(true), // Event is complete + Err(DriverError(CUresult::CUDA_ERROR_NOT_READY)) => Ok(false), + Err(e) => Err(anyhow::anyhow!("CUDA event query failed: {:?}", e)), + } + } + } +} + +#[cfg(all(test, feature = "testing-cuda"))] +mod tests { + use crate::block_manager::v2::physical::manager::TransportManager; + use crate::block_manager::v2::physical::transfer::nixl_agent::NixlAgent; + use crate::block_manager::v2::physical::transfer::tests::cuda::CudaSleep; + use std::time::{Duration, Instant}; + + #[tokio::test] + async fn test_cuda_event_delayed_notification() { + let agent = NixlAgent::require_backends("test_agent", &[]).unwrap(); + let manager = TransportManager::builder() + .worker_id(0) + .cuda_device_id(0) + .nixl_agent(agent) + .build() + .unwrap(); + + let stream = manager.h2d_stream(); + let cuda_ctx = manager.cuda_context(); + + // Get or create the CudaSleep utility (compiles kernel and calibrates on first use) + let cuda_sleep = CudaSleep::for_context(cuda_ctx).unwrap(); + + // Test 1: Launch sleep and wait via async notification + let t0_queue_start = Instant::now(); + cuda_sleep + .launch(Duration::from_millis(600), stream) + .unwrap(); + let queue_time = t0_queue_start.elapsed(); + + let event = stream.record_event(None).unwrap(); + let notification = manager.register_cuda_event(event); + notification.await.unwrap(); + let wait_time = t0_queue_start.elapsed() - queue_time; + + println!( + "GPU sleep test: queue {:?}, wait {:?}", + queue_time, wait_time + ); + + assert!( + queue_time < Duration::from_millis(10), + "launching the sleep kernel should be fast: {:?}", + queue_time + ); + + assert!( + wait_time >= Duration::from_millis(500), + "wait time should reflect >=500ms of GPU work: {:?}", + wait_time + ); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/notifications/mod.rs b/lib/llm/src/block_manager/v2/physical/transfer/notifications/mod.rs new file mode 100644 index 0000000000..d23ee4309e --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/notifications/mod.rs @@ -0,0 +1,176 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Transfer completion notification system. +//! +//! This module provides abstractions for waiting on transfer completions using different +//! mechanisms: polling-based (NIXL status, CUDA events) and event-based (NIXL notifications). + +use std::collections::HashMap; +use std::time::{Duration, Instant}; + +use anyhow::Result; +use tokio::sync::{mpsc, oneshot}; +use tokio::time::interval; +use tracing::warn; +use uuid::Uuid; + +pub mod cuda_event; +pub mod nixl_events; +pub mod nixl_status; +pub mod notification; + +pub use cuda_event::CudaEventChecker; +pub use nixl_events::{RegisterNixlNotification, process_nixl_notification_events}; +pub use nixl_status::NixlStatusChecker; +pub use notification::TransferCompleteNotification; + +/// Trait for checking if a transfer operation has completed. +/// Supports polling-based completion checks (NIXL status, CUDA events). 
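The polling half of this module follows one shape: register a checker plus a oneshot sender, poll the checker on a short interval, and resolve the sender when it reports completion. A compact self-contained model of that shape (not the patch's `process_polling_notifications`; assumes tokio with the rt, time, sync, and macros features):

```rust
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;
use tokio::sync::oneshot;

/// Simplified version of the `CompletionChecker` idea: anything that can be
/// polled for completion.
trait Checker: Send + 'static {
    fn is_complete(&self) -> bool;
}

struct FlagChecker(Arc<AtomicBool>);

impl Checker for FlagChecker {
    fn is_complete(&self) -> bool {
        self.0.load(Ordering::Relaxed)
    }
}

/// Poll the checker on an interval and resolve the oneshot once it finishes,
/// mirroring the overall shape of the polling handler.
async fn poll_until_done<C: Checker>(checker: C, done: oneshot::Sender<()>) {
    let mut interval = tokio::time::interval(Duration::from_millis(1));
    loop {
        interval.tick().await;
        if checker.is_complete() {
            let _ = done.send(());
            return;
        }
    }
}

#[tokio::main]
async fn main() {
    let flag = Arc::new(AtomicBool::new(false));
    let (tx, rx) = oneshot::channel();

    tokio::spawn(poll_until_done(FlagChecker(flag.clone()), tx));

    // Simulate the "transfer" finishing a little later.
    tokio::time::sleep(Duration::from_millis(10)).await;
    flag.store(true, Ordering::Relaxed);

    rx.await.expect("poller resolved the notification");
}
```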
+pub trait CompletionChecker: Send { + /// Returns true if the transfer is complete, false if still pending. + fn is_complete(&self) -> Result; +} + +/// Registration message for polling-based transfer completion. +pub struct RegisterPollingNotification { + pub uuid: Uuid, + pub checker: C, + pub done: oneshot::Sender>, +} + +/// Tracking struct for outstanding polling-based transfers. +struct OutstandingPollingTransfer { + checker: C, + done: oneshot::Sender>, + arrived_at: Instant, + last_warned_at: Option, +} + +/// Helper function to check if a transfer should be warned about and log the warning. +/// Returns the new last_warned_at time if a warning was issued. +fn check_and_warn_slow_transfer( + uuid: &Uuid, + arrived_at: Instant, + last_warned_at: Option, +) -> Option { + let elapsed = arrived_at.elapsed(); + if elapsed > Duration::from_secs(60) { + let should_warn = last_warned_at + .map(|last| last.elapsed() > Duration::from_secs(30)) + .unwrap_or(true); + + if should_warn { + warn!( + uuid = %uuid, + elapsed_secs = elapsed.as_secs(), + "Transfer has been pending for over 1 minute" + ); + return Some(Instant::now()); + } + } + last_warned_at +} + +/// Generic polling-based transfer completion handler. +/// Works with any CompletionChecker implementation (NIXL status, CUDA events, etc.) +pub async fn process_polling_notifications( + mut rx: mpsc::Receiver>, +) { + let mut outstanding: HashMap> = HashMap::new(); + let mut check_interval = interval(Duration::from_millis(1)); + + loop { + tokio::select! { + // Handle new transfer requests + notification = rx.recv() => { + match notification { + Some(notif) => { + outstanding.insert(notif.uuid, OutstandingPollingTransfer { + checker: notif.checker, + done: notif.done, + arrived_at: Instant::now(), + last_warned_at: None, + }); + } + None => { + // Channel closed, finish processing outstanding transfers then exit + break; + } + } + } + + // Periodically check status of outstanding transfers + _ = check_interval.tick(), if !outstanding.is_empty() => { + let mut completed = Vec::new(); + + for (uuid, transfer) in outstanding.iter_mut() { + // Check transfer status + match transfer.checker.is_complete() { + Ok(true) => { + // Transfer complete - mark for removal + completed.push((*uuid, Ok(()))); + } + Ok(false) => { + // Transfer still in progress - check if we should warn + transfer.last_warned_at = check_and_warn_slow_transfer( + uuid, + transfer.arrived_at, + transfer.last_warned_at, + ); + } + Err(e) => { + warn!( + uuid = %uuid, + error = %e, + "Transfer status check failed" + ); + completed.push((*uuid, Err(e))); + } + } + } + + // Remove completed transfers and signal completion + for (uuid, result) in completed { + if let Some(transfer) = outstanding.remove(&uuid) { + // Signal completion (ignore if receiver dropped) + let _ = transfer.done.send(result); + } + } + } + } + } + + // Channel closed, but we may still have outstanding transfers + // Continue processing them until all are complete + while !outstanding.is_empty() { + check_interval.tick().await; + + let mut completed = Vec::new(); + + for (uuid, transfer) in outstanding.iter() { + match transfer.checker.is_complete() { + Ok(true) => { + completed.push((*uuid, Ok(()))); + } + Ok(false) => { + // Still pending + } + Err(e) => { + warn!( + uuid = %uuid, + error = %e, + "Transfer status check failed during shutdown" + ); + completed.push((*uuid, Err(e))); + } + } + } + + for (uuid, result) in completed { + if let Some(transfer) = outstanding.remove(&uuid) { + let _ = 
transfer.done.send(result); + } + } + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/notifications/nixl_events.rs b/lib/llm/src/block_manager/v2/physical/transfer/notifications/nixl_events.rs new file mode 100644 index 0000000000..65a02936d6 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/notifications/nixl_events.rs @@ -0,0 +1,188 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! NIXL notification-based completion handler. + +use std::collections::HashMap; +use std::time::{Duration, Instant}; + +use anyhow::Result; +use nixl_sys::{Agent as NixlAgent, NotificationMap, XferRequest}; +use tokio::sync::{mpsc, oneshot}; +use tokio::time::interval; +use tracing::warn; +use uuid::Uuid; + +/// Registration message for NIXL notification-based transfer completion. +pub struct RegisterNixlNotification { + pub uuid: Uuid, + pub xfer_req: XferRequest, + pub done: oneshot::Sender>, +} + +/// Tracking struct for outstanding NIXL notification transfers. +struct OutstandingTransfer { + #[allow(dead_code)] // Kept for potential future cleanup or debugging + xfer_req: XferRequest, + done: oneshot::Sender>, + arrived_at: Instant, + last_warned_at: Option, +} + +/// Helper function to check if a transfer should be warned about and log the warning. +/// Returns the new last_warned_at time if a warning was issued. +fn check_and_warn_slow_transfer( + uuid: &Uuid, + arrived_at: Instant, + last_warned_at: Option, +) -> Option { + let elapsed = arrived_at.elapsed(); + if elapsed > Duration::from_secs(60) { + let should_warn = last_warned_at + .map(|last| last.elapsed() > Duration::from_secs(30)) + .unwrap_or(true); + + if should_warn { + warn!( + uuid = %uuid, + elapsed_secs = elapsed.as_secs(), + "Transfer has been pending for over 1 minute" + ); + return Some(Instant::now()); + } + } + last_warned_at +} + +/// NIXL notification-based transfer completion handler. +/// Fetches notifications in batches and matches them against outstanding transfers. +pub async fn process_nixl_notification_events( + agent: NixlAgent, + mut rx: mpsc::Receiver, +) { + let mut outstanding: HashMap = HashMap::new(); + let mut check_interval = interval(Duration::from_millis(1)); + + loop { + tokio::select! 
{ + // Handle new transfer requests + notification = rx.recv() => { + match notification { + Some(notif) => { + outstanding.insert(notif.uuid, OutstandingTransfer { + xfer_req: notif.xfer_req, + done: notif.done, + arrived_at: Instant::now(), + last_warned_at: None, + }); + } + None => { + // Channel closed, finish processing outstanding transfers then exit + break; + } + } + } + + // Periodically fetch and process notifications + _ = check_interval.tick(), if !outstanding.is_empty() => { + // Create notification map inside this branch to avoid Send issues + let mut notif_map = match NotificationMap::new() { + Ok(map) => map, + Err(e) => { + warn!(error = %e, "Failed to create notification map"); + continue; + } + }; + + // Fetch all pending notifications + if let Err(e) = agent.get_notifications(&mut notif_map, None) { + warn!(error = %e, "Failed to fetch NIXL notifications"); + continue; + } + + // Process notifications and match against outstanding transfers + let notifications = match notif_map.take_notifs() { + Ok(notifs) => notifs, + Err(e) => { + warn!(error = %e, "Failed to extract notifications from map"); + continue; + } + }; + + let mut completed = Vec::new(); + + // Iterate through all notifications + for (_agent_name, notif_strings) in notifications { + for notif_str in notif_strings { + // Try to parse notification as UUID + // NOTE: This assumes notifications contain UUIDs. + // The actual format may be different and may need adjustment. + if let Ok(notif_uuid) = Uuid::parse_str(¬if_str) { + if outstanding.contains_key(¬if_uuid) { + completed.push(notif_uuid); + } else { + // Notification arrived before we started waiting for it + // This is the race condition we need to handle + warn!( + uuid = %notif_uuid, + "Received notification for transfer not in outstanding map (early arrival)" + ); + } + } + } + } + + // Check for slow transfers and update warnings + for (uuid, transfer) in outstanding.iter_mut() { + if !completed.contains(uuid) { + transfer.last_warned_at = check_and_warn_slow_transfer( + uuid, + transfer.arrived_at, + transfer.last_warned_at, + ); + } + } + + // Remove completed transfers and signal completion + for uuid in completed { + if let Some(transfer) = outstanding.remove(&uuid) { + let _ = transfer.done.send(Ok(())); + } + } + } + } + } + + // Channel closed, but we may still have outstanding transfers + // Continue processing them until all are complete + while !outstanding.is_empty() { + check_interval.tick().await; + + let mut notif_map = match NotificationMap::new() { + Ok(map) => map, + Err(_) => continue, + }; + + if let Ok(()) = agent.get_notifications(&mut notif_map, None) + && let Ok(notifications) = notif_map.take_notifs() + { + let mut completed = Vec::new(); + + for (_agent_name, notif_strings) in notifications { + for notif_str in notif_strings { + if let Ok(notif_uuid) = Uuid::parse_str(¬if_str) + && outstanding.contains_key(¬if_uuid) + { + completed.push(notif_uuid); + } + } + } + + for uuid in completed { + if let Some(transfer) = outstanding.remove(&uuid) { + let _ = transfer.done.send(Ok(())); + } + } + } + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/notifications/nixl_status.rs b/lib/llm/src/block_manager/v2/physical/transfer/notifications/nixl_status.rs new file mode 100644 index 0000000000..b1b6027a1a --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/notifications/nixl_status.rs @@ -0,0 +1,33 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! NIXL status polling-based completion checker. + +use anyhow::{Result, anyhow}; +use nixl_sys::{Agent as NixlAgent, XferRequest}; + +use super::CompletionChecker; + +/// Completion checker that polls NIXL transfer status. +pub struct NixlStatusChecker { + agent: NixlAgent, + xfer_req: XferRequest, +} + +impl NixlStatusChecker { + pub fn new(agent: NixlAgent, xfer_req: XferRequest) -> Self { + Self { agent, xfer_req } + } +} + +impl CompletionChecker for NixlStatusChecker { + fn is_complete(&self) -> Result { + // get_xfer_status returns XferStatus enum: + // - XferStatus::Success means transfer is complete + // - XferStatus::InProgress means still pending + match self.agent.get_xfer_status(&self.xfer_req) { + Ok(status) => Ok(status.is_success()), + Err(e) => Err(anyhow!("NIXL transfer status check failed: {}", e)), + } + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/notifications/notification.rs b/lib/llm/src/block_manager/v2/physical/transfer/notifications/notification.rs new file mode 100644 index 0000000000..d95a1d0316 --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/notifications/notification.rs @@ -0,0 +1,58 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Transfer completion notification handle. + +use anyhow::Result; +use tokio::sync::oneshot; + +/// Notification handle for an in-progress transfer. +/// +/// This object can be awaited to block until the transfer completes. +/// The transfer is tracked by a background handler that polls for completion +/// or processes notification events. +pub struct TransferCompleteNotification { + pub(crate) status: oneshot::Receiver>, +} + +impl TransferCompleteNotification { + /// Create a notification that is already completed (for synchronous transfers). + /// + /// This is useful for transfers that complete immediately without needing + /// background polling, such as memcpy operations. + pub fn completed() -> Self { + let (tx, rx) = oneshot::channel(); + // Signal completion immediately + let _ = tx.send(Ok(())); + Self { status: rx } + } + + /// Wait for the transfer to complete (blocking). + /// + /// This method blocks the current thread until the transfer completes. + /// Use `.await` for async contexts. + /// + /// Returns `Ok(())` when the transfer successfully completes, or an error + /// if the background handler was dropped before completion or if the transfer failed. + pub fn wait(self) -> Result<()> { + self.status + .blocking_recv() + .map_err(|_| anyhow::anyhow!("Transfer handler dropped before completion"))? + } +} + +impl std::future::Future for TransferCompleteNotification { + type Output = Result<()>; + + fn poll( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll { + use std::pin::Pin; + Pin::new(&mut self.status).poll(cx).map(|result| { + result + .map_err(|_| anyhow::anyhow!("Transfer handler dropped before completion")) + .and_then(|r| r) + }) + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/options.rs b/lib/llm/src/block_manager/v2/physical/transfer/options.rs new file mode 100644 index 0000000000..3eee954b4d --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/options.rs @@ -0,0 +1,117 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +//! Transfer options for configuring block and layer transfers. + +use super::BounceBufferSpec; +use derive_builder::Builder; +use std::{ops::Range, sync::Arc}; + +/// Options for configuring transfer operations. +/// +/// This structure provides configuration for block and layer transfers, +/// including layer ranges, NIXL write notifications, and bounce buffers. +/// +/// # Examples +/// +/// ```rust,ignore +/// let options = TransferOptions::builder() +/// .nixl_write_notification(42) +/// .layer_range(0..10) +/// .build(); +/// ``` +#[derive(Clone, Default, Builder)] +#[builder(pattern = "owned", default)] +pub struct TransferOptions { + /// Range of layers to transfer (None = all layers). + /// + /// When specified, only the layers in this range will be transferred. + /// This is useful for partial block transfers or layer-specific operations. + #[builder(default, setter(strip_option))] + pub layer_range: Option>, + + /// NIXL write notification value delivered after RDMA write completes. + /// + /// When specified, NIXL will deliver this notification value to the remote + /// node after the RDMA write operation completes. This enables efficient + /// notification of transfer completion without requiring polling. + #[builder(default, setter(strip_option))] + pub nixl_write_notification: Option, + + /// Bounce buffer specification for multi-hop transfers. + /// + /// When direct transfers are not allowed or efficient, this specifies + /// an intermediate staging area. The transfer will be split into two hops: + /// source → bounce buffer → destination. + #[builder(default, setter(strip_option, into))] + pub bounce_buffer: Option>, +} + +impl TransferOptions { + /// Create a new builder for transfer options. + pub fn builder() -> TransferOptionsBuilder { + TransferOptionsBuilder::default() + } + + /// Create transfer options from an optional layer range. + pub fn from_layer_range(layer_range: Option>) -> Self { + Self { + layer_range, + ..Self::default() + } + } + + /// Create default transfer options. + /// + /// This transfers all layers with no special configuration. 
+ pub fn new() -> Self { + Self::default() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_default_options() { + let options = TransferOptions::default(); + assert!(options.layer_range.is_none()); + assert!(options.nixl_write_notification.is_none()); + assert!(options.bounce_buffer.is_none()); + } + + #[test] + fn test_builder_with_notification() { + let options = TransferOptions::builder() + .nixl_write_notification(42) + .build() + .unwrap(); + + assert_eq!(options.nixl_write_notification, Some(42)); + assert!(options.layer_range.is_none()); + } + + #[test] + fn test_builder_with_layer_range() { + let options = TransferOptions::builder() + .layer_range(0..10) + .build() + .unwrap(); + + assert_eq!(options.layer_range, Some(0..10)); + assert!(options.nixl_write_notification.is_none()); + } + + #[test] + fn test_builder_with_all_options() { + let options = TransferOptions::builder() + .nixl_write_notification(100) + .layer_range(5..15) + .build() + .unwrap(); + + assert_eq!(options.nixl_write_notification, Some(100)); + assert_eq!(options.layer_range, Some(5..15)); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/preferences.rs b/lib/llm/src/block_manager/v2/physical/transfer/preferences.rs new file mode 100644 index 0000000000..1f14db205e --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/preferences.rs @@ -0,0 +1,120 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Transfer preferences for resolving redundant strategy choices. +//! +//! Some source/destination combinations can use multiple transfer strategies. +//! For example: +//! - System ↔ Pinned: memcpy or NIXL +//! - Pinned ↔ Device: CUDA or NIXL +//! +//! This module provides preferences to control which strategy to prefer. + +use serde::{Deserialize, Serialize}; + +/// Policy for choosing between native transports (memcpy/CUDA) and NIXL. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] +pub enum NativeVsNixlPolicy { + /// Always prefer native transports (memcpy/CUDA) when available + PreferNative, + + /// Always prefer NIXL when available + PreferNixl, + + /// Use native for local-to-local, NIXL for remote/disk + #[default] + Automatic, +} + +/// Transfer preferences for strategy selection. +/// +/// These preferences allow fine-grained control over transfer strategy selection +/// when multiple valid strategies exist for a source/destination pair. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TransferPreferences { + /// Policy for native vs NIXL transport selection + pub native_vs_nixl: NativeVsNixlPolicy, + + /// Whether to prefer async CUDA operations over blocking ones + pub prefer_async_cuda: bool, +} + +impl Default for TransferPreferences { + fn default() -> Self { + Self { + native_vs_nixl: NativeVsNixlPolicy::default(), + prefer_async_cuda: true, + } + } +} + +impl TransferPreferences { + /// Create preferences with all defaults. + pub fn new() -> Self { + Self::default() + } + + /// Create preferences that always prefer native transports. + pub fn prefer_native() -> Self { + Self { + native_vs_nixl: NativeVsNixlPolicy::PreferNative, + prefer_async_cuda: true, + } + } + + /// Create preferences that always prefer NIXL. + pub fn prefer_nixl() -> Self { + Self { + native_vs_nixl: NativeVsNixlPolicy::PreferNixl, + prefer_async_cuda: true, + } + } + + /// Set the native vs NIXL policy. 
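Where both a native path and NIXL are viable (e.g. Pinned ↔ Device), a strategy selector can resolve the policy roughly as sketched below; the helper function and its `is_local_copy` flag are illustrative, not part of this module:

```rust,ignore
fn prefers_native(prefs: &TransferPreferences, is_local_copy: bool) -> bool {
    match prefs.native_vs_nixl {
        NativeVsNixlPolicy::PreferNative => true,
        NativeVsNixlPolicy::PreferNixl => false,
        // Automatic: native transports for local copies, NIXL for remote/disk.
        NativeVsNixlPolicy::Automatic => is_local_copy,
    }
}
```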
+ pub fn with_native_vs_nixl(mut self, policy: NativeVsNixlPolicy) -> Self { + self.native_vs_nixl = policy; + self + } + + /// Set whether to prefer async CUDA operations. + pub fn with_async_cuda(mut self, prefer_async: bool) -> Self { + self.prefer_async_cuda = prefer_async; + self + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_default_preferences() { + let prefs = TransferPreferences::default(); + assert_eq!(prefs.native_vs_nixl, NativeVsNixlPolicy::Automatic); + assert!(prefs.prefer_async_cuda); + } + + #[test] + fn test_prefer_native() { + let prefs = TransferPreferences::prefer_native(); + assert_eq!(prefs.native_vs_nixl, NativeVsNixlPolicy::PreferNative); + assert!(prefs.prefer_async_cuda); + } + + #[test] + fn test_prefer_nixl() { + let prefs = TransferPreferences::prefer_nixl(); + assert_eq!(prefs.native_vs_nixl, NativeVsNixlPolicy::PreferNixl); + assert!(prefs.prefer_async_cuda); + } + + #[test] + fn test_builder_pattern() { + let prefs = TransferPreferences::new() + .with_native_vs_nixl(NativeVsNixlPolicy::PreferNixl) + .with_async_cuda(false); + + assert_eq!(prefs.native_vs_nixl, NativeVsNixlPolicy::PreferNixl); + assert!(!prefs.prefer_async_cuda); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/strategy.rs b/lib/llm/src/block_manager/v2/physical/transfer/strategy.rs new file mode 100644 index 0000000000..12eeeb67eb --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/strategy.rs @@ -0,0 +1,506 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Transfer strategy selection based on source and destination storage locations. + +use crate::block_manager::v2::memory::StorageKind; + +use super::TransferCapabilities; +use crate::block_manager::v2::physical::{layout::PhysicalLayout, transfer::TransferContext}; + +/// Transfer strategy to use for copying memory between locations. +/// +/// The strategy is determined by the source and destination storage locations. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TransferStrategy { + /// CPU memcpy (for host-to-host transfers) + Memcpy, + + /// CUDA async host-to-device transfer + CudaAsyncH2D, + + /// CUDA async device-to-host transfer + CudaAsyncD2H, + + /// CUDA async device-to-device transfer + CudaAsyncD2D, + + /// CUDA blocking host-to-device transfer + CudaBlockingH2D, + + /// CUDA blocking device-to-host transfer + CudaBlockingD2H, + + /// NIXL read operation (pull from remote) + NixlRead, + + /// NIXL write operation (push to remote) + NixlWrite, + + /// NIXL write (flipped local and remote order) + /// This is needed for some NIXL backends. + /// For example, the POSIX backend requires that host memory + /// always be the "local" descriptor list, regardless of whether + /// it's a read or write. + NixlWriteFlipped, + + /// NIXL read (flipped local and remote order) + NixlReadFlipped, + + /// Invalid/unsupported transfer + Invalid, +} + +/// Plan for executing a transfer, either direct or via bounce buffer. +/// +/// Some transfers require staging through host memory when direct paths +/// are not enabled via capabilities. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum TransferPlan { + /// Direct single-hop transfer using the specified strategy. + Direct(TransferStrategy), + + /// Two-hop transfer requiring a bounce buffer in host memory. 
+ /// + /// This is used when: + /// - Device → Remote (without GPU RDMA) + /// - Disk → Remote + /// - Device ↔ Disk (without GDS) + TwoHop { + /// First hop strategy (src → bounce) + first: TransferStrategy, + + /// Bounce buffer location (always Pinned for best performance) + bounce_location: StorageKind, + + /// Second hop strategy (bounce → dst) + second: TransferStrategy, + }, +} + +pub(crate) fn select_strategy( + src: &PhysicalLayout, + dst: &PhysicalLayout, + ctx: &TransferContext, +) -> anyhow::Result { + let is_src_local = src.nixl_metadata().agent_name() == ctx.nixl_agent().name(); + let is_dst_local = dst.nixl_metadata().agent_name() == ctx.nixl_agent().name(); + + if !is_src_local && !is_dst_local { + return Err(anyhow::anyhow!( + "Both src and dst are remote - this is not supported." + )); + } + + if is_src_local && is_dst_local { + return Ok(select_direct_strategy( + src.location(), + dst.location(), + false, + ctx.capabilities(), + )); + } + + select_remote_strategy_v2( + src.location(), + is_src_local, + dst.location(), + is_dst_local, + ctx.capabilities(), + ) +} + +/// Select the appropriate transfer plan based on source and destination locations. +/// +/// # Arguments +/// * `src` - Source storage location (always local) +/// * `dst` - Destination storage location (can be local or remote) +/// * `dst_is_remote` - Whether destination is on a remote node +/// * `capabilities` - Transfer capability flags +/// +/// # Returns +/// A transfer plan (direct or two-hop) +/// +/// # Conservative Default Policy +/// +/// With default capabilities (all disabled): +/// - Device can only transfer to/from Host +/// - Disk can only transfer to/from Host +/// - Host can transfer to Device, Disk, or Remote +/// - Device ↔ Device is allowed (native CUDA) +/// +/// Transfers that would violate this policy are staged through host: +/// - Device → Remote: Device → Host → Remote (2 hops) +/// - Disk → Remote: Disk → Host → Remote (2 hops) +/// - Device ↔ Disk: Device → Host → Disk (2 hops) +/// +/// # Optional Direct Paths +/// +/// - `allow_gds`: Enables Disk ↔ Device direct transfers +/// - `allow_gpu_rdma`: Enables Device → Remote direct transfers +fn select_direct_strategy( + src: StorageKind, + dst: StorageKind, + dst_is_remote: bool, + capabilities: &TransferCapabilities, +) -> TransferPlan { + use StorageKind::*; + use TransferStrategy::*; + + // Handle remote destination + if dst_is_remote { + return select_remote_strategy(src, capabilities); + } + + // Local-to-local transfers + match (src, dst) { + // Host ↔ Host - direct memcpy + (System, System) | (System, Pinned) | (Pinned, System) | (Pinned, Pinned) => { + TransferPlan::Direct(Memcpy) + } + + // Host → Device - direct CUDA + (System, Device(_)) => TransferPlan::Direct(CudaBlockingH2D), + (Pinned, Device(_)) => TransferPlan::Direct(CudaAsyncH2D), + + // Device → Host - direct CUDA + (Device(_), System) => TransferPlan::Direct(CudaBlockingD2H), + (Device(_), Pinned) => TransferPlan::Direct(CudaAsyncD2H), + + // Device ↔ Device - direct CUDA + (Device(_), Device(_)) => TransferPlan::Direct(CudaAsyncD2D), + + // Host ↔ Disk - direct NIXL + (System, Disk(_)) | (Pinned, Disk(_)) => TransferPlan::Direct(NixlWrite), + (Disk(_), System) | (Disk(_), Pinned) => TransferPlan::Direct(NixlReadFlipped), + + // Disk ↔ Disk - NIXL doesn't seem to support direct transfers here. + // Leaving this as two-hop for now. 
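+ // Concretely, the arm below stages the copy through a pinned host bounce
+ // buffer: Disk → Pinned (NixlReadFlipped), then Pinned → Disk (NixlWrite).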
+ (Disk(_), Disk(_)) => TransferPlan::TwoHop { + first: NixlReadFlipped, + bounce_location: Pinned, + second: NixlWrite, + }, + + // Device ↔ Disk - check GDS capability + (Device(_), Disk(_)) => { + if capabilities.allows_device_disk_direct() { + // Direct GDS transfer + TransferPlan::Direct(NixlWrite) + } else { + // Stage through host: Device → Pinned → Disk + TransferPlan::TwoHop { + first: CudaAsyncD2H, + bounce_location: Pinned, + second: NixlWrite, + } + } + } + (Disk(_), Device(_)) => { + if capabilities.allows_device_disk_direct() { + // Direct GDS transfer + TransferPlan::Direct(NixlRead) + } else { + // Stage through host: Disk → Pinned → Device + TransferPlan::TwoHop { + first: NixlReadFlipped, + bounce_location: Pinned, + second: CudaAsyncH2D, + } + } + } + } +} + +/// Select transfer strategy for remote destination. +fn select_remote_strategy(src: StorageKind, capabilities: &TransferCapabilities) -> TransferPlan { + use StorageKind::*; + use TransferStrategy::*; + + match src { + // Host → Remote - direct NIXL + System | Pinned => TransferPlan::Direct(NixlWrite), + + // Device → Remote - check GPU RDMA capability + Device(_) => { + if capabilities.allows_device_remote_direct() { + // Direct GPU RDMA transfer + TransferPlan::Direct(NixlWrite) + } else { + // Stage through host: Device → Pinned → Remote + TransferPlan::TwoHop { + first: CudaAsyncD2H, + bounce_location: Pinned, + second: NixlWrite, + } + } + } + + // Disk → Remote - always stage through host + Disk(_) => TransferPlan::TwoHop { + first: NixlWrite, + bounce_location: Pinned, + second: NixlWrite, + }, + } +} + +fn select_remote_strategy_v2( + src: StorageKind, + is_src_local: bool, + dst: StorageKind, + is_dst_local: bool, + capabilities: &TransferCapabilities, +) -> anyhow::Result { + // We only support System, Pinned and Device for remote transfers. + // Later we might support staged/bounce buffer transfers. + + if matches!(src, StorageKind::Disk(_)) | matches!(dst, StorageKind::Disk(_)) { + return Err(anyhow::anyhow!( + "Neither local nor remote disk transfers are supported over NIXL at this time." + )); + } + + if !capabilities.allow_gpu_rdma + && (matches!(src, StorageKind::Device(_)) || matches!(dst, StorageKind::Device(_))) + { + return Err(anyhow::anyhow!( + "GPU RDMA is disabled - this transfer requires GPU RDMA." 
+ )); + } + + if is_src_local && !is_dst_local { + return Ok(TransferPlan::Direct(TransferStrategy::NixlWrite)); + } + + if is_dst_local && !is_src_local { + return Ok(TransferPlan::Direct(TransferStrategy::NixlReadFlipped)); + } + + unreachable!("Both src and dst are remote - this is not supported."); +} + +#[cfg(test)] +mod tests { + use super::*; + + fn default_caps() -> TransferCapabilities { + TransferCapabilities::default() + } + + #[test] + fn test_host_to_host_transfers() { + let caps = default_caps(); + assert_eq!( + select_direct_strategy(StorageKind::System, StorageKind::System, false, &caps), + TransferPlan::Direct(TransferStrategy::Memcpy) + ); + assert_eq!( + select_direct_strategy(StorageKind::System, StorageKind::Pinned, false, &caps), + TransferPlan::Direct(TransferStrategy::Memcpy) + ); + assert_eq!( + select_direct_strategy(StorageKind::Pinned, StorageKind::System, false, &caps), + TransferPlan::Direct(TransferStrategy::Memcpy) + ); + assert_eq!( + select_direct_strategy(StorageKind::Pinned, StorageKind::Pinned, false, &caps), + TransferPlan::Direct(TransferStrategy::Memcpy) + ); + } + + #[test] + fn test_host_to_device_transfers() { + let caps = default_caps(); + // System (unpinned) to device should be blocking + assert_eq!( + select_direct_strategy(StorageKind::System, StorageKind::Device(0), false, &caps), + TransferPlan::Direct(TransferStrategy::CudaBlockingH2D) + ); + + // Pinned to device should be async + assert_eq!( + select_direct_strategy(StorageKind::Pinned, StorageKind::Device(0), false, &caps), + TransferPlan::Direct(TransferStrategy::CudaAsyncH2D) + ); + } + + #[test] + fn test_device_to_host_transfers() { + let caps = default_caps(); + // Device to system should be blocking + assert_eq!( + select_direct_strategy(StorageKind::Device(0), StorageKind::System, false, &caps), + TransferPlan::Direct(TransferStrategy::CudaBlockingD2H) + ); + + // Device to pinned should be async + assert_eq!( + select_direct_strategy(StorageKind::Device(0), StorageKind::Pinned, false, &caps), + TransferPlan::Direct(TransferStrategy::CudaAsyncD2H) + ); + } + + #[test] + fn test_device_to_device_transfers() { + let caps = default_caps(); + assert_eq!( + select_direct_strategy(StorageKind::Device(0), StorageKind::Device(1), false, &caps), + TransferPlan::Direct(TransferStrategy::CudaAsyncD2D) + ); + assert_eq!( + select_direct_strategy(StorageKind::Device(3), StorageKind::Device(3), false, &caps), + TransferPlan::Direct(TransferStrategy::CudaAsyncD2D) + ); + } + + #[test] + fn test_disk_to_host_transfers() { + let caps = default_caps(); + // Disk to host - direct NIXL + assert_eq!( + select_direct_strategy(StorageKind::Disk(42), StorageKind::System, false, &caps), + TransferPlan::Direct(TransferStrategy::NixlReadFlipped) + ); + assert_eq!( + select_direct_strategy(StorageKind::Disk(42), StorageKind::Pinned, false, &caps), + TransferPlan::Direct(TransferStrategy::NixlReadFlipped) + ); + } + + #[test] + fn test_host_to_disk_transfers() { + let caps = default_caps(); + // Host to disk - direct NIXL + assert_eq!( + select_direct_strategy(StorageKind::System, StorageKind::Disk(42), false, &caps), + TransferPlan::Direct(TransferStrategy::NixlWrite) + ); + assert_eq!( + select_direct_strategy(StorageKind::Pinned, StorageKind::Disk(42), false, &caps), + TransferPlan::Direct(TransferStrategy::NixlWrite) + ); + } + + #[test] + fn test_device_to_disk_without_gds() { + let caps = default_caps(); // GDS disabled + // Device → Disk should use bounce buffer + let plan = + 
select_direct_strategy(StorageKind::Device(0), StorageKind::Disk(42), false, &caps); + match plan { + TransferPlan::TwoHop { + first, + bounce_location, + second, + } => { + assert_eq!(first, TransferStrategy::CudaAsyncD2H); + assert_eq!(bounce_location, StorageKind::Pinned); + assert_eq!(second, TransferStrategy::NixlWrite); + } + _ => panic!("Expected TwoHop plan"), + } + } + + #[test] + fn test_disk_to_device_without_gds() { + let caps = default_caps(); // GDS disabled + // Disk → Device should use bounce buffer + let plan = + select_direct_strategy(StorageKind::Disk(42), StorageKind::Device(0), false, &caps); + match plan { + TransferPlan::TwoHop { + first, + bounce_location, + second, + } => { + assert_eq!(first, TransferStrategy::NixlReadFlipped); + assert_eq!(bounce_location, StorageKind::Pinned); + assert_eq!(second, TransferStrategy::CudaAsyncH2D); + } + _ => panic!("Expected TwoHop plan"), + } + } + + #[test] + fn test_device_to_disk_with_gds() { + let caps = TransferCapabilities::default().with_gds(true); + // Device → Disk should be direct with GDS + assert_eq!( + select_direct_strategy(StorageKind::Device(0), StorageKind::Disk(42), false, &caps), + TransferPlan::Direct(TransferStrategy::NixlWrite) + ); + } + + #[test] + fn test_disk_to_device_with_gds() { + let caps = TransferCapabilities::default().with_gds(true); + // Disk → Device should be direct with GDS + assert_eq!( + select_direct_strategy(StorageKind::Disk(42), StorageKind::Device(0), false, &caps), + TransferPlan::Direct(TransferStrategy::NixlRead) + ); + } + + #[test] + fn test_host_to_remote() { + let caps = default_caps(); + // Host → Remote - always direct + assert_eq!( + select_direct_strategy(StorageKind::System, StorageKind::System, true, &caps), + TransferPlan::Direct(TransferStrategy::NixlWrite) + ); + assert_eq!( + select_direct_strategy(StorageKind::Pinned, StorageKind::Pinned, true, &caps), + TransferPlan::Direct(TransferStrategy::NixlWrite) + ); + } + + #[test] + fn test_device_to_remote_without_rdma() { + let caps = default_caps(); // GPU RDMA disabled + // Device → Remote should use bounce buffer + let plan = select_direct_strategy(StorageKind::Device(0), StorageKind::System, true, &caps); + match plan { + TransferPlan::TwoHop { + first, + bounce_location, + second, + } => { + assert_eq!(first, TransferStrategy::CudaAsyncD2H); + assert_eq!(bounce_location, StorageKind::Pinned); + assert_eq!(second, TransferStrategy::NixlWrite); + } + _ => panic!("Expected TwoHop plan"), + } + } + + #[test] + fn test_device_to_remote_with_rdma() { + let caps = TransferCapabilities::default().with_gpu_rdma(true); + // Device → Remote should be direct with GPU RDMA + assert_eq!( + select_direct_strategy(StorageKind::Device(0), StorageKind::Device(0), true, &caps), + TransferPlan::Direct(TransferStrategy::NixlWrite) + ); + } + + #[test] + fn test_disk_to_remote() { + let caps = default_caps(); + // Disk → Remote always uses bounce buffer + let plan = select_direct_strategy(StorageKind::Disk(42), StorageKind::System, true, &caps); + match plan { + TransferPlan::TwoHop { + first, + bounce_location, + second, + } => { + assert_eq!(first, TransferStrategy::NixlWrite); + assert_eq!(bounce_location, StorageKind::Pinned); + assert_eq!(second, TransferStrategy::NixlWrite); + } + _ => panic!("Expected TwoHop plan"), + } + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/testing.rs b/lib/llm/src/block_manager/v2/physical/transfer/testing.rs new file mode 100644 index 0000000000..c675a8c5b6 --- /dev/null +++ 
b/lib/llm/src/block_manager/v2/physical/transfer/testing.rs @@ -0,0 +1,363 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Round-trip testing infrastructure for transfer verification. +//! +//! This module provides utilities for testing data integrity across transfers +//! by comparing checksums after round-trip operations: +//! 1. Source blocks (host) → Intermediate (device/disk/remote) +//! 2. Intermediate → Destination blocks (host, different IDs) +//! 3. Verify checksums match between source and destination + +use super::{ + BlockChecksum, FillPattern, PhysicalLayout, StorageKind, compute_block_checksums, + fill_blocks, transfer_blocks, +}; +use super::context::TransferContext; +use anyhow::{Result, anyhow}; +use std::collections::HashMap; + +/// Result of a round-trip test. +#[derive(Debug)] +pub struct RoundTripTestResult { + /// Source block checksums (keyed by source block ID) + pub source_checksums: HashMap, + + /// Destination block checksums (keyed by destination block ID) + pub dest_checksums: HashMap, + + /// Block ID mapping used (src_id, dst_id) + pub block_mapping: Vec<(usize, usize)>, + + /// Whether all checksums matched + pub success: bool, + + /// Mismatched blocks (if any) + pub mismatches: Vec<(usize, usize)>, // (src_id, dst_id) pairs that didn't match +} + +impl RoundTripTestResult { + /// Check if the round-trip test passed. + pub fn is_success(&self) -> bool { + self.success + } + + /// Get the number of blocks tested. + pub fn num_blocks(&self) -> usize { + self.block_mapping.len() + } + + /// Get a detailed report of the test results. + pub fn report(&self) -> String { + if self.success { + format!( + "Round-trip test PASSED: {}/{} blocks verified successfully", + self.num_blocks(), + self.num_blocks() + ) + } else { + format!( + "Round-trip test FAILED: {}/{} blocks mismatched\nMismatches: {:?}", + self.mismatches.len(), + self.num_blocks(), + self.mismatches + ) + } + } +} + +/// Builder for round-trip tests. +/// +/// This allows configuring a test that transfers data from source blocks +/// to intermediate storage and back to different destination blocks, +/// verifying data integrity via checksums. +pub struct RoundTripTest { + /// Source physical layout (must be local) + source: PhysicalLayout, + + /// Intermediate physical layout (can be remote/device/disk) + intermediate: PhysicalLayout, + + /// Destination physical layout (must be local) + destination: PhysicalLayout, + + /// Block mapping: (src_id, intermediate_id, dst_id) + block_mapping: Vec<(usize, usize, usize)>, + + /// Fill pattern for source blocks + fill_pattern: FillPattern, +} + +impl RoundTripTest { + /// Create a new round-trip test. + /// + /// # Arguments + /// * `source` - Source physical layout (must be local) + /// * `intermediate` - Intermediate physical layout + /// * `destination` - Destination physical layout (must be local) + pub fn new( + source: PhysicalLayout, + intermediate: PhysicalLayout, + destination: PhysicalLayout, + ) -> Result { + if source.is_remote() { + return Err(anyhow!("Source layout must be local")); + } + if destination.is_remote() { + return Err(anyhow!("Destination layout must be local")); + } + + Ok(Self { + source, + intermediate, + destination, + block_mapping: Vec::new(), + fill_pattern: FillPattern::Sequential, + }) + } + + /// Set the fill pattern for source blocks. 
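A minimal sketch of driving `RoundTripTest`, assuming the three layouts and a `TransferContext` have been constructed as in the unit tests at the bottom of this file:

```rust,ignore
let result = RoundTripTest::new(source, intermediate, destination)?
    .with_fill_pattern(FillPattern::Sequential)
    .add_block_mapping(0, 0, 1)
    .add_block_mapping(1, 1, 2)
    .run(&ctx)
    .await?;
assert!(result.is_success(), "{}", result.report());
```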
+ pub fn with_fill_pattern(mut self, pattern: FillPattern) -> Self { + self.fill_pattern = pattern; + self + } + + /// Add a block mapping for the round-trip test. + /// + /// # Arguments + /// * `src_id` - Source block ID + /// * `intermediate_id` - Intermediate block ID + /// * `dst_id` - Destination block ID + pub fn add_block_mapping( + mut self, + src_id: usize, + intermediate_id: usize, + dst_id: usize, + ) -> Self { + self.block_mapping.push((src_id, intermediate_id, dst_id)); + self + } + + /// Add multiple block mappings at once. + /// + /// This is a convenience method for adding several mappings. + pub fn with_block_mappings(mut self, mappings: &[(usize, usize, usize)]) -> Self { + self.block_mapping.extend_from_slice(mappings); + self + } + + /// Run the round-trip test. + /// + /// # Workflow + /// 1. Fill source blocks with the specified pattern + /// 2. Compute source checksums + /// 3. Transfer source → intermediate + /// 4. Transfer intermediate → destination + /// 5. Compute destination checksums + /// 6. Compare checksums + /// + /// # Arguments + /// * `ctx` - Transfer context with CUDA stream and NIXL agent + pub async fn run(self, ctx: &TransferContext) -> Result { + if self.block_mapping.is_empty() { + return Err(anyhow!("No block mappings specified")); + } + + // Step 1: Fill source blocks + let src_ids: Vec = self.block_mapping.iter().map(|(src, _, _)| *src).collect(); + fill_blocks(&self.source, &src_ids, self.fill_pattern)?; + + // Step 2: Compute source checksums + let source_checksums = compute_block_checksums(&self.source, &src_ids)?; + + // Step 3: Transfer source → intermediate + let src_ids_intermediate: Vec = + self.block_mapping.iter().map(|(src, _, _)| *src).collect(); + let inter_ids_from_src: Vec = self + .block_mapping + .iter() + .map(|(_, inter, _)| *inter) + .collect(); + let notification = transfer_blocks( + &self.source, + &self.intermediate, + &src_ids_intermediate, + &inter_ids_from_src, + ctx, + )?; + notification.await?; + + // Step 4: Transfer intermediate → destination + let inter_ids_to_dst: Vec = self + .block_mapping + .iter() + .map(|(_, inter, _)| *inter) + .collect(); + let dst_ids_from_inter: Vec = + self.block_mapping.iter().map(|(_, _, dst)| *dst).collect(); + let notification = transfer_blocks( + &self.intermediate, + &self.destination, + &inter_ids_to_dst, + &dst_ids_from_inter, + ctx, + )?; + notification.await?; + + // Step 5: Compute destination checksums + let dst_ids: Vec = self.block_mapping.iter().map(|(_, _, dst)| *dst).collect(); + let dest_checksums = compute_block_checksums(&self.destination, &dst_ids)?; + + // Step 6: Compare checksums + let mut mismatches = Vec::new(); + for (src_id, _, dst_id) in &self.block_mapping { + let src_checksum = &source_checksums[src_id]; + let dst_checksum = &dest_checksums[dst_id]; + + if src_checksum != dst_checksum { + mismatches.push((*src_id, *dst_id)); + } + } + + let success = mismatches.is_empty(); + let block_mapping: Vec<(usize, usize)> = self + .block_mapping + .iter() + .map(|(src, _, dst)| (*src, *dst)) + .collect(); + + Ok(RoundTripTestResult { + source_checksums, + dest_checksums, + block_mapping, + success, + mismatches, + }) + } +} + +#[cfg(test, features = "testing-cuda")] +mod tests { + use super::*; + use crate::block_manager::v2::layout::{ + FullyContiguousLayout, Layout, LayoutConfig, MemoryRegion, OwnedMemoryRegion, + }; + use std::sync::Arc; + + // Helper to create a minimal transfer context for testing + // In real tests with CUDA/NIXL, this would be properly 
constructed + fn create_test_context() -> TransferContext { + // For now, we'll skip these tests if CUDA is not available + // In the future, we can mock TransferContext or use conditional compilation + todo!("Create test context - requires CUDA/NIXL setup") + } + + #[tokio::test] + async fn test_round_trip_host_to_host() { + // Create three layouts: source, intermediate, destination + let (src_layout, _src_mem) = create_test_layout(4); + let (inter_layout, _inter_mem) = create_test_layout(4); + let (dst_layout, _dst_mem) = create_test_layout(4); + + let source = PhysicalLayout::new_local(src_layout, StorageKind::System); + let intermediate = PhysicalLayout::new_local(inter_layout, StorageKind::Pinned); + let destination = PhysicalLayout::new_local(dst_layout, StorageKind::System); + + // Build round-trip test with different block IDs + // Source: blocks [0, 1, 2, 3] + // Intermediate: blocks [0, 1, 2, 3] + // Destination: blocks [0, 1, 2, 3] (different memory than source) + let test = RoundTripTest::new(source, intermediate, destination) + .unwrap() + .with_fill_pattern(FillPattern::Sequential) + .add_block_mapping(0, 0, 0) + .add_block_mapping(1, 1, 1) + .add_block_mapping(2, 2, 2) + .add_block_mapping(3, 3, 3); + + // Create a transfer context (requires actual CUDA/NIXL setup) + let ctx = create_test_context(); + + // Run the test + let result = test.run(&ctx).await.unwrap(); + + assert!(result.is_success(), "{}", result.report()); + assert_eq!(result.num_blocks(), 4); + } + + #[tokio::test] + async fn test_round_trip_different_block_ids() { + // Create layouts with enough blocks + let (src_layout, _src_mem) = create_test_layout(8); + let (inter_layout, _inter_mem) = create_test_layout(8); + let (dst_layout, _dst_mem) = create_test_layout(8); + + let source = PhysicalLayout::new_local(src_layout, StorageKind::System); + let intermediate = PhysicalLayout::new_local(inter_layout, StorageKind::Pinned); + let destination = PhysicalLayout::new_local(dst_layout, StorageKind::System); + + // Test with non-overlapping block IDs + // Source: blocks [0, 1, 2, 3] + // Intermediate: blocks [2, 3, 4, 5] + // Destination: blocks [4, 5, 6, 7] + let test = RoundTripTest::new(source, intermediate, destination) + .unwrap() + .with_fill_pattern(FillPattern::BlockBased) + .with_block_mappings(&[(0, 2, 4), (1, 3, 5), (2, 4, 6), (3, 5, 7)]); + + let ctx = create_test_context(); + let result = test.run(&ctx).await.unwrap(); + + assert!(result.is_success(), "{}", result.report()); + assert_eq!(result.num_blocks(), 4); + } + + #[test] + fn test_round_trip_builder() { + let (src_layout, _) = create_test_layout(4); + let (inter_layout, _) = create_test_layout(4); + let (dst_layout, _) = create_test_layout(4); + + let source = PhysicalLayout::new_local(src_layout, StorageKind::System); + let intermediate = PhysicalLayout::new_local(inter_layout, StorageKind::Pinned); + let destination = PhysicalLayout::new_local(dst_layout, StorageKind::System); + + let test = RoundTripTest::new(source, intermediate, destination) + .unwrap() + .with_fill_pattern(FillPattern::Constant(42)) + .add_block_mapping(0, 0, 1) + .add_block_mapping(1, 1, 2); + + assert_eq!(test.block_mapping.len(), 2); + } + + #[test] + fn test_round_trip_requires_local_source() { + let (src_layout, _) = create_test_layout(1); + let (inter_layout, _) = create_test_layout(1); + let (dst_layout, _) = create_test_layout(1); + + let source = + PhysicalLayout::new_remote(src_layout, StorageKind::System, "remote".to_string()); + let intermediate = 
PhysicalLayout::new_local(inter_layout, StorageKind::Pinned); + let destination = PhysicalLayout::new_local(dst_layout, StorageKind::System); + + let result = RoundTripTest::new(source, intermediate, destination); + assert!(result.is_err()); + } + + #[test] + fn test_round_trip_requires_local_destination() { + let (src_layout, _) = create_test_layout(1); + let (inter_layout, _) = create_test_layout(1); + let (dst_layout, _) = create_test_layout(1); + + let source = PhysicalLayout::new_local(src_layout, StorageKind::System); + let intermediate = PhysicalLayout::new_local(inter_layout, StorageKind::Pinned); + let destination = + PhysicalLayout::new_remote(dst_layout, StorageKind::System, "remote".to_string()); + + let result = RoundTripTest::new(source, intermediate, destination); + assert!(result.is_err()); + } +} diff --git a/lib/llm/src/block_manager/v2/physical/transfer/tests/local_transfers.rs b/lib/llm/src/block_manager/v2/physical/transfer/tests/local_transfers.rs new file mode 100644 index 0000000000..846a97206f --- /dev/null +++ b/lib/llm/src/block_manager/v2/physical/transfer/tests/local_transfers.rs @@ -0,0 +1,976 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Local transfer tests where source and destination use the same NIXL agent. +//! +//! These tests verify data integrity across: +//! - Different storage types (System, Pinned, Device) +//! - Different layout types (Fully Contiguous, Layer-wise) +//! - Different transfer strategies (Memcpy, CUDA H2D/D2H) + +use super::*; +use crate::block_manager::v2::physical::layout::BlockDimension; +use crate::block_manager::v2::physical::transfer::executor::execute_transfer; +use crate::block_manager::v2::physical::transfer::{ + BlockChecksum, BounceBufferSpec, FillPattern, StorageKind, TransferCapabilities, + TransferOptions, compute_block_checksums, compute_layer_checksums, fill_blocks, fill_layers, +}; +use anyhow::Result; +use rstest::rstest; +use std::collections::HashMap; +use std::ops::Range; +use std::sync::Arc; + +// ============================================================================ +// System <=> System Tests (Memcpy) +// ============================================================================ + +#[derive(Clone)] +enum LayoutType { + FC, + LW, +} + +fn build_layout( + agent: NixlAgent, + layout_type: LayoutType, + storage_kind: StorageKind, + num_blocks: usize, +) -> PhysicalLayout { + match layout_type { + LayoutType::FC => create_fc_layout(agent, storage_kind, num_blocks), + LayoutType::LW => create_lw_layout(agent, storage_kind, num_blocks), + } +} + +/// Layout kind for parameterized testing. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum LayoutKind { + /// Fully contiguous layout + FC, + /// Layer-wise (layer-separate) layout + LW, +} + +/// Storage and layout specification for creating test layouts. +#[derive(Debug, Clone, Copy)] +pub struct LayoutSpec { + pub kind: LayoutKind, + pub storage: StorageKind, +} + +impl LayoutSpec { + pub fn new(kind: LayoutKind, storage: StorageKind) -> Self { + Self { kind, storage } + } +} + +/// Transfer mode for parameterized testing. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TransferMode { + /// Transfer entire blocks (all layers) + FullBlocks, + /// Transfer only the first layer + FirstLayerOnly, + /// Transfer only the second layer + SecondLayerOnly, +} + +impl TransferMode { + /// Convert to optional layer range for execute_transfer. 
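As the parameterized tests below do, the mode's range can be fed directly into the transfer options; a tiny sketch:

```rust,ignore
let mode = TransferMode::FirstLayerOnly;
// Restrict the transfer to layer 0 of each selected block.
let options = TransferOptions::from_layer_range(mode.layer_range());
```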
+ pub fn layer_range(&self) -> Option> { + match self { + TransferMode::FullBlocks => None, + TransferMode::FirstLayerOnly => Some(0..1), + TransferMode::SecondLayerOnly => Some(1..2), + } + } + + /// Get a descriptive suffix for test names. + pub fn suffix(&self) -> &'static str { + match self { + TransferMode::FullBlocks => "full", + TransferMode::FirstLayerOnly => "layer0", + TransferMode::SecondLayerOnly => "layer1", + } + } +} + +/// Create a fully contiguous physical layout with the specified storage type. +pub fn create_fc_layout( + agent: NixlAgent, + storage_kind: StorageKind, + num_blocks: usize, +) -> PhysicalLayout { + let config = standard_config(num_blocks); + let builder = PhysicalLayout::builder(agent) + .with_config(config) + .fully_contiguous(); + + match storage_kind { + StorageKind::System => builder.allocate_system().build().unwrap(), + StorageKind::Pinned => builder.allocate_pinned(false).build().unwrap(), + StorageKind::Device(device_id) => builder.allocate_device(device_id).build().unwrap(), + StorageKind::Disk(_) => builder.allocate_disk(None).build().unwrap(), + } +} + +/// Create a layer-separate physical layout with the specified storage type. +pub fn create_lw_layout( + agent: NixlAgent, + storage_kind: StorageKind, + num_blocks: usize, +) -> PhysicalLayout { + let config = standard_config(num_blocks); + let builder = PhysicalLayout::builder(agent) + .with_config(config) + .layer_separate(BlockDimension::BlockIsFirstDim); + + match storage_kind { + StorageKind::System => builder.allocate_system().build().unwrap(), + StorageKind::Pinned => builder.allocate_pinned(false).build().unwrap(), + StorageKind::Device(device_id) => builder.allocate_device(device_id).build().unwrap(), + StorageKind::Disk(_) => builder.allocate_disk(None).build().unwrap(), + } +} + +/// Create a physical layout based on the specification. +/// +/// This is a DRY helper that dispatches to create_fc_layout or create_lw_layout +/// based on the layout kind in the spec. +pub fn create_layout(agent: NixlAgent, spec: LayoutSpec, num_blocks: usize) -> PhysicalLayout { + match spec.kind { + LayoutKind::FC => create_fc_layout(agent, spec.storage, num_blocks), + LayoutKind::LW => create_lw_layout(agent, spec.storage, num_blocks), + } +} + +/// Fill blocks or layers based on transfer mode and compute checksums. +/// +/// This is a mode-aware version of fill_and_checksum that handles both +/// full block transfers and layer-wise transfers. +pub fn fill_and_checksum_with_mode( + layout: &PhysicalLayout, + block_ids: &[usize], + pattern: FillPattern, + mode: TransferMode, +) -> Result> { + match mode { + TransferMode::FullBlocks => { + fill_blocks(layout, block_ids, pattern)?; + compute_block_checksums(layout, block_ids) + } + TransferMode::FirstLayerOnly => { + fill_layers(layout, block_ids, 0..1, pattern)?; + compute_layer_checksums(layout, block_ids, 0..1) + } + TransferMode::SecondLayerOnly => { + fill_layers(layout, block_ids, 1..2, pattern)?; + compute_layer_checksums(layout, block_ids, 1..2) + } + } +} + +/// Verify checksums with transfer mode awareness. +/// +/// This is a mode-aware version that handles both full block and layer-wise verification. 
+pub fn verify_checksums_by_position_with_mode( + src_checksums: &HashMap, + src_block_ids: &[usize], + dst_layout: &PhysicalLayout, + dst_block_ids: &[usize], + mode: TransferMode, +) -> Result<()> { + assert_eq!( + src_block_ids.len(), + dst_block_ids.len(), + "Source and destination block arrays must have same length" + ); + + let dst_checksums = match mode { + TransferMode::FullBlocks => compute_block_checksums(dst_layout, dst_block_ids)?, + TransferMode::FirstLayerOnly => compute_layer_checksums(dst_layout, dst_block_ids, 0..1)?, + TransferMode::SecondLayerOnly => compute_layer_checksums(dst_layout, dst_block_ids, 1..2)?, + }; + + for (src_id, dst_id) in src_block_ids.iter().zip(dst_block_ids.iter()) { + let src_checksum = src_checksums + .get(src_id) + .unwrap_or_else(|| panic!("Missing source checksum for block {}", src_id)); + let dst_checksum = dst_checksums + .get(dst_id) + .unwrap_or_else(|| panic!("Missing destination checksum for block {}", dst_id)); + + assert_eq!( + src_checksum, dst_checksum, + "Checksum mismatch (mode={:?}): src[{}] != dst[{}]: {} != {}", + mode, src_id, dst_id, src_checksum, dst_checksum + ); + } + + Ok(()) +} + +/// Create a test agent with specific backends. +pub fn create_test_agent_with_backends(name: &str, backends: &[&str]) -> Result { + NixlAgent::new_with_backends(name, backends) +} + +/// Create a transport manager for testing with the specified agent. +/// +/// Note: The agent should already have backends configured. Use `create_test_agent` +/// or `build_agent_with_backends` to create properly configured agents. +pub fn create_transfer_context( + agent: NixlAgent, + capabilities: Option, +) -> Result { + crate::block_manager::v2::physical::manager::TransportManager::builder() + .capabilities(capabilities.unwrap_or_default()) + .worker_id(0) // Default worker ID for local tests + .nixl_agent(agent) + .cuda_device_id(0) + .build() +} + +/// Fill blocks and compute checksums. +/// +/// This can only be called on System or Pinned layouts. +pub fn fill_and_checksum( + layout: &PhysicalLayout, + block_ids: &[usize], + pattern: FillPattern, +) -> Result> { + fill_blocks(layout, block_ids, pattern)?; + compute_block_checksums(layout, block_ids) +} + +/// Verify that destination block checksums match the expected source checksums. +/// +/// This function compares checksums in order, assuming the source and destination +/// block arrays have a 1:1 correspondence (src[i] was transferred to dst[i]). +pub fn verify_checksums_by_position( + src_checksums: &HashMap, + src_block_ids: &[usize], + dst_layout: &PhysicalLayout, + dst_block_ids: &[usize], +) -> Result<()> { + assert_eq!( + src_block_ids.len(), + dst_block_ids.len(), + "Source and destination block arrays must have same length" + ); + + let dst_checksums = compute_block_checksums(dst_layout, dst_block_ids)?; + + for (src_id, dst_id) in src_block_ids.iter().zip(dst_block_ids.iter()) { + let src_checksum = src_checksums + .get(src_id) + .unwrap_or_else(|| panic!("Missing source checksum for block {}", src_id)); + let dst_checksum = dst_checksums + .get(dst_id) + .unwrap_or_else(|| panic!("Missing destination checksum for block {}", dst_id)); + + assert_eq!( + src_checksum, dst_checksum, + "Checksum mismatch: src[{}] != dst[{}]: {} != {}", + src_id, dst_id, src_checksum, dst_checksum + ); + } + + Ok(()) +} + +/// Fill guard blocks and return their checksums for later verification. +/// +/// Guard blocks are blocks adjacent to transfer destinations that should +/// remain unchanged during transfers. 
This function fills them with a +/// distinctive pattern and returns their checksums for later validation. +/// +/// # Arguments +/// * `layout` - The physical layout containing the guard blocks +/// * `guard_block_ids` - Block IDs to use as guards +/// * `pattern` - Fill pattern for guard blocks (typically a constant like 0xFF) +/// +/// # Returns +/// A map of block ID to checksum for all guard blocks +pub fn create_guard_blocks( + layout: &PhysicalLayout, + guard_block_ids: &[usize], + pattern: FillPattern, +) -> Result> { + fill_blocks(layout, guard_block_ids, pattern)?; + compute_block_checksums(layout, guard_block_ids) +} + +/// Verify that guard blocks remain unchanged after transfers. +/// +/// This function compares the current checksums of guard blocks against +/// their expected values. Any mismatch indicates memory corruption or +/// unintended overwrites during transfer operations. +/// +/// # Arguments +/// * `layout` - The physical layout containing the guard blocks +/// * `guard_block_ids` - Block IDs to verify +/// * `expected_checksums` - Expected checksums from create_guard_blocks +/// +/// # Errors +/// Returns an error if any guard block checksum has changed +pub fn verify_guard_blocks_unchanged( + layout: &PhysicalLayout, + guard_block_ids: &[usize], + expected_checksums: &HashMap, +) -> Result<()> { + let current_checksums = compute_block_checksums(layout, guard_block_ids)?; + + for &block_id in guard_block_ids { + let expected = expected_checksums + .get(&block_id) + .unwrap_or_else(|| panic!("Missing expected checksum for guard block {}", block_id)); + let current = current_checksums + .get(&block_id) + .unwrap_or_else(|| panic!("Missing current checksum for guard block {}", block_id)); + + if expected != current { + return Err(anyhow::anyhow!( + "Guard block {} was modified during transfer! 
Expected: {}, Got: {}", + block_id, + expected, + current + )); + } + } + + Ok(()) +} + +struct DummyBounceBufferSpec { + pub layout: PhysicalLayout, + pub block_ids: Vec, +} + +impl BounceBufferSpec for DummyBounceBufferSpec { + fn layout(&self) -> &PhysicalLayout { + &self.layout + } + fn block_ids(&self) -> &[usize] { + &self.block_ids + } +} + +fn build_agent_for_kinds(src_kind: StorageKind, dst_kind: StorageKind) -> Result { + use std::collections::HashSet; + + let mut backends = HashSet::new(); + + // Determine required backends for both source and destination + for kind in [src_kind, dst_kind] { + match kind { + StorageKind::System | StorageKind::Pinned => { + backends.insert("POSIX"); // Lightweight for DRAM + } + StorageKind::Device(_) => { + backends.insert("UCX"); // Required for VRAM (expensive) + } + StorageKind::Disk(_) => { + backends.insert("POSIX"); // Required for disk I/O + } + } + } + + // Optional: Add GDS for Device <-> Disk optimization + match (src_kind, dst_kind) { + (StorageKind::Device(_), StorageKind::Disk(_)) + | (StorageKind::Disk(_), StorageKind::Device(_)) => { + backends.insert("GDS_MT"); + } + _ => {} + } + + let backend_vec: Vec<&str> = backends.into_iter().collect(); + create_test_agent_with_backends("agent", &backend_vec) +} + +#[rstest] +#[tokio::test] +async fn test_p2p( + #[values(LayoutType::FC, LayoutType::LW)] src_layout: LayoutType, + #[values( + StorageKind::System, + StorageKind::Pinned, + StorageKind::Device(0), + StorageKind::Disk(0) + )] + src_kind: StorageKind, + #[values(LayoutType::FC, LayoutType::LW)] dst_layout: LayoutType, + #[values( + StorageKind::System, + StorageKind::Pinned, + StorageKind::Device(0), + StorageKind::Disk(0) + )] + dst_kind: StorageKind, +) -> Result<()> { + use crate::block_manager::v2::physical::transfer::TransferOptions; + + let agent = build_agent_for_kinds(src_kind, dst_kind)?; + + let src = build_layout(agent.clone(), src_layout, src_kind, 4); + let dst = build_layout(agent.clone(), dst_layout, dst_kind, 4); + + let bounce_layout = build_layout(agent.clone(), LayoutType::FC, StorageKind::Pinned, 4); + + let bounce_buffer_spec: Arc = Arc::new(DummyBounceBufferSpec { + layout: bounce_layout, + block_ids: vec![0, 1], + }); + + let src_blocks = vec![0, 1]; + let dst_blocks = vec![2, 3]; + + let checksums = fill_and_checksum(&src, &src_blocks, FillPattern::Sequential)?; + let ctx = create_transfer_context(agent, None).unwrap(); + + let options = TransferOptions::builder() + .bounce_buffer(bounce_buffer_spec) + .build()?; + + let notification = + execute_transfer(&src, &dst, &src_blocks, &dst_blocks, options, ctx.context())?; + notification.await?; + + verify_checksums_by_position(&checksums, &src_blocks, &dst, &dst_blocks)?; + + Ok(()) +} + +#[rstest] +#[tokio::test] +async fn test_roundtrip( + #[values(LayoutType::FC, LayoutType::LW)] src_layout: LayoutType, + #[values(StorageKind::System, StorageKind::Pinned, StorageKind::Device(0))] + src_kind: StorageKind, + #[values(LayoutType::FC, LayoutType::LW)] inter_layout: LayoutType, + #[values(StorageKind::System, StorageKind::Pinned, StorageKind::Device(0))] + inter_kind: StorageKind, + #[values(LayoutType::FC, LayoutType::LW)] dst_layout: LayoutType, + #[values(StorageKind::System, StorageKind::Pinned, StorageKind::Device(0))] + dst_kind: StorageKind, +) -> Result<()> { + let agent = build_agent_for_kinds(src_kind, dst_kind)?; + + // Create layouts: source pinned, device intermediate, destination pinned + let src = build_layout(agent.clone(), src_layout, src_kind, 
4); + let device = build_layout(agent.clone(), inter_layout, inter_kind, 4); + let dst = build_layout(agent.clone(), dst_layout, dst_kind, 4); + + let src_blocks = vec![0, 1]; + let device_blocks = vec![0, 1]; + let dst_blocks = vec![2, 3]; + + // Fill source and compute checksums + let checksums = fill_and_checksum(&src, &src_blocks, FillPattern::Sequential)?; + let ctx = create_transfer_context(agent, None).unwrap(); + + // Transfer: Pinned[0,1] -> Device[0,1] + let notification = execute_transfer( + &src, + &device, + &src_blocks, + &device_blocks, + TransferOptions::default(), + ctx.context(), + )?; + notification.await?; + + // Transfer: Device[0,1] -> Pinned[2,3] + let notification = execute_transfer( + &device, + &dst, + &device_blocks, + &dst_blocks, + TransferOptions::default(), + ctx.context(), + )?; + notification.await?; + + // Verify checksums match + verify_checksums_by_position(&checksums, &src_blocks, &dst, &dst_blocks)?; + + Ok(()) +} + +#[rstest] +#[case(StorageKind::Device(0), StorageKind::Disk(0))] +#[case(StorageKind::Disk(0), StorageKind::Device(0))] +#[tokio::test] +async fn test_gds( + #[case] src_kind: StorageKind, + #[values(LayoutType::FC, LayoutType::LW)] src_layout: LayoutType, + #[case] dst_kind: StorageKind, + #[values(LayoutType::FC, LayoutType::LW)] dst_layout: LayoutType, +) -> Result<()> { + let capabilities = TransferCapabilities::default().with_gds_if_supported(); + + if !capabilities.allow_gds { + println!("System does not support GDS. Skipping test."); + return Ok(()); + } + + let agent = build_agent_for_kinds(src_kind, dst_kind)?; + + let src = build_layout(agent.clone(), src_layout, src_kind, 4); + let dst = build_layout(agent.clone(), dst_layout, dst_kind, 4); + + let src_blocks = vec![0, 1]; + let dst_blocks = vec![2, 3]; + + let checksums = fill_and_checksum(&src, &src_blocks, FillPattern::Sequential)?; + let ctx = create_transfer_context(agent, Some(capabilities)).unwrap(); + + let notification = execute_transfer( + &src, + &dst, + &src_blocks, + &dst_blocks, + TransferOptions::default(), + ctx.context(), + )?; + notification.await?; + + verify_checksums_by_position(&checksums, &src_blocks, &dst, &dst_blocks)?; + + Ok(()) +} + +#[rstest] +#[case(StorageKind::Device(0), StorageKind::Disk(0))] +#[case(StorageKind::Disk(0), StorageKind::Device(0))] +#[tokio::test] +async fn test_buffered_transfer( + #[case] src_kind: StorageKind, + #[values(LayoutType::FC, LayoutType::LW)] src_layout: LayoutType, + #[case] dst_kind: StorageKind, + #[values(LayoutType::FC, LayoutType::LW)] dst_layout: LayoutType, +) -> Result<()> { + let agent = build_agent_for_kinds(src_kind, dst_kind)?; + + let src = build_layout(agent.clone(), src_layout, src_kind, 5); + let dst = build_layout(agent.clone(), dst_layout, dst_kind, 5); + + let src_blocks = vec![0, 1, 2, 3, 4]; + let dst_blocks = vec![4, 3, 2, 1, 0]; + + let bounce_layout = build_layout(agent.clone(), LayoutType::FC, StorageKind::Pinned, 3); + let bounce_buffer_spec: Arc = Arc::new(DummyBounceBufferSpec { + layout: bounce_layout, + block_ids: vec![0, 1, 2], + }); + + let checksums = fill_and_checksum(&src, &src_blocks, FillPattern::Sequential)?; + let ctx = create_transfer_context(agent, None).unwrap(); + + let notification = execute_transfer( + &src, + &dst, + &src_blocks, + &dst_blocks, + TransferOptions::builder() + .bounce_buffer(bounce_buffer_spec) + .build()?, + ctx.context(), + )?; + notification.await?; + + verify_checksums_by_position(&checksums, &src_blocks, &dst, &dst_blocks)?; + + Ok(()) +} + +#[rstest] 
+#[case(1024)] +#[case(2048)] +#[case(4096)] +#[case(8192)] +#[case(16384)] +#[tokio::test] +async fn test_large_block_counts(#[case] block_count: usize) { + let agent = create_test_agent(&format!("test_large_block_counts_{}", block_count)); + + let src = create_fc_layout(agent.clone(), StorageKind::Pinned, block_count); + let device = create_fc_layout(agent.clone(), StorageKind::Device(0), block_count); + + let src_blocks = (0..block_count).collect::>(); + let device_blocks = (0..block_count).collect::>(); + + let ctx = create_transfer_context(agent, None).unwrap(); + let notification = execute_transfer( + &src, + &device, + &src_blocks, + &device_blocks, + TransferOptions::default(), + ctx.context(), + ) + .unwrap(); + notification.await.unwrap(); +} + +// ============================================================================ +// Parameterized Bounce Tests with Guard Block Validation +// ============================================================================ + +/// Test bounce transfers with guard block validation. +/// +/// This test validates that: +/// 1. Data can be transferred: host[src_blocks] → bounce[src_blocks] → host[dst_blocks] +/// 2. Guard blocks adjacent to dst_blocks remain unchanged (no memory corruption) +/// 3. Works correctly with different storage types, layouts, and transfer modes +/// +/// Test pattern (6 blocks total): +/// - Source blocks: [0, 1] +/// - Destination blocks: [3, 4] +/// - Guard blocks: [2, 5] (adjacent to destination, should remain unchanged) +#[rstest] +// Storage combinations (host, bounce) +#[case(StorageKind::System, StorageKind::Pinned, "sys_pin")] +#[case(StorageKind::Pinned, StorageKind::System, "pin_sys")] +#[case(StorageKind::Pinned, StorageKind::Device(0), "pin_dev")] +#[tokio::test] +async fn test_bounce_with_guards_fc_fc_full( + #[case] host_storage: StorageKind, + #[case] bounce_storage: StorageKind, + #[case] name_suffix: &str, +) { + test_bounce_with_guards_impl( + host_storage, + bounce_storage, + LayoutKind::FC, + LayoutKind::FC, + TransferMode::FullBlocks, + name_suffix, + ) + .await + .unwrap(); +} + +#[rstest] +#[case(StorageKind::System, StorageKind::Pinned, "sys_pin")] +#[case(StorageKind::Pinned, StorageKind::System, "pin_sys")] +#[case(StorageKind::Pinned, StorageKind::Device(0), "pin_dev")] +#[tokio::test] +async fn test_bounce_with_guards_fc_lw_full( + #[case] host_storage: StorageKind, + #[case] bounce_storage: StorageKind, + #[case] name_suffix: &str, +) { + test_bounce_with_guards_impl( + host_storage, + bounce_storage, + LayoutKind::FC, + LayoutKind::LW, + TransferMode::FullBlocks, + name_suffix, + ) + .await + .unwrap(); +} + +#[rstest] +#[case(StorageKind::System, StorageKind::Pinned, "sys_pin")] +#[case(StorageKind::Pinned, StorageKind::System, "pin_sys")] +#[case(StorageKind::Pinned, StorageKind::Device(0), "pin_dev")] +#[tokio::test] +async fn test_bounce_with_guards_lw_fc_full( + #[case] host_storage: StorageKind, + #[case] bounce_storage: StorageKind, + #[case] name_suffix: &str, +) { + test_bounce_with_guards_impl( + host_storage, + bounce_storage, + LayoutKind::LW, + LayoutKind::FC, + TransferMode::FullBlocks, + name_suffix, + ) + .await + .unwrap(); +} + +#[rstest] +#[case(StorageKind::System, StorageKind::Pinned, "sys_pin")] +#[case(StorageKind::Pinned, StorageKind::System, "pin_sys")] +#[case(StorageKind::Pinned, StorageKind::Device(0), "pin_dev")] +#[tokio::test] +async fn test_bounce_with_guards_lw_lw_full( + #[case] host_storage: StorageKind, + #[case] bounce_storage: StorageKind, + #[case] 
name_suffix: &str, +) { + test_bounce_with_guards_impl( + host_storage, + bounce_storage, + LayoutKind::LW, + LayoutKind::LW, + TransferMode::FullBlocks, + name_suffix, + ) + .await + .unwrap(); +} + +#[rstest] +#[case(StorageKind::Pinned, StorageKind::Device(0), "pin_dev")] +#[tokio::test] +async fn test_bounce_with_guards_fc_fc_layer0( + #[case] host_storage: StorageKind, + #[case] bounce_storage: StorageKind, + #[case] name_suffix: &str, +) { + test_bounce_with_guards_impl( + host_storage, + bounce_storage, + LayoutKind::FC, + LayoutKind::FC, + TransferMode::FirstLayerOnly, + name_suffix, + ) + .await + .unwrap(); +} + +#[rstest] +#[case(StorageKind::Pinned, StorageKind::Device(0), "pin_dev")] +#[tokio::test] +async fn test_bounce_with_guards_lw_lw_layer0( + #[case] host_storage: StorageKind, + #[case] bounce_storage: StorageKind, + #[case] name_suffix: &str, +) { + test_bounce_with_guards_impl( + host_storage, + bounce_storage, + LayoutKind::LW, + LayoutKind::LW, + TransferMode::FirstLayerOnly, + name_suffix, + ) + .await + .unwrap(); +} + +/// Implementation helper for bounce tests with guard blocks. +async fn test_bounce_with_guards_impl( + host_storage: StorageKind, + bounce_storage: StorageKind, + host_layout: LayoutKind, + bounce_layout: LayoutKind, + mode: TransferMode, + name_suffix: &str, +) -> Result<()> { + let num_blocks = 6; + let test_name = format!( + "bounce_{}_{:?}_{:?}_{}_{}", + name_suffix, + host_layout, + bounce_layout, + mode.suffix(), + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_millis() + ); + let agent = create_test_agent(&test_name); + + // Create layouts + let host = create_layout( + agent.clone(), + LayoutSpec::new(host_layout, host_storage), + num_blocks, + ); + let bounce = create_layout( + agent.clone(), + LayoutSpec::new(bounce_layout, bounce_storage), + num_blocks, + ); + + // Block assignments: + // - Transfer: host[0,1] → bounce[0,1] → host[3,4] + // - Guards: host[2,5] (should remain unchanged) + let src_blocks = vec![0, 1]; + let dst_blocks = vec![3, 4]; + let guard_blocks = vec![2, 5]; + + // Setup: Fill source blocks and guard blocks + let src_checksums = + fill_and_checksum_with_mode(&host, &src_blocks, FillPattern::Sequential, mode)?; + let guard_checksums = create_guard_blocks(&host, &guard_blocks, FillPattern::Constant(0xFF))?; + + let ctx = create_transfer_context(agent, None)?; + + // Execute bounce: host[0,1] → bounce[0,1] + let notification = execute_transfer( + &host, + &bounce, + &src_blocks, + &src_blocks, + TransferOptions::from_layer_range(mode.layer_range()), + ctx.context(), + )?; + notification.await?; + + // Execute bounce: bounce[0,1] → host[3,4] + let notification = execute_transfer( + &bounce, + &host, + &src_blocks, + &dst_blocks, + TransferOptions::from_layer_range(mode.layer_range()), + ctx.context(), + )?; + notification.await?; + + // Verify: Data integrity + guards unchanged + verify_checksums_by_position_with_mode(&src_checksums, &src_blocks, &host, &dst_blocks, mode)?; + verify_guard_blocks_unchanged(&host, &guard_blocks, &guard_checksums)?; + + Ok(()) +} + +// ============================================================================ +// Parameterized Direct Transfer Tests +// ============================================================================ + +/// Test direct transfers with parameterization over storage, layout, and transfer mode. 
+/// +/// This demonstrates the DRY parameterized approach that can replace the 18 individual +/// tests above (System<=>System, Pinned<=>Pinned, cross-type, etc). +/// +/// Note: Only tests System<=>System, Pinned<=>Pinned, and System<=>Pinned since we can only +/// fill/checksum System and Pinned storage. For Device tests, use bounce tests instead. +#[rstest] +// Storage combinations (only fillable storage types) +#[case(StorageKind::System, StorageKind::System, "sys_sys")] +#[case(StorageKind::Pinned, StorageKind::Pinned, "pin_pin")] +#[case(StorageKind::System, StorageKind::Pinned, "sys_pin")] +#[case(StorageKind::Pinned, StorageKind::System, "pin_sys")] +#[tokio::test] +async fn test_direct_transfer_fc_fc_full( + #[case] src_storage: StorageKind, + #[case] dst_storage: StorageKind, + #[case] name_suffix: &str, +) { + test_direct_transfer_impl( + src_storage, + dst_storage, + LayoutKind::FC, + LayoutKind::FC, + TransferMode::FullBlocks, + name_suffix, + ) + .await + .unwrap(); +} + +#[rstest] +#[case(StorageKind::System, StorageKind::Pinned, "sys_pin")] +#[case(StorageKind::Pinned, StorageKind::System, "pin_sys")] +#[tokio::test] +async fn test_direct_transfer_fc_lw_layer0( + #[case] src_storage: StorageKind, + #[case] dst_storage: StorageKind, + #[case] name_suffix: &str, +) { + test_direct_transfer_impl( + src_storage, + dst_storage, + LayoutKind::FC, + LayoutKind::LW, + TransferMode::FirstLayerOnly, + name_suffix, + ) + .await + .unwrap(); +} + +#[rstest] +#[case(StorageKind::Pinned, StorageKind::Pinned, "pin_pin")] +#[tokio::test] +async fn test_direct_transfer_lw_lw_layer1( + #[case] src_storage: StorageKind, + #[case] dst_storage: StorageKind, + #[case] name_suffix: &str, +) { + test_direct_transfer_impl( + src_storage, + dst_storage, + LayoutKind::LW, + LayoutKind::LW, + TransferMode::SecondLayerOnly, + name_suffix, + ) + .await + .unwrap(); +} + +/// Implementation helper for direct transfer tests. 
+async fn test_direct_transfer_impl(
+    src_storage: StorageKind,
+    dst_storage: StorageKind,
+    src_layout: LayoutKind,
+    dst_layout: LayoutKind,
+    mode: TransferMode,
+    name_suffix: &str,
+) -> Result<()> {
+    let num_blocks = 4;
+    let test_name = format!(
+        "direct_{}_{:?}_{:?}_{}_{}",
+        name_suffix,
+        src_layout,
+        dst_layout,
+        mode.suffix(),
+        std::time::SystemTime::now()
+            .duration_since(std::time::UNIX_EPOCH)
+            .unwrap()
+            .as_millis()
+    );
+    let agent = create_test_agent(&test_name);
+
+    // Create layouts
+    let src = create_layout(
+        agent.clone(),
+        LayoutSpec::new(src_layout, src_storage),
+        num_blocks,
+    );
+    let dst = create_layout(
+        agent.clone(),
+        LayoutSpec::new(dst_layout, dst_storage),
+        num_blocks,
+    );
+
+    // Transfer src[0,1] -> dst[2,3]
+    let src_blocks = vec![0, 1];
+    let dst_blocks = vec![2, 3];
+
+    // Fill source and compute checksums
+    let src_checksums =
+        fill_and_checksum_with_mode(&src, &src_blocks, FillPattern::Sequential, mode)?;
+
+    let ctx = create_transfer_context(agent, None)?;
+
+    // Execute transfer
+    let notification = execute_transfer(
+        &src,
+        &dst,
+        &src_blocks,
+        &dst_blocks,
+        TransferOptions::from_layer_range(mode.layer_range()),
+        ctx.context(),
+    )?;
+    notification.await?;
+
+    // Verify data integrity
+    verify_checksums_by_position_with_mode(&src_checksums, &src_blocks, &dst, &dst_blocks, mode)?;
+
+    Ok(())
+}
diff --git a/lib/llm/src/block_manager/v2/physical/transfer/tests/mod.rs b/lib/llm/src/block_manager/v2/physical/transfer/tests/mod.rs
new file mode 100644
index 0000000000..709e8f65f8
--- /dev/null
+++ b/lib/llm/src/block_manager/v2/physical/transfer/tests/mod.rs
@@ -0,0 +1,220 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Comprehensive transfer tests for verifying data integrity across storage types and layout configurations.
+
+#[cfg(all(feature = "testing-cuda", feature = "testing-nixl"))]
+mod local_transfers;
+
+use super::{NixlAgent, PhysicalLayout};
+use crate::block_manager::v2::physical::layout::{
+    LayoutConfig,
+    builder::{HasConfig, NoLayout, NoMemory, PhysicalLayoutBuilder},
+};
+
+/// Standard layout configuration for all tests.
+pub fn standard_config(num_blocks: usize) -> LayoutConfig {
+    LayoutConfig::builder()
+        .num_blocks(num_blocks)
+        .num_layers(2)
+        .outer_dim(2)
+        .page_size(16)
+        .inner_dim(128)
+        .dtype_width_bytes(2)
+        .build()
+        .unwrap()
+}
+
+/// Helper function for creating a PhysicalLayout builder with standard config.
+///
+/// This is used by other test modules (fill, checksum, validation) for backwards compatibility.
+pub fn builder(num_blocks: usize) -> PhysicalLayoutBuilder<HasConfig, NoLayout, NoMemory> {
+    let agent = create_test_agent("test_agent");
+    let config = standard_config(num_blocks);
+    PhysicalLayout::builder(agent).with_config(config)
+}
+
+/// Create a test agent with optimal backends for testing.
+///
+/// Attempts to initialize UCX, GDS, and POSIX backends. Falls back gracefully
+/// if some backends are unavailable (e.g., GDS on non-DGX machines).
+pub fn create_test_agent(name: &str) -> NixlAgent {
+    NixlAgent::require_backends(name, &[]).expect("Failed to require backends")
+}
+
+#[cfg(feature = "testing-cuda")]
+pub(crate) mod cuda {
+    use anyhow::Result;
+    use cudarc::driver::sys::CUdevice_attribute_enum;
+    use cudarc::driver::{CudaContext, CudaStream, LaunchConfig, PushKernelArg};
+    use cudarc::nvrtc::{CompileOptions, compile_ptx_with_opts};
+    use std::collections::HashMap;
+    use std::sync::{Arc, OnceLock};
+    use std::time::{Duration, Instant};
+
+    /// CUDA sleep kernel source code.
+    pub const SLEEP_KERNEL_SRC: &str = r#"
+    extern "C" __global__ void sleep_kernel(unsigned long long min_cycles) {
+        const unsigned long long start = clock64();
+        while ((clock64() - start) < min_cycles) {
+            asm volatile("");
+        }
+    }
+    "#;
+
+    /// A reusable CUDA sleep utility for tests.
+    ///
+    /// This struct provides a simple interface to execute GPU sleep operations
+    /// with calibrated timing. It compiles the sleep kernel once per CUDA context
+    /// and caches the calibration for reuse.
+    ///
+    /// The calibration is conservative (prefers longer sleep durations over shorter)
+    /// to ensure minimum sleep times are met.
+    pub struct CudaSleep {
+        function: cudarc::driver::CudaFunction,
+        cycles_per_ms: f64,
+    }
+
+    impl CudaSleep {
+        /// Get or create a CudaSleep instance for the given CUDA context.
+        ///
+        /// This function uses lazy initialization and caches instances per device ID.
+        /// The first call for each device will compile the kernel and run calibration.
+        ///
+        /// # Arguments
+        /// * `cuda_ctx` - The CUDA context to use
+        ///
+        /// # Returns
+        /// A shared reference to the CudaSleep instance for this context's device.
+        pub fn for_context(cuda_ctx: &Arc<CudaContext>) -> Result<Arc<Self>> {
+            static INSTANCES: OnceLock<parking_lot::Mutex<HashMap<usize, Arc<CudaSleep>>>> =
+                OnceLock::new();
+
+            let instances = INSTANCES.get_or_init(|| parking_lot::Mutex::new(HashMap::new()));
+            let device_ordinal = cuda_ctx.ordinal();
+
+            // Fast path: check if instance already exists
+            {
+                let instances_guard = instances.lock();
+                if let Some(instance) = instances_guard.get(&device_ordinal) {
+                    return Ok(Arc::clone(instance));
+                }
+            }
+
+            // Slow path: create new instance with calibration
+            let instance = Arc::new(Self::new(cuda_ctx)?);
+
+            // Store in cache
+            let mut instances_guard = instances.lock();
+            instances_guard
+                .entry(device_ordinal)
+                .or_insert_with(|| Arc::clone(&instance));
+
+            Ok(instance)
+        }
+
+        /// Create a new CudaSleep instance with calibration.
+        ///
+        /// This compiles the sleep kernel and runs a calibration loop to determine
+        /// the relationship between clock cycles and wall-clock time.
+        fn new(cuda_ctx: &Arc<CudaContext>) -> Result<Self> {
+            // Get device compute capability
+            let major = cuda_ctx
+                .attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR)?;
+            let minor = cuda_ctx
+                .attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)?;
+
+            // Compile PTX for this device
+            let mut compile_opts = CompileOptions {
+                name: Some("sleep_kernel.cu".into()),
+                ..Default::default()
+            };
+            compile_opts
+                .options
+                .push(format!("--gpu-architecture=compute_{}{}", major, minor));
+            let ptx = compile_ptx_with_opts(SLEEP_KERNEL_SRC, compile_opts)?;
+            let module = cuda_ctx.load_module(ptx)?;
+            let function = module.load_function("sleep_kernel")?;
+
+            // Get device clock rate
+            let clock_rate_khz =
+                cuda_ctx.attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_CLOCK_RATE)? as u64;
+
+            // Create a temporary stream for calibration
+            let stream = cuda_ctx.new_stream()?;
+
+            // Warm up to absorb JIT overhead
+            let warm_cycles = clock_rate_khz.saturating_mul(10).max(1);
+            Self::launch_kernel(&function, &stream, warm_cycles)?;
+            stream.synchronize()?;
+
+            // Run calibration loop
+            let desired_delay = Duration::from_millis(600);
+            let mut target_cycles = clock_rate_khz.saturating_mul(50).max(1); // ~50ms starting point
+            let mut actual_duration = Duration::ZERO;
+
+            for _ in 0..8 {
+                let start = Instant::now();
+                Self::launch_kernel(&function, &stream, target_cycles)?;
+                stream.synchronize()?;
+                actual_duration = start.elapsed();
+
+                if actual_duration >= desired_delay {
+                    break;
+                }
+
+                target_cycles = target_cycles.saturating_mul(2);
+            }
+
+            // Calculate cycles per millisecond with conservative 20% margin
+            // (prefer longer sleeps over shorter)
+            let cycles_per_ms = if actual_duration.as_millis() > 0 {
+                (target_cycles as f64 / actual_duration.as_millis() as f64) * 1.2
+            } else {
+                clock_rate_khz as f64 // Fallback to clock rate
+            };
+
+            Ok(Self {
+                function,
+                cycles_per_ms,
+            })
+        }
+
+        /// Launch the sleep kernel with the specified number of cycles.
+        fn launch_kernel(
+            function: &cudarc::driver::CudaFunction,
+            stream: &Arc<CudaStream>,
+            cycles: u64,
+        ) -> Result<()> {
+            let launch_cfg = LaunchConfig {
+                grid_dim: (1, 1, 1),
+                block_dim: (1, 1, 1),
+                shared_mem_bytes: 0,
+            };
+
+            let mut launch = stream.launch_builder(function);
+            unsafe {
+                launch.arg(&cycles);
+                launch.launch(launch_cfg)?;
+            }
+
+            Ok(())
+        }
+
+        /// Launch a sleep operation on the given stream.
+        ///
+        /// This queues a GPU kernel that will sleep for approximately the specified
+        /// duration. The sleep is conservative and may take longer than requested.
+        ///
+        /// # Arguments
+        /// * `duration` - The minimum duration to sleep
+        /// * `stream` - The CUDA stream to launch the kernel on
+        ///
+        /// # Returns
+        /// Ok(()) if the kernel was successfully queued
+        pub fn launch(&self, duration: Duration, stream: &Arc<CudaStream>) -> Result<()> {
+            let target_cycles = (duration.as_millis() as f64 * self.cycles_per_ms) as u64;
+            Self::launch_kernel(&self.function, stream, target_cycles)
+        }
+    }
+}
diff --git a/lib/llm/src/block_manager/v2/physical/transfer/validation.rs b/lib/llm/src/block_manager/v2/physical/transfer/validation.rs
new file mode 100644
index 0000000000..dc460222ab
--- /dev/null
+++ b/lib/llm/src/block_manager/v2/physical/transfer/validation.rs
@@ -0,0 +1,463 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Block ID validation for transfers.
+//!
+//! This module provides validation functions to ensure block transfers are safe and correct.
+
+use super::PhysicalLayout;
+use std::collections::HashSet;
+use thiserror::Error;
+
+/// Validation errors for block transfers.
+#[derive(Debug, Error, PartialEq)]
+pub enum BlockValidationError {
+    /// Destination block IDs contain duplicates.
+    #[error("Destination block IDs are not unique: duplicates = {duplicates:?}")]
+    DuplicateDestinationBlocks { duplicates: Vec<usize> },
+
+    /// Source and destination blocks overlap when using the same layout.
+    #[error("Source and destination blocks overlap (same layout): overlapping = {overlapping:?}")]
+    OverlappingBlocks { overlapping: Vec<usize> },
+
+    /// Lists have mismatched lengths.
+    #[error(
+        "Block ID lists have mismatched lengths: src={src_len}, dst={dst_len}, bounce={bounce_len:?}"
+    )]
+    LengthMismatch {
+        src_len: usize,
+        dst_len: usize,
+        bounce_len: Option<usize>,
+    },
+
+    /// Block ID is out of range for the layout.
+    #[error("Block ID {block_id} out of range for {layout_name} (max={max})")]
+    BlockOutOfRange {
+        block_id: usize,
+        layout_name: &'static str,
+        max: usize,
+    },
+
+    /// Bounce block IDs contain duplicates.
+    #[error("Bounce block IDs are not unique: duplicates = {duplicates:?}")]
+    DuplicateBounceBlocks { duplicates: Vec<usize> },
+}
+
+/// Validate that destination block IDs are unique (no duplicates).
+///
+/// # Arguments
+/// * `dst_block_ids` - Destination block IDs
+///
+/// # Returns
+/// Ok(()) if unique, Err with duplicate IDs otherwise
+pub fn validate_dst_unique(dst_block_ids: &[usize]) -> Result<(), BlockValidationError> {
+    let mut seen = HashSet::new();
+    let mut duplicates = Vec::new();
+
+    for &id in dst_block_ids {
+        if !seen.insert(id) && !duplicates.contains(&id) {
+            duplicates.push(id);
+        }
+    }
+
+    if duplicates.is_empty() {
+        Ok(())
+    } else {
+        Err(BlockValidationError::DuplicateDestinationBlocks { duplicates })
+    }
+}
+
+/// Validate that bounce block IDs are unique (no duplicates).
+pub fn validate_bounce_unique(bounce_block_ids: &[usize]) -> Result<(), BlockValidationError> {
+    let mut seen = HashSet::new();
+    let mut duplicates = Vec::new();
+
+    for &id in bounce_block_ids {
+        if !seen.insert(id) && !duplicates.contains(&id) {
+            duplicates.push(id);
+        }
+    }
+
+    if duplicates.is_empty() {
+        Ok(())
+    } else {
+        Err(BlockValidationError::DuplicateBounceBlocks { duplicates })
+    }
+}
+
+/// Check if two layouts are the same by comparing their Arc pointers.
+///
+/// This is a conservative check - if pointers differ, layouts might still be the same
+/// but we treat them as different to avoid false positives in disjoint validation.
+fn are_same_layout(layout1: &PhysicalLayout, layout2: &PhysicalLayout) -> bool {
+    // Compare Arc pointer addresses
+    std::ptr::eq(
+        std::sync::Arc::as_ptr(layout1.layout()),
+        std::sync::Arc::as_ptr(layout2.layout()),
+    )
+}
+
+/// Validate that src and dst block IDs are disjoint when using the same layout.
+///
+/// Only enforced in debug mode when src and dst point to the same layout.
+///
+/// # Arguments
+/// * `src_block_ids` - Source block IDs
+/// * `dst_block_ids` - Destination block IDs
+/// * `src_layout` - Source physical layout
+/// * `dst_layout` - Destination physical layout
+#[cfg(debug_assertions)]
+pub fn validate_disjoint_same_layout(
+    src_block_ids: &[usize],
+    dst_block_ids: &[usize],
+    src_layout: &PhysicalLayout,
+    dst_layout: &PhysicalLayout,
+) -> Result<(), BlockValidationError> {
+    // Only check if same layout
+    if !are_same_layout(src_layout, dst_layout) {
+        return Ok(());
+    }
+
+    let src_set: HashSet<_> = src_block_ids.iter().copied().collect();
+    let overlapping: Vec<_> = dst_block_ids
+        .iter()
+        .filter(|id| src_set.contains(id))
+        .copied()
+        .collect();
+
+    if overlapping.is_empty() {
+        Ok(())
+    } else {
+        Err(BlockValidationError::OverlappingBlocks { overlapping })
+    }
+}
+
+/// Validate block IDs are in range for a layout.
+#[cfg(debug_assertions)] +pub fn validate_block_ids_in_range( + block_ids: &[usize], + layout: &PhysicalLayout, + layout_name: &'static str, +) -> Result<(), BlockValidationError> { + let max_blocks = layout.layout().config().num_blocks; + + for &block_id in block_ids { + if block_id >= max_blocks { + return Err(BlockValidationError::BlockOutOfRange { + block_id, + layout_name, + max: max_blocks, + }); + } + } + + Ok(()) +} + +/// Full validation for block transfer (debug mode). +/// +/// Validates: +/// - List lengths match +/// - Destination IDs are unique +/// - Bounce IDs are unique (if provided) +/// - Source and destination are disjoint (if same layout) +/// - All block IDs are in range for their respective layouts +#[cfg(debug_assertions)] +pub fn validate_block_transfer( + src_block_ids: &[usize], + dst_block_ids: &[usize], + bounce_block_ids: Option<&[usize]>, + src_layout: &PhysicalLayout, + dst_layout: &PhysicalLayout, + bounce_layout: Option<&PhysicalLayout>, +) -> Result<(), BlockValidationError> { + // Validate lengths + if src_block_ids.len() != dst_block_ids.len() { + return Err(BlockValidationError::LengthMismatch { + src_len: src_block_ids.len(), + dst_len: dst_block_ids.len(), + bounce_len: bounce_block_ids.map(|ids| ids.len()), + }); + } + + if let Some(bounce_ids) = bounce_block_ids + && bounce_ids.len() != src_block_ids.len() + { + return Err(BlockValidationError::LengthMismatch { + src_len: src_block_ids.len(), + dst_len: dst_block_ids.len(), + bounce_len: Some(bounce_ids.len()), + }); + } + + #[cfg(debug_assertions)] + { + // Validate destination uniqueness + validate_dst_unique(dst_block_ids)?; + + // Validate bounce uniqueness if provided + if let Some(bounce_ids) = bounce_block_ids { + validate_bounce_unique(bounce_ids)?; + } + + // Validate disjoint if same layout + validate_disjoint_same_layout(src_block_ids, dst_block_ids, src_layout, dst_layout)?; + + // Validate block IDs in range + validate_block_ids_in_range(src_block_ids, src_layout, "source")?; + validate_block_ids_in_range(dst_block_ids, dst_layout, "destination")?; + if let (Some(bounce_ids), Some(bounce_layout)) = (bounce_block_ids, bounce_layout) { + validate_block_ids_in_range(bounce_ids, bounce_layout, "bounce")?; + } + } + + Ok(()) +} + +/// Minimal validation for block transfer (release mode). 
+/// +/// Only validates: +/// - List lengths match +/// - Destination IDs are unique +#[cfg(not(debug_assertions))] +pub fn validate_block_transfer( + src_block_ids: &[usize], + dst_block_ids: &[usize], + bounce_block_ids: Option<&[usize]>, + _src_layout: &PhysicalLayout, + _dst_layout: &PhysicalLayout, + _bounce_layout: Option<&PhysicalLayout>, +) -> Result<(), BlockValidationError> { + // Validate lengths + if src_block_ids.len() != dst_block_ids.len() { + return Err(BlockValidationError::LengthMismatch { + src_len: src_block_ids.len(), + dst_len: dst_block_ids.len(), + bounce_len: bounce_block_ids.map(|ids| ids.len()), + }); + } + + if let Some(bounce_ids) = bounce_block_ids { + if bounce_ids.len() != src_block_ids.len() { + return Err(BlockValidationError::LengthMismatch { + src_len: src_block_ids.len(), + dst_len: dst_block_ids.len(), + bounce_len: Some(bounce_ids.len()), + }); + } + } + + // Validate destination uniqueness + validate_dst_unique(dst_block_ids)?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::super::tests::*; + use super::*; + + #[test] + fn test_dst_unique_valid() { + let ids = vec![0, 1, 2, 3, 4]; + assert!(validate_dst_unique(&ids).is_ok()); + } + + #[test] + fn test_dst_unique_duplicate() { + let ids = vec![0, 1, 2, 1, 3]; + let result = validate_dst_unique(&ids); + assert!(result.is_err()); + match result.unwrap_err() { + BlockValidationError::DuplicateDestinationBlocks { duplicates } => { + assert_eq!(duplicates, vec![1]); + } + _ => panic!("Wrong error type"), + } + } + + #[test] + fn test_dst_unique_multiple_duplicates() { + let ids = vec![0, 1, 2, 1, 3, 2]; + let result = validate_dst_unique(&ids); + assert!(result.is_err()); + match result.unwrap_err() { + BlockValidationError::DuplicateDestinationBlocks { duplicates } => { + assert!(duplicates.contains(&1)); + assert!(duplicates.contains(&2)); + } + _ => panic!("Wrong error type"), + } + } + + #[test] + #[cfg(debug_assertions)] + fn test_disjoint_same_layout_valid() { + let physical = builder(2) + .fully_contiguous() + .allocate_system() + .build() + .unwrap(); + + let src_ids = vec![0, 1, 2]; + let dst_ids = vec![5, 6, 7]; + + assert!(validate_disjoint_same_layout(&src_ids, &dst_ids, &physical, &physical).is_ok()); + } + + #[test] + #[cfg(debug_assertions)] + fn test_disjoint_same_layout_overlap() { + let physical = builder(2) + .fully_contiguous() + .allocate_system() + .build() + .unwrap(); + + let src_ids = vec![0, 1, 2]; + let dst_ids = vec![2, 3, 4]; // 2 overlaps + + let result = validate_disjoint_same_layout(&src_ids, &dst_ids, &physical, &physical); + assert!(result.is_err()); + match result.unwrap_err() { + BlockValidationError::OverlappingBlocks { overlapping } => { + assert_eq!(overlapping, vec![2]); + } + _ => panic!("Wrong error type"), + } + } + + #[test] + fn test_disjoint_different_layouts_ok() { + let physical1 = builder(2) + .fully_contiguous() + .allocate_system() + .build() + .unwrap(); + + let physical2 = builder(2) + .fully_contiguous() + .allocate_system() + .build() + .unwrap(); + + let src_ids = vec![0, 1, 2]; + let dst_ids = vec![0, 1, 2]; // Same IDs but different layouts + + // Should be OK since different layouts + #[cfg(debug_assertions)] + assert!(validate_disjoint_same_layout(&src_ids, &dst_ids, &physical1, &physical2).is_ok()); + } + + #[test] + fn test_length_mismatch() { + let physical1 = builder(2) + .fully_contiguous() + .allocate_system() + .build() + .unwrap(); + + let physical2 = builder(2) + .fully_contiguous() + .allocate_system() + .build() + .unwrap(); + 
+
+        let src_ids = vec![0, 1, 2];
+        let dst_ids = vec![5, 6]; // Different length
+
+        let result =
+            validate_block_transfer(&src_ids, &dst_ids, None, &physical1, &physical2, None);
+        assert!(result.is_err());
+        match result.unwrap_err() {
+            BlockValidationError::LengthMismatch {
+                src_len,
+                dst_len,
+                bounce_len,
+            } => {
+                assert_eq!(src_len, 3);
+                assert_eq!(dst_len, 2);
+                assert_eq!(bounce_len, None);
+            }
+            _ => panic!("Wrong error type"),
+        }
+    }
+
+    // #[test]
+    // #[cfg(debug_assertions)]
+    // fn test_block_out_of_range() {
+    //     let (_layout, physical) = create_test_layout(5); // Only 5 blocks
+    //     let src_ids = vec![0, 1, 2];
+    //     let dst_ids = vec![3, 4, 10]; // 10 is out of range
+
+    //     let result = validate_block_ids_in_range(&dst_ids, &physical, "destination");
+    //     assert!(result.is_err());
+    //     match result.unwrap_err() {
+    //         BlockValidationError::BlockOutOfRange {
+    //             block_id,
+    //             layout_name,
+    //             max,
+    //         } => {
+    //             assert_eq!(block_id, 10);
+    //             assert_eq!(layout_name, "destination");
+    //             assert_eq!(max, 5);
+    //         }
+    //         _ => panic!("Wrong error type"),
+    //     }
+    // }
+
+    // #[test]
+    // fn test_bounce_length_mismatch() {
+    //     let (_layout1, physical1) = create_test_layout(10);
+    //     let (_layout2, physical2) = create_test_layout(10);
+    //     let (_layout3, physical3) = create_test_layout(10);
+    //     let src_ids = vec![0, 1, 2];
+    //     let dst_ids = vec![5, 6, 7];
+    //     let bounce_ids = vec![8, 9]; // Wrong length
+
+    //     let result = validate_block_transfer(
+    //         &src_ids,
+    //         &dst_ids,
+    //         Some(&bounce_ids),
+    //         &physical1,
+    //         &physical2,
+    //         Some(&physical3),
+    //     );
+    //     assert!(result.is_err());
+    //     match result.unwrap_err() {
+    //         BlockValidationError::LengthMismatch {
+    //             src_len,
+    //             dst_len,
+    //             bounce_len,
+    //         } => {
+    //             assert_eq!(src_len, 3);
+    //             assert_eq!(dst_len, 3);
+    //             assert_eq!(bounce_len, Some(2));
+    //         }
+    //         _ => panic!("Wrong error type"),
+    //     }
+    // }
+
+    // #[test]
+    // fn test_full_validation_success() {
+    //     let (_layout1, physical1) = create_test_layout(10);
+    //     let (_layout2, physical2) = create_test_layout(10);
+    //     let (_layout3, physical3) = create_test_layout(10);
+    //     let src_ids = vec![0, 1, 2];
+    //     let dst_ids = vec![5, 6, 7];
+    //     let bounce_ids = vec![8, 9, 3];
+
+    //     assert!(
+    //         validate_block_transfer(
+    //             &src_ids,
+    //             &dst_ids,
+    //             Some(&bounce_ids),
+    //             &physical1,
+    //             &physical2,
+    //             Some(&physical3),
+    //         )
+    //         .is_ok()
+    //     );
+    // }
+}
diff --git a/lib/runtime/src/config.rs b/lib/runtime/src/config.rs
index fbfc457fe7..a30e72991b 100644
--- a/lib/runtime/src/config.rs
+++ b/lib/runtime/src/config.rs
@@ -397,6 +397,19 @@ pub fn is_truthy(val: &str) -> bool {
     matches!(val.to_lowercase().as_str(), "1" | "true" | "on" | "yes")
 }
 
+pub fn parse_bool(val: &str) -> anyhow::Result<bool> {
+    if is_truthy(val) {
+        Ok(true)
+    } else if is_falsey(val) {
+        Ok(false)
+    } else {
+        anyhow::bail!(
+            "Invalid boolean value: '{}'. Expected one of: true/false, 1/0, on/off, yes/no",
+            val
+        )
+    }
+}
+
 /// Check if a string is falsey
 /// This will be used to evaluate environment variables or any other subjective
 /// configuration parameters that can be set by the user that should be evaluated
diff --git a/lib/runtime/src/lib.rs b/lib/runtime/src/lib.rs
index d44da24c80..86c3ced1fa 100644
--- a/lib/runtime/src/lib.rs
+++ b/lib/runtime/src/lib.rs
@@ -17,7 +17,7 @@ pub use anyhow::{
 
 use async_once_cell::OnceCell;
 
-mod config;
+pub mod config;
 pub use config::RuntimeConfig;
 
 pub mod component;
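The parameterized tests in local_transfers.rs above all follow one shape: rstest expands each #[case] row into its own async test, and a thin wrapper forwards the case values to a shared *_impl helper. A minimal, self-contained sketch of that pattern follows; the helper, the string labels, and the asserts are hypothetical stand-ins for the crate's StorageKind/LayoutKind machinery, not part of the patch.

use rstest::rstest;

// Shared implementation: the real tests build layouts and execute transfers
// here; this stub only demonstrates the call shape.
async fn transfer_roundtrip_impl(src: &str, dst: &str) -> anyhow::Result<()> {
    assert!(!src.is_empty(), "source label must be non-empty");
    assert!(!dst.is_empty(), "destination label must be non-empty");
    Ok(())
}

// Each #[case] becomes its own async test, mirroring the attribute order used
// in the patch (#[rstest], then the cases, then #[tokio::test]).
#[rstest]
#[case("sys", "pin")]
#[case("pin", "sys")]
#[tokio::test]
async fn transfer_roundtrip(#[case] src: &str, #[case] dst: &str) {
    transfer_roundtrip_impl(src, dst).await.unwrap();
}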
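CudaSleep's calibration loop in tests/mod.rs doubles the kernel workload until a single launch takes at least the desired delay, then derives cycles-per-millisecond with a 20% margin so later sleeps err on the long side. The host-only sketch below reproduces that strategy with std::thread::sleep standing in for the GPU kernel; the simulated clock rate and delays are made up for illustration.

use std::time::{Duration, Instant};

// Double the workload until one run meets the target delay, then estimate the
// rate with a 20% safety margin (prefer sleeping too long over too short).
fn calibrate(desired_delay: Duration, mut target_cycles: u64, true_cycles_per_ms: u64) -> f64 {
    let mut actual = Duration::ZERO;
    for _ in 0..8 {
        let start = Instant::now();
        // Stand-in workload: pretend each "cycle" costs 1 / true_cycles_per_ms ms.
        std::thread::sleep(Duration::from_millis(target_cycles / true_cycles_per_ms));
        actual = start.elapsed();
        if actual >= desired_delay {
            break;
        }
        target_cycles = target_cycles.saturating_mul(2);
    }
    if actual.as_millis() > 0 {
        (target_cycles as f64 / actual.as_millis() as f64) * 1.2
    } else {
        true_cycles_per_ms as f64 // fallback, as in the patch
    }
}

fn main() {
    let cycles_per_ms = calibrate(Duration::from_millis(60), 1_000, 1_000);
    // The estimate should land near the true rate, padded by the 20% margin.
    println!("estimated cycles/ms: {cycles_per_ms:.0}");
}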
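validate_dst_unique and validate_bounce_unique in validation.rs share one duplicate-detection idea: a single pass that records each ID in a HashSet and collects every value seen more than once, reporting each duplicate only one time. A standalone sketch of just that pattern, with a hypothetical find_duplicates helper:

use std::collections::HashSet;

// `insert` returns false when the value was already present, which flags a
// duplicate; the `contains` check keeps each duplicate from being listed twice.
fn find_duplicates(ids: &[usize]) -> Vec<usize> {
    let mut seen = HashSet::new();
    let mut duplicates = Vec::new();
    for &id in ids {
        if !seen.insert(id) && !duplicates.contains(&id) {
            duplicates.push(id);
        }
    }
    duplicates
}

fn main() {
    assert!(find_duplicates(&[0, 1, 2, 3]).is_empty());
    assert_eq!(find_duplicates(&[0, 1, 2, 1, 3, 2]), vec![1, 2]);
    println!("duplicate detection sketch ok");
}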
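are_same_layout compares Arc allocation identity rather than structural equality: two handles count as "the same layout" only if they point at the same allocation, which is the conservative choice for deciding when the disjointness check must run. A small sketch of that check in isolation (same_allocation is a hypothetical name):

use std::sync::Arc;

// Identity, not equality: structurally equal but separately allocated values
// compare as different, so the overlap check is skipped only for true aliases.
fn same_allocation<T>(a: &Arc<T>, b: &Arc<T>) -> bool {
    std::ptr::eq(Arc::as_ptr(a), Arc::as_ptr(b))
}

fn main() {
    let first = Arc::new(vec![0u8; 16]);
    let alias = Arc::clone(&first);
    let second = Arc::new(vec![0u8; 16]);

    assert!(same_allocation(&first, &alias)); // same allocation -> same layout
    assert!(!same_allocation(&first, &second)); // equal contents, different allocation
    println!("arc identity sketch ok");
}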
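The new parse_bool in lib/runtime/src/config.rs turns the existing is_truthy/is_falsey helpers into a strict parser that rejects unrecognized values instead of silently treating them as false, and lib.rs now exposes the config module publicly so callers outside the crate can reach it. A usage sketch; the DYN_SOME_FLAG variable name, the read_flag_from_env helper, and the dynamo_runtime crate path are assumptions for illustration only.

// Read an optional boolean flag from the environment, failing loudly on typos
// like "treu" instead of defaulting them to false.
fn read_flag_from_env() -> anyhow::Result<bool> {
    match std::env::var("DYN_SOME_FLAG") {
        // Strict parsing: only 1/0, true/false, on/off, yes/no are accepted.
        Ok(raw) => dynamo_runtime::config::parse_bool(&raw),
        // Unset variable: fall back to a default rather than erroring.
        Err(std::env::VarError::NotPresent) => Ok(false),
        Err(e) => Err(e.into()),
    }
}

fn main() -> anyhow::Result<()> {
    println!("flag enabled: {}", read_flag_from_env()?);
    Ok(())
}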