From 426ed44d3efc5a9b1ccc814a1459103aa2ae84ac Mon Sep 17 00:00:00 2001
From: AmeenP <ameenp360@gmail.com>
Date: Wed, 13 May 2026 15:26:02 -0700
Subject: [PATCH 1/4] feat(llm): return chat completion token ids via nvext

Signed-off-by: AmeenP <ameenp360@gmail.com>
---
 docs/components/frontend/nvext.md             |   5 +-
 .../openai/chat_completions/delta.rs          | 122 +++++++++++++++---
 lib/llm/src/protocols/openai/nvext.rs         |  30 ++++-
 3 files changed, 134 insertions(+), 23 deletions(-)

diff --git a/docs/components/frontend/nvext.md b/docs/components/frontend/nvext.md
index 65144aad632c..25a1b182c66f 100644
--- a/docs/components/frontend/nvext.md
+++ b/docs/components/frontend/nvext.md
@@ -33,9 +33,9 @@ Include `nvext` as a top-level field alongside standard OpenAI-compatible fields
 | `use_raw_prompt` | `bool` | `None` | Preprocessor | Bypasses the prompt template and passes the prompt directly to the tokenizer. |
 | `annotations` | `string[]` | `None` | Preprocessor | Triggers out-of-band information in the SSE stream via the `event:` field. |
 | `backend_instance_id` | `u64` | `None` | Router | Routes the request to a specific backend instance. |
-| `token_data` | `u32[]` | `None` | Preprocessor | Pre-tokenized prompt tokens. When provided with `backend_instance_id`, tokenization is skipped. |
+| `token_data` | `u32[]` | `None` | Preprocessor | Pre-tokenized prompt tokens. When provided, tokenization is skipped. `backend_instance_id` remains an independent routing hint. |
 | `max_thinking_tokens` | `u32` | `None` | Backend | Maximum thinking tokens allowed (passed through to backends). |
-| `extra_fields` | `string[]` | `None` | Response builder | Fields to include in the response `nvext`. Supported: `"worker_id"`, `"timing"`, `"routed_experts"`, `"engine_data"`, `"stop_reason"`. |
+| `extra_fields` | `string[]` | `None` | Response builder | Fields to include in the response `nvext`. Supported: `"worker_id"`, `"timing"`, `"routed_experts"`, `"engine_data"`, `"stop_reason"`, `"completion_token_ids"`. |
 | `prefill_worker_id` | `u64` | `None` | Router | Routes the request to a specific prefill worker (disaggregated serving). |
 | `decode_worker_id` | `u64` | `None` | Router | Routes the request to a specific decode worker (disaggregated serving). |
 | `agent_context` | object | `None` | Preprocessor | Passive session and trajectory identity for agent traces. See [Agent Context](#agent-context) below and [Agent Tracing](../../agents/agent-tracing.md). |
@@ -208,6 +208,7 @@ When the client requests response metadata via `extra_fields`, the response incl
 | `routed_experts` | `extra_fields: ["routed_experts"]` | Routed expert capture payload returned by SGLang-backed requests. |
 | `engine_data` | `extra_fields: ["engine_data"]` | Opaque backend-provided engine metadata. |
 | `stop_reason` | `extra_fields: ["stop_reason"]` | Backend-specific matched stop condition, returned under `nvext` because it is not part of the OpenAI completions schema. Dynamo currently serves this as a response-level field for single-choice requests; supporting `n > 1` will require an indexed per-choice shape. |
+| `completion_token_ids` | `extra_fields: ["completion_token_ids"]` | Generated token IDs accumulated across the chat-completions response and emitted on the final chunk. |
 | `token_ids` | Automatic (GAIE Stage 1) | Tokenized prompt for reuse in Stage 2 query-only mode. |
 
 ### Example response `nvext`
diff --git a/lib/llm/src/protocols/openai/chat_completions/delta.rs b/lib/llm/src/protocols/openai/chat_completions/delta.rs
index 5d3cfc75cb50..0edad90fdd62 100644
--- a/lib/llm/src/protocols/openai/chat_completions/delta.rs
+++ b/lib/llm/src/protocols/openai/chat_completions/delta.rs
@@ -10,7 +10,7 @@ use crate::{
         openai::{
             convert_backend_top_logprobs,
             delta_common::{self, DeltaGeneratorOptions},
-            nvext::NvExtProvider,
+            nvext::{NvExtProvider, NvExtResponse},
             token_to_utf8_bytes,
         },
     },
@@ -59,6 +59,9 @@ pub struct DeltaGenerator {
     options: DeltaGeneratorOptions,
     /// Request tracker for per-request metrics (shared with PreprocessedRequest).
     tracker: Arc<RequestTracker>,
+    /// Accumulated output token IDs across chunks, emitted on the final chunk
+    /// when `nvext.extra_fields` includes `completion_token_ids`.
+    accumulated_completion_token_ids: Vec<TokenIdType>,
 }
 
 impl DeltaGenerator {
@@ -75,6 +78,7 @@ impl DeltaGenerator {
             msg_counter: 0,
             options,
             tracker,
+            accumulated_completion_token_ids: Vec::new(),
         }
     }
 
@@ -257,6 +261,11 @@ impl crate::protocols::openai::DeltaGeneratorExt<NvCreateChatCompletionStreamRes
 
         self.usage.completion_tokens += token_length;
 
+        if self.options.response_fields.completion_token_ids && !delta.token_ids.is_empty() {
+            self.accumulated_completion_token_ids
+                .extend_from_slice(&delta.token_ids);
+        }
+
         // If backend provides completion_usage, use it to update usage stats
         // This is critical for prompt embeddings where prompt_tokens comes from
         // the embedding sequence length computed by the worker
@@ -311,27 +320,58 @@ impl crate::protocols::openai::DeltaGeneratorExt<NvCreateChatCompletionStreamRes
         // `NvExtResponseFieldSelection` (see `nvext.rs`). Both chat and
         // completions delta generators go through the same helper so the gating
         // rules stay in one place.
-        if let Some(nvext_response) = self.options.response_fields.build_response_nvext(
-            Some(&self.tracker),
-            delta.disaggregated_params.as_ref(),
-            finish_reason.is_some(),
-            delta.engine_data,
-            stop_reason,
-        ) && let Ok(nvext_json) = serde_json::to_value(&nvext_response)
+        if let Some(mut nvext_response) = self
+            .options
+            .response_fields
+            .build_response_nvext(
+                Some(&self.tracker),
+                delta.disaggregated_params.as_ref(),
+                finish_reason.is_some(),
+                delta.engine_data,
+                stop_reason,
+            )
+            .or_else(|| {
+                if self.options.response_fields.completion_token_ids && finish_reason.is_some() {
+                    Some(NvExtResponse {
+                        worker_id: None,
+                        timing: None,
+                        token_ids: None,
+                        routed_experts: None,
+                        engine_data: None,
+                        stop_reason: None,
+                        completion_token_ids: None,
+                    })
+                } else {
+                    None
+                }
+            })
         {
-            stream_response.nvext = Some(nvext_json);
-            if let Some(ref info) = nvext_response.worker_id {
-                tracing::debug!(
-                    "Injected worker_id into chat completion nvext: prefill={:?}, decode={:?}",
-                    info.prefill_worker_id,
-                    info.decode_worker_id
-                );
+            if self.options.response_fields.completion_token_ids && finish_reason.is_some() {
+                nvext_response.completion_token_ids =
+                    Some(self.accumulated_completion_token_ids.clone());
             }
-            if let Some(ref tokens) = nvext_response.token_ids {
-                tracing::debug!(
-                    "Injected token_ids into chat completion nvext: {} tokens",
-                    tokens.len()
-                );
+
+            if let Ok(nvext_json) = serde_json::to_value(&nvext_response) {
+                stream_response.nvext = Some(nvext_json);
+                if let Some(ref info) = nvext_response.worker_id {
+                    tracing::debug!(
+                        "Injected worker_id into chat completion nvext: prefill={:?}, decode={:?}",
+                        info.prefill_worker_id,
+                        info.decode_worker_id
+                    );
+                }
+                if let Some(ref tokens) = nvext_response.token_ids {
+                    tracing::debug!(
+                        "Injected token_ids into chat completion nvext: {} tokens",
+                        tokens.len()
+                    );
+                }
+                if let Some(ref tokens) = nvext_response.completion_token_ids {
+                    tracing::debug!(
+                        "Injected completion_token_ids into chat completion nvext: {} tokens",
+                        tokens.len()
+                    );
+                }
             }
         }
 
@@ -615,6 +655,48 @@ mod tests {
         assert!(nvext_json.get("routed_experts").is_none());
     }
 
+    #[test]
+    fn test_completion_token_ids_extra_field_emits_accumulated_ids_on_final_chunk() {
+        let request =
+            create_test_request_with_extra_fields(vec!["completion_token_ids".to_string()]);
+        let mut generator = request.response_generator("req-completion-ids".to_string());
+
+        let mut first_output = final_backend_output();
+        first_output.token_ids = vec![7];
+        first_output.tokens = vec![Some("A".to_string())];
+        first_output.text = Some("A".to_string());
+        first_output.finish_reason = None;
+        first_output.disaggregated_params = None;
+
+        let first_response = generator
+            .choice_from_postprocessor(first_output)
+            .expect("first choice generation");
+        assert!(
+            first_response.nvext.is_none(),
+            "completion_token_ids should be emitted only on the final chunk"
+        );
+
+        let mut final_output = final_backend_output();
+        final_output.token_ids = vec![8, 9];
+        final_output.tokens = vec![Some("B".to_string()), Some("C".to_string())];
+        final_output.text = Some("BC".to_string());
+        final_output.disaggregated_params = None;
+
+        let final_response = generator
+            .choice_from_postprocessor(final_output)
+            .expect("final choice generation");
+
+        let nvext_json = final_response
+            .nvext
+            .expect("nvext present for completion_token_ids request");
+        assert_eq!(
+            nvext_json.get("completion_token_ids"),
+            Some(&serde_json::json!([7, 8, 9]))
+        );
+        assert!(nvext_json.get("token_ids").is_none());
+        assert!(nvext_json.get("routed_experts").is_none());
+    }
+
     #[test]
     fn test_routed_experts_extra_field_emits_routed_experts() {
         use crate::protocols::openai::nvext::NvExt;
diff --git a/lib/llm/src/protocols/openai/nvext.rs b/lib/llm/src/protocols/openai/nvext.rs
index 7c561dd4a59d..0491dbf8b392 100644
--- a/lib/llm/src/protocols/openai/nvext.rs
+++ b/lib/llm/src/protocols/openai/nvext.rs
@@ -129,6 +129,11 @@ pub struct NvExtResponse {
     /// If `n > 1` is supported here, this needs an indexed/per-choice shape.
     #[serde(skip_serializing_if = "Option::is_none")]
     pub stop_reason: Option<serde_json::Value>,
+
+    /// Output token IDs generated by the engine.
+    /// Populated when the client requests `extra_fields: ["completion_token_ids"]`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub completion_token_ids: Option<Vec<u32>>,
 }
 
 pub(crate) fn merge_response_nvext(
@@ -165,6 +170,7 @@ pub struct NvExtResponseFieldSelection {
     pub routed_experts: bool,
     pub engine_data: bool,
     pub stop_reason: bool,
+    pub completion_token_ids: bool,
 }
 
 impl NvExtResponseFieldSelection {
@@ -182,6 +188,7 @@ impl NvExtResponseFieldSelection {
                     "routed_experts" => selection.routed_experts = true,
                     "engine_data" => selection.engine_data = true,
                     "stop_reason" => selection.stop_reason = true,
+                    "completion_token_ids" => selection.completion_token_ids = true,
                     _ => {}
                 }
             }
@@ -215,6 +222,8 @@ impl NvExtResponseFieldSelection {
     /// - `timing` requires the selection flag, `finish_reason_present == true`, **and** a tracker.
     /// - `engine_data` requires the selection flag **and** a non-`None` `engine_data_from_backend`.
     /// - `stop_reason` requires the selection flag **and** a non-`None` `stop_reason_from_backend`.
+    /// - `completion_token_ids` is accumulated by the chat-completions delta generator
+    ///   and attached to the final chunk after this helper returns.
     pub fn build_response_nvext(
         &self,
         tracker: Option<&std::sync::Arc<crate::protocols::common::timing::RequestTracker>>,
@@ -280,6 +289,7 @@ impl NvExtResponseFieldSelection {
             routed_experts,
             engine_data,
             stop_reason,
+            completion_token_ids: None,
         })
     }
 }
@@ -331,7 +341,7 @@ pub struct NvExt {
     /// Extra fields to be included in the response's nvext
     /// This is a list of field names that should be populated in the response
     /// Supported fields include "worker_id", "timing", "routed_experts", "engine_data",
-    /// "stop_reason", which map to fields in NvExtResponse.
+    /// "stop_reason", and "completion_token_ids", which map to fields in NvExtResponse.
     #[serde(default, skip_serializing_if = "Option::is_none")]
     #[builder(default, setter(strip_option))]
     pub extra_fields: Option<Vec<String>>,
@@ -764,6 +774,22 @@ mod tests {
         );
     }
 
+    /// Requesting only `completion_token_ids` sets that flag and leaves the rest at default.
+    #[test]
+    fn test_nvext_response_field_selection_completion_token_ids_only() {
+        let requested = vec![String::from("completion_token_ids")];
+        let nvext = NvExt::builder().extra_fields(requested).build().unwrap();
+
+        let expected = NvExtResponseFieldSelection {
+            completion_token_ids: true,
+            ..Default::default()
+        };
+        let selection = NvExtResponseFieldSelection::from_nvext(Some(&nvext));
+
+        // Struct-update syntax pins every other selection flag to its default.
+        assert_eq!(selection, expected);
+    }
+
     // Helpers for build_response_nvext tests -----------------------------
 
     fn sel_all_false() -> NvExtResponseFieldSelection {
@@ -966,6 +992,7 @@ mod tests {
             routed_experts: true,
             engine_data: false,
             stop_reason: false,
+            completion_token_ids: false,
         };
         let tracker = tracker_with_prefill_worker();
         let params = disagg_params_full();
@@ -1003,6 +1030,7 @@ mod tests {
                 routed_experts: true,
                 engine_data: false,
                 stop_reason: false,
+                completion_token_ids: false,
             }
         );
     }

From 7d67d6ef8b0a31d3905829a2e4e62843f825ec84 Mon Sep 17 00:00:00 2001
From: AmeenP <ameenp360@gmail.com>
Date: Fri, 15 May 2026 02:20:22 -0700
Subject: [PATCH 2/4] feat(llm): accept renderer cache salt passthrough

Signed-off-by: AmeenP <ameenp360@gmail.com>
---
 components/src/dynamo/vllm/handlers.py        |  5 ++
 docs/components/frontend/nvext.md             |  1 +
 lib/llm/src/preprocessor.rs                   | 31 ++++++++++-
 .../src/protocols/openai/chat_completions.rs  | 51 +++++++++++++++++++
 lib/llm/src/protocols/openai/nvext.rs         | 19 +++++++
 lib/llm/src/protocols/openai/validate.rs      | 16 ++++--
 6 files changed, 117 insertions(+), 6 deletions(-)

diff --git a/components/src/dynamo/vllm/handlers.py b/components/src/dynamo/vllm/handlers.py
index 04955398aca3..e9d21cdd5166 100644
--- a/components/src/dynamo/vllm/handlers.py
+++ b/components/src/dynamo/vllm/handlers.py
@@ -1925,6 +1925,11 @@ def _build_prompt_from_request(
             prompt_token_ids=request["token_ids"],
             multi_modal_data=multi_modal_data,
         )
+        nvext_args = extra_args.get("nvext") if isinstance(extra_args, dict) else None
+        if isinstance(nvext_args, dict):
+            cache_salt = nvext_args.get("cache_salt")
+            if cache_salt is not None:
+                prompt_kwargs["cache_salt"] = cache_salt
         if mm_uuids is not None:
             prompt_kwargs["multi_modal_uuids"] = mm_uuids
         if mm_processor_kwargs is not None:
diff --git a/docs/components/frontend/nvext.md b/docs/components/frontend/nvext.md
index 25a1b182c66f..ec4bf0957949 100644
--- a/docs/components/frontend/nvext.md
+++ b/docs/components/frontend/nvext.md
@@ -35,6 +35,7 @@ Include `nvext` as a top-level field alongside standard OpenAI-compatible fields
 | `backend_instance_id` | `u64` | `None` | Router | Routes the request to a specific backend instance. |
 | `token_data` | `u32[]` | `None` | Preprocessor | Pre-tokenized prompt tokens. When provided, tokenization is skipped. `backend_instance_id` remains an independent routing hint. |
 | `max_thinking_tokens` | `u32` | `None` | Backend | Maximum thinking tokens allowed (passed through to backends). |
+| `cache_salt` | `string` | `None` | Backend | Prefix-cache isolation hint for token-in clients. The top-level `cache_salt` request field is also accepted for renderer compatibility. |
 | `extra_fields` | `string[]` | `None` | Response builder | Fields to include in the response `nvext`. Supported: `"worker_id"`, `"timing"`, `"routed_experts"`, `"engine_data"`, `"stop_reason"`, `"completion_token_ids"`. |
 | `prefill_worker_id` | `u64` | `None` | Router | Routes the request to a specific prefill worker (disaggregated serving). |
 | `decode_worker_id` | `u64` | `None` | Router | Routes the request to a specific decode worker (disaggregated serving). |
diff --git a/lib/llm/src/preprocessor.rs b/lib/llm/src/preprocessor.rs
index ea850db67c3b..5c1b6965edee 100644
--- a/lib/llm/src/preprocessor.rs
+++ b/lib/llm/src/preprocessor.rs
@@ -556,12 +556,36 @@ impl OpenAIPreprocessor {
             }));
         }
 
+        if let Some(extra_args) = Self::nvext_passthrough_extra_args(request) {
+            builder.extra_args(Some(extra_args));
+        }
+
         // Forward mm_processor_kwargs (e.g. use_audio_in_video) to the backend.
         builder.mm_processor_kwargs(request.mm_processor_kwargs().cloned());
 
         Ok(builder)
     }
 
+    /// Builds the `{"nvext": {...}}` passthrough object placed in `extra_args`.
+    ///
+    /// Carries the request's non-empty `extra_fields` list and its `cache_salt`;
+    /// returns `None` when neither is present.
+    fn nvext_passthrough_extra_args<R: NvExtProvider>(request: &R) -> Option<serde_json::Value> {
+        let mut passthrough = serde_json::Map::new();
+
+        let requested = request.nvext_extra_fields().filter(|names| !names.is_empty());
+        if let Some(names) = requested {
+            passthrough.insert("extra_fields".to_string(), serde_json::json!(names));
+        }
+        if let Some(salt) = request.cache_salt() {
+            passthrough.insert("cache_salt".to_string(), serde_json::json!(salt));
+        }
+
+        // An empty map means there is nothing to forward, so emit no wrapper at all.
+        let has_entries = !passthrough.is_empty();
+        has_entries.then(|| serde_json::json!({ "nvext": serde_json::Value::Object(passthrough) }))
+    }
+
     pub fn apply_template<
         R: OAIChatLikeRequest
             + AnnotationsProvider
@@ -623,7 +647,7 @@ impl OpenAIPreprocessor {
         }
     }
 
-    pub async fn gather_multi_modal_data<R: OAIChatLikeRequest>(
+    pub async fn gather_multi_modal_data<R: OAIChatLikeRequest + NvExtProvider>(
         &self,
         request: &R,
         builder: &mut PreprocessedRequestBuilder,
@@ -847,6 +871,11 @@ impl OpenAIPreprocessor {
             let mut extra_args = serde_json::json!({
                 "messages": messages_json
             });
+            if let Some(nvext_passthrough) = Self::nvext_passthrough_extra_args(request)
+                && let Some(nvext) = nvext_passthrough.get("nvext")
+            {
+                extra_args["nvext"] = nvext.clone();
+            }
 
             // Strip redundant inline data: URLs only when frontend decoding is active
             // (media_loader decoded the images into RDMA descriptors). TRT-LLM and
diff --git a/lib/llm/src/protocols/openai/chat_completions.rs b/lib/llm/src/protocols/openai/chat_completions.rs
index 291b837d38df..b4b82b5115b2 100644
--- a/lib/llm/src/protocols/openai/chat_completions.rs
+++ b/lib/llm/src/protocols/openai/chat_completions.rs
@@ -101,6 +101,17 @@ impl NvExtProvider for NvCreateChatCompletionRequest {
     fn raw_prompt(&self) -> Option<String> {
         None
     }
+
+    fn cache_salt(&self) -> Option<&str> {
+        self.nvext
+            .as_ref()
+            .and_then(|nvext| nvext.cache_salt.as_deref())
+            .or_else(|| {
+                self.unsupported_fields
+                    .get("cache_salt")
+                    .and_then(|value| value.as_str())
+            })
+    }
 }
 
 /// Implements `AnnotationsProvider` for `NvCreateChatCompletionRequest`,
@@ -514,4 +525,44 @@ mod tests {
                 .expect("Failed to deserialize request");
         assert!(ValidateRequest::validate(&request).is_err());
     }
+
+    #[test]
+    fn test_cache_salt_accepts_renderer_top_level_shape() {
+        let request_json = json!({
+            "model": "test-model",
+            "messages": [{"role": "user", "content": "(token-in mode)"}],
+            "nvext": {
+                "token_data": [1, 2, 3],
+                "extra_fields": ["completion_token_ids"]
+            },
+            "cache_salt": "ckpt-42"
+        });
+        let request: NvCreateChatCompletionRequest =
+            serde_json::from_value(request_json).expect("Failed to deserialize request");
+
+        ValidateRequest::validate(&request).expect("cache_salt should be accepted");
+        assert_eq!(
+            <NvCreateChatCompletionRequest as NvExtProvider>::cache_salt(&request),
+            Some("ckpt-42")
+        );
+    }
+
+    #[test]
+    fn test_nvext_cache_salt_takes_precedence() {
+        let request_json = json!({
+            "model": "test-model",
+            "messages": [{"role": "user", "content": "(token-in mode)"}],
+            "cache_salt": "top-level",
+            "nvext": {
+                "cache_salt": "nvext-level"
+            }
+        });
+        let request: NvCreateChatCompletionRequest =
+            serde_json::from_value(request_json).expect("Failed to deserialize request");
+
+        assert_eq!(
+            <NvCreateChatCompletionRequest as NvExtProvider>::cache_salt(&request),
+            Some("nvext-level")
+        );
+    }
 }
diff --git a/lib/llm/src/protocols/openai/nvext.rs b/lib/llm/src/protocols/openai/nvext.rs
index 0491dbf8b392..aeabee15fa1c 100644
--- a/lib/llm/src/protocols/openai/nvext.rs
+++ b/lib/llm/src/protocols/openai/nvext.rs
@@ -74,6 +74,14 @@ pub fn apply_header_routing_overrides(nvext: Option<NvExt>, headers: &HeaderMap)
 pub trait NvExtProvider {
     fn nvext(&self) -> Option<&NvExt>;
     fn raw_prompt(&self) -> Option<String>;
+
+    fn nvext_extra_fields(&self) -> Option<&[String]> {
+        self.nvext()?.extra_fields.as_deref()
+    }
+
+    fn cache_salt(&self) -> Option<&str> {
+        self.nvext()?.cache_salt.as_deref()
+    }
 }
 
 /// Worker ID information for disaggregated serving
@@ -338,6 +346,14 @@ pub struct NvExt {
     #[builder(default, setter(strip_option))]
     pub max_thinking_tokens: Option<u32>,
 
+    /// KV prefix-cache isolation hint from token-in clients.
+    ///
+    /// A changed salt lets backends isolate prompt cache entries for identical
+    /// token sequences generated under different checkpoint or rollout state.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    #[builder(default, setter(strip_option))]
+    pub cache_salt: Option<String>,
+
     /// Extra fields to be included in the response's nvext
     /// This is a list of field names that should be populated in the response
     /// Supported fields include "worker_id", "timing", "routed_experts", "engine_data",
@@ -517,6 +533,7 @@ mod tests {
         assert_eq!(nv_ext.backend_instance_id, None);
         assert_eq!(nv_ext.token_data, None);
         assert_eq!(nv_ext.max_thinking_tokens, None);
+        assert_eq!(nv_ext.cache_salt, None);
         assert_eq!(nv_ext.extra_fields, None);
         assert_eq!(nv_ext.prefill_worker_id, None);
         assert_eq!(nv_ext.decode_worker_id, None);
@@ -535,6 +552,7 @@ mod tests {
             .backend_instance_id(42)
             .token_data(vec![1, 2, 3, 4])
             .max_thinking_tokens(1024)
+            .cache_salt("ckpt-42".to_string())
             .extra_fields(vec!["worker_id".to_string()])
             .build()
             .unwrap();
@@ -544,6 +562,7 @@ mod tests {
         assert_eq!(nv_ext.backend_instance_id, Some(42));
         assert_eq!(nv_ext.token_data, Some(vec![1, 2, 3, 4]));
         assert_eq!(nv_ext.max_thinking_tokens, Some(1024));
+        assert_eq!(nv_ext.cache_salt, Some("ckpt-42".to_string()));
         assert_eq!(nv_ext.extra_fields, Some(vec!["worker_id".to_string()]));
         // Validate the built struct
         assert!(nv_ext.validate().is_ok());
diff --git a/lib/llm/src/protocols/openai/validate.rs b/lib/llm/src/protocols/openai/validate.rs
index 559dd109ac1b..19707ec667a5 100644
--- a/lib/llm/src/protocols/openai/validate.rs
+++ b/lib/llm/src/protocols/openai/validate.rs
@@ -97,15 +97,21 @@ pub const MAX_REPETITION_PENALTY: f32 = 2.0;
 // Shared Fields
 //
 
+/// Root-level fields accepted for compatibility with token-in clients even
+/// though Dynamo has not modeled them as first-class OpenAI fields.
+pub const PASSTHROUGH_EXTRA_FIELDS: &[&str] = &["cache_salt"];
+
 /// Validates that no unsupported fields are present in the request
 pub fn validate_no_unsupported_fields(
     unsupported_fields: &std::collections::HashMap<String, serde_json::Value>,
 ) -> Result<(), anyhow::Error> {
-    if !unsupported_fields.is_empty() {
-        let fields: Vec<_> = unsupported_fields
-            .keys()
-            .map(|s| format!("`{}`", s))
-            .collect();
+    let fields: Vec<_> = unsupported_fields
+        .keys()
+        .filter(|field| !PASSTHROUGH_EXTRA_FIELDS.contains(&field.as_str()))
+        .map(|s| format!("`{}`", s))
+        .collect();
+
+    if !fields.is_empty() {
         anyhow::bail!("Unsupported parameter(s): {}", fields.join(", "));
     }
     Ok(())

From df5fb9e3c318b1e2af2b6d84d3f8bad171ef5b75 Mon Sep 17 00:00:00 2001
From: AmeenP <ameenp360@gmail.com>
Date: Fri, 15 May 2026 04:29:09 -0700
Subject: [PATCH 3/4] fix(llm): reject completion token ids for multi-choice

Signed-off-by: AmeenP <ameenp360@gmail.com>
---
 docs/components/frontend/nvext.md             |  2 +-
 .../src/protocols/openai/chat_completions.rs  | 29 +++++++++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/docs/components/frontend/nvext.md b/docs/components/frontend/nvext.md
index ec4bf0957949..1729491c7f7b 100644
--- a/docs/components/frontend/nvext.md
+++ b/docs/components/frontend/nvext.md
@@ -209,7 +209,7 @@ When the client requests response metadata via `extra_fields`, the response incl
 | `routed_experts` | `extra_fields: ["routed_experts"]` | Routed expert capture payload returned by SGLang-backed requests. |
 | `engine_data` | `extra_fields: ["engine_data"]` | Opaque backend-provided engine metadata. |
 | `stop_reason` | `extra_fields: ["stop_reason"]` | Backend-specific matched stop condition, returned under `nvext` because it is not part of the OpenAI completions schema. Dynamo currently serves this as a response-level field for single-choice requests; supporting `n > 1` will require an indexed per-choice shape. |
-| `completion_token_ids` | `extra_fields: ["completion_token_ids"]` | Generated token IDs accumulated across the chat-completions response and emitted on the final chunk. |
+| `completion_token_ids` | `extra_fields: ["completion_token_ids"]` | Generated token IDs accumulated across the chat-completions response and emitted on the final chunk. Supported only for single-choice requests (`n <= 1`). |
 | `token_ids` | Automatic (GAIE Stage 1) | Tokenized prompt for reuse in Stage 2 query-only mode. |
 
 ### Example response `nvext`
diff --git a/lib/llm/src/protocols/openai/chat_completions.rs b/lib/llm/src/protocols/openai/chat_completions.rs
index b4b82b5115b2..156518166983 100644
--- a/lib/llm/src/protocols/openai/chat_completions.rs
+++ b/lib/llm/src/protocols/openai/chat_completions.rs
@@ -364,6 +364,15 @@ impl ValidateRequest for NvCreateChatCompletionRequest {
         // validate::validate_max_tokens(self.inner.max_tokens)?; // warning depricated field
         validate::validate_max_completion_tokens(self.inner.max_completion_tokens)?;
         validate::validate_n(self.inner.n)?;
+        if self.inner.n.unwrap_or(1) > 1
+            && self
+                .nvext
+                .as_ref()
+                .and_then(|nvext| nvext.extra_fields.as_ref())
+                .is_some_and(|fields| fields.iter().any(|field| field == "completion_token_ids"))
+        {
+            anyhow::bail!("`nvext.extra_fields=[\"completion_token_ids\"]` requires `n <= 1`");
+        }
         // none for modalities
         // none for prediction
         // none for audio
@@ -565,4 +574,24 @@ mod tests {
             Some("nvext-level")
         );
     }
+
+    #[test]
+    fn test_completion_token_ids_rejects_multiple_choices() {
+        let request_json = json!({
+            "model": "test-model",
+            "messages": [{"role": "user", "content": "(token-in mode)"}],
+            "n": 2,
+            "nvext": {
+                "extra_fields": ["completion_token_ids"]
+            }
+        });
+        let request: NvCreateChatCompletionRequest =
+            serde_json::from_value(request_json).expect("Failed to deserialize request");
+
+        let err = ValidateRequest::validate(&request).expect_err("n > 1 should be rejected");
+        assert!(
+            err.to_string().contains("completion_token_ids"),
+            "unexpected error: {err}"
+        );
+    }
 }

From ca477a64c743926362a39f393f73d63f445f209c Mon Sep 17 00:00:00 2001
From: AmeenP <ameenp360@gmail.com>
Date: Fri, 15 May 2026 05:18:41 -0700
Subject: [PATCH 4/4] feat(llm): accept rl-sdk token-in nvext shape

Signed-off-by: AmeenP <ameenp360@gmail.com>
---
 docs/components/frontend/nvext.md             |   6 +-
 .../src/protocols/openai/chat_completions.rs  |  22 +++-
 .../openai/chat_completions/delta.rs          | 107 ++++++++++++++++--
 lib/llm/src/protocols/openai/completions.rs   |  22 +++-
 lib/llm/src/protocols/openai/validate.rs      |   2 +-
 5 files changed, 138 insertions(+), 21 deletions(-)

diff --git a/docs/components/frontend/nvext.md b/docs/components/frontend/nvext.md
index 1729491c7f7b..800bbb528d5c 100644
--- a/docs/components/frontend/nvext.md
+++ b/docs/components/frontend/nvext.md
@@ -43,11 +43,13 @@ Include `nvext` as a top-level field alongside standard OpenAI-compatible fields
 | `agent_hints` | object | `None` | Router | Per-request hints for scheduling and load balancing. See [Agent Hints](#agent-hints). |
 | `session_control` | object | `None` | Router | Session lifecycle and sticky routing for subagent KV isolation. See [Session Control](#session-control). |
 
-Related root-level Dynamo output option:
+Related root-level Dynamo fields (output option and compatibility aliases):
 
 | Field | Type | Default | Consumed By | Description |
 |-------|------|---------|-------------|-------------|
 | `return_tokens_as_token_ids` | `bool` | `false` | Response builder | Formats logprob token strings as `token_id:<id>` instead of decoded text. |
+| `cache_salt` | `string` | `None` | Backend | Compatibility alias for `nvext.cache_salt`; `nvext.cache_salt` takes precedence when both are present. |
+| `stop_token_ids` | `u32[]` | `None` | Preprocessor | Compatibility alias for integer token stop IDs, equivalent to passing token IDs in the normal `stop` array. |
 
 `return_tokens_as_token_ids` only changes returned logprob token display. To stop on
 token IDs, pass integer IDs in the normal `stop` array, for example
@@ -207,7 +209,7 @@ When the client requests response metadata via `extra_fields`, the response incl
 | `worker_id` | `extra_fields: ["worker_id"]` | Prefill/decode worker IDs and data parallel ranks that processed the request. |
 | `timing` | `extra_fields: ["timing"]` | Per-request timing information (TTFT, ITL, queue time, etc.). |
 | `routed_experts` | `extra_fields: ["routed_experts"]` | Routed expert capture payload returned by SGLang-backed requests. |
-| `engine_data` | `extra_fields: ["engine_data"]` | Opaque backend-provided engine metadata. |
+| `engine_data` | `extra_fields: ["engine_data"]` | Opaque backend-provided engine metadata. For chat completions, Dynamo also adds the generated `completion_token_ids` and, when the backend reports logprobs, `completion_logprobs` to this object on the final chunk, for compatibility with rl-sdk token clients. If the backend value is not a JSON object, it is preserved under a `backend` key. |
 | `stop_reason` | `extra_fields: ["stop_reason"]` | Backend-specific matched stop condition, returned under `nvext` because it is not part of the OpenAI completions schema. Dynamo currently serves this as a response-level field for single-choice requests; supporting `n > 1` will require an indexed per-choice shape. |
 | `completion_token_ids` | `extra_fields: ["completion_token_ids"]` | Generated token IDs accumulated across the chat-completions response and emitted on the final chunk. Supported only for single-choice requests (`n <= 1`). |
 | `token_ids` | Automatic (GAIE Stage 1) | Tokenized prompt for reuse in Stage 2 query-only mode. |
diff --git a/lib/llm/src/protocols/openai/chat_completions.rs b/lib/llm/src/protocols/openai/chat_completions.rs
index 156518166983..981554762ad4 100644
--- a/lib/llm/src/protocols/openai/chat_completions.rs
+++ b/lib/llm/src/protocols/openai/chat_completions.rs
@@ -299,7 +299,14 @@ impl OpenAIStopConditionsProvider for NvCreateChatCompletionRequest {
     }
 
     fn get_stop_token_ids(&self) -> Option<Vec<crate::types::TokenIdType>> {
-        self.inner.stop.as_ref().and_then(|stop| stop.token_ids())
+        // Token IDs in the standard `stop` array take precedence; the root-level
+        // `stop_token_ids` alias is consulted only when `stop` supplies none.
+        let from_stop = self.inner.stop.as_ref().and_then(|stop| stop.token_ids());
+        from_stop.or_else(|| {
+            let raw = self.unsupported_fields.get("stop_token_ids")?;
+            // Deserialize from the borrowed value (no clone); malformed input is ignored.
+            <Vec<crate::types::TokenIdType> as serde::Deserialize>::deserialize(raw).ok()
+        })
     }
 
     /// Returns a reference to the optional `NvExt` extension, if available.
@@ -524,15 +531,22 @@ mod tests {
             serde_json::from_value(scalar_token_id_stop);
         assert!(result.is_err());
 
-        let unsupported_stop_token_ids = json!({
+        let passthrough_stop_token_ids = json!({
             "model": "test-model",
             "messages": [{"role": "user", "content": "Hello"}],
             "stop_token_ids": [576]
         });
         let request: NvCreateChatCompletionRequest =
-            serde_json::from_value(unsupported_stop_token_ids)
+            serde_json::from_value(passthrough_stop_token_ids)
                 .expect("Failed to deserialize request");
-        assert!(ValidateRequest::validate(&request).is_err());
+        ValidateRequest::validate(&request).expect("stop_token_ids should be accepted");
+        assert_eq!(request.get_stop_token_ids(), Some(vec![576]));
+
+        let stop_conditions = request
+            .extract_stop_conditions()
+            .expect("extract stop conditions");
+        assert_eq!(stop_conditions.stop, None);
+        assert_eq!(stop_conditions.stop_token_ids, Some(vec![576]));
     }
 
     #[test]
diff --git a/lib/llm/src/protocols/openai/chat_completions/delta.rs b/lib/llm/src/protocols/openai/chat_completions/delta.rs
index 0edad90fdd62..10960719f6cb 100644
--- a/lib/llm/src/protocols/openai/chat_completions/delta.rs
+++ b/lib/llm/src/protocols/openai/chat_completions/delta.rs
@@ -60,8 +60,11 @@ pub struct DeltaGenerator {
     /// Request tracker for per-request metrics (shared with PreprocessedRequest).
     tracker: Arc<RequestTracker>,
     /// Accumulated output token IDs across chunks, emitted on the final chunk
-    /// when `nvext.extra_fields` includes `completion_token_ids`.
+    /// when `nvext.extra_fields` includes `completion_token_ids` or `engine_data`.
     accumulated_completion_token_ids: Vec<TokenIdType>,
+    /// Accumulated per-token logprobs across chunks, emitted under
+    /// `nvext.engine_data.completion_logprobs` on the final chunk.
+    accumulated_completion_logprobs: Vec<f64>,
 }
 
 impl DeltaGenerator {
@@ -79,6 +82,7 @@ impl DeltaGenerator {
             options,
             tracker,
             accumulated_completion_token_ids: Vec::new(),
+            accumulated_completion_logprobs: Vec::new(),
         }
     }
 
@@ -261,10 +265,19 @@ impl crate::protocols::openai::DeltaGeneratorExt<NvCreateChatCompletionStreamRes
 
         self.usage.completion_tokens += token_length;
 
-        if self.options.response_fields.completion_token_ids && !delta.token_ids.is_empty() {
+        if (self.options.response_fields.completion_token_ids
+            || self.options.response_fields.engine_data)
+            && !delta.token_ids.is_empty()
+        {
             self.accumulated_completion_token_ids
                 .extend_from_slice(&delta.token_ids);
         }
+        if self.options.response_fields.engine_data
+            && let Some(log_probs) = delta.log_probs.as_ref()
+        {
+            self.accumulated_completion_logprobs
+                .extend_from_slice(log_probs);
+        }
 
         // If backend provides completion_usage, use it to update usage stats
         // This is critical for prompt embeddings where prompt_tokens comes from
@@ -331,7 +344,10 @@ impl crate::protocols::openai::DeltaGeneratorExt<NvCreateChatCompletionStreamRes
                 stop_reason,
             )
             .or_else(|| {
-                if self.options.response_fields.completion_token_ids && finish_reason.is_some() {
+                if (self.options.response_fields.completion_token_ids
+                    || self.options.response_fields.engine_data)
+                    && finish_reason.is_some()
+                {
                     Some(NvExtResponse {
                         worker_id: None,
                         timing: None,
@@ -350,6 +366,28 @@ impl crate::protocols::openai::DeltaGeneratorExt<NvCreateChatCompletionStreamRes
                 nvext_response.completion_token_ids =
                     Some(self.accumulated_completion_token_ids.clone());
             }
+            if self.options.response_fields.engine_data && finish_reason.is_some() {
+                // Final chunk: extend a backend-provided object in place; keep any
+                // other backend value under a "backend" key beside the token data.
+                let mut engine_data = match nvext_response.engine_data.take() {
+                    Some(serde_json::Value::Object(map)) => map,
+                    Some(value) => serde_json::Map::from_iter([("backend".to_string(), value)]),
+                    None => serde_json::Map::new(),
+                };
+                // `json!` serializes through a reference, so no clone is needed.
+                engine_data.insert(
+                    "completion_token_ids".to_string(),
+                    serde_json::json!(self.accumulated_completion_token_ids),
+                );
+                // Logprobs are optional: omit the key when none were reported.
+                if !self.accumulated_completion_logprobs.is_empty() {
+                    engine_data.insert(
+                        "completion_logprobs".to_string(),
+                        serde_json::json!(self.accumulated_completion_logprobs),
+                    );
+                }
+                nvext_response.engine_data = Some(serde_json::Value::Object(engine_data));
+            }
 
             if let Ok(nvext_json) = serde_json::to_value(&nvext_response) {
                 stream_response.nvext = Some(nvext_json);
@@ -739,6 +777,7 @@ mod tests {
             .expect("engine_data should be present");
         assert_eq!(engine_data["kv_transfer_time_ms"], 12.3);
         assert_eq!(engine_data["prefill_compute_time_ms"], 45.6);
+        assert_eq!(engine_data["completion_token_ids"], serde_json::json!([42]));
     }
 
     #[test]
@@ -803,12 +842,60 @@ mod tests {
             .choice_from_postprocessor(backend_output)
             .expect("should produce a response");
 
-        // engine_data is None from backend, so nvext.engine_data should be absent
-        if let Some(nvext) = &response.nvext {
-            assert!(
-                nvext.get("engine_data").is_none() || nvext.get("engine_data").unwrap().is_null(),
-                "engine_data should not appear when backend provides None"
-            );
-        }
+        let nvext = response
+            .nvext
+            .expect("nvext present for engine_data request with generated tokens");
+        let engine_data = nvext
+            .get("engine_data")
+            .expect("engine_data should include generated token IDs");
+        assert_eq!(engine_data["completion_token_ids"], serde_json::json!([42]));
+        assert!(engine_data.get("completion_logprobs").is_none());
+    }
+
+    #[test]
+    fn test_engine_data_accumulates_completion_token_ids_and_logprobs() {
+        let request = create_test_request_with_extra_fields(vec!["engine_data".to_string()]);
+        let mut generator = request.response_generator("req-engine-5".to_string());
+
+        let mut first_output = final_backend_output();
+        first_output.token_ids = vec![7];
+        first_output.tokens = vec![Some("A".to_string())];
+        first_output.text = Some("A".to_string());
+        first_output.log_probs = Some(vec![-0.1]);
+        first_output.finish_reason = None;
+        first_output.disaggregated_params = None;
+        // First chunk has no finish reason, so nothing may be emitted in nvext yet.
+        let first_response = generator
+            .choice_from_postprocessor(first_output)
+            .expect("first choice generation");
+        assert!(
+            first_response.nvext.is_none(),
+            "engine_data token IDs should be emitted only on the final chunk"
+        );
+        // Final chunk; its finish reason is assumed to come from final_backend_output().
+        let mut final_output = final_backend_output();
+        final_output.token_ids = vec![8, 9];
+        final_output.tokens = vec![Some("B".to_string()), Some("C".to_string())];
+        final_output.text = Some("BC".to_string());
+        final_output.log_probs = Some(vec![-0.2, -0.3]);
+        final_output.disaggregated_params = None;
+        // The final chunk must carry the metadata accumulated over both chunks.
+        let final_response = generator
+            .choice_from_postprocessor(final_output)
+            .expect("final choice generation");
+        let nvext = final_response
+            .nvext
+            .expect("nvext present for engine_data request");
+        let engine_data = nvext
+            .get("engine_data")
+            .expect("engine_data should include generated token metadata");
+        assert_eq!(
+            engine_data["completion_token_ids"],
+            serde_json::json!([7, 8, 9])
+        );
+        assert_eq!(
+            engine_data["completion_logprobs"],
+            serde_json::json!([-0.1, -0.2, -0.3])
+        );
     }
 }
diff --git a/lib/llm/src/protocols/openai/completions.rs b/lib/llm/src/protocols/openai/completions.rs
index e60890214bb9..aa69138113da 100644
--- a/lib/llm/src/protocols/openai/completions.rs
+++ b/lib/llm/src/protocols/openai/completions.rs
@@ -252,7 +252,14 @@ impl OpenAIStopConditionsProvider for NvCreateCompletionRequest {
     }
 
     fn get_stop_token_ids(&self) -> Option<Vec<crate::types::TokenIdType>> {
-        self.inner.stop.as_ref().and_then(|stop| stop.token_ids())
+        // Token IDs in the standard `stop` array take precedence; the root-level
+        // `stop_token_ids` alias is consulted only when `stop` supplies none.
+        let from_stop = self.inner.stop.as_ref().and_then(|stop| stop.token_ids());
+        from_stop.or_else(|| {
+            let raw = self.unsupported_fields.get("stop_token_ids")?;
+            // Deserialize from the borrowed value (no clone); malformed input is ignored.
+            <Vec<crate::types::TokenIdType> as serde::Deserialize>::deserialize(raw).ok()
+        })
     }
 
     fn nvext(&self) -> Option<&NvExt> {
@@ -733,13 +740,20 @@ mod tests {
         assert_eq!(request.get_stop(), Some(vec!["token_id:576".to_string()]));
         assert_eq!(request.get_stop_token_ids(), None);
 
-        let unsupported_stop_token_ids = json!({
+        let passthrough_stop_token_ids = json!({
             "model": "test-model",
             "prompt": [1, 2, 3],
             "stop_token_ids": [576]
         });
-        let request: NvCreateCompletionRequest = serde_json::from_value(unsupported_stop_token_ids)
+        let request: NvCreateCompletionRequest = serde_json::from_value(passthrough_stop_token_ids)
             .expect("Failed to deserialize request");
-        assert!(ValidateRequest::validate(&request).is_err());
+        ValidateRequest::validate(&request).expect("stop_token_ids should be accepted");
+        assert_eq!(request.get_stop_token_ids(), Some(vec![576]));
+
+        let stop_conditions = request
+            .extract_stop_conditions()
+            .expect("extract stop conditions");
+        assert_eq!(stop_conditions.stop, None);
+        assert_eq!(stop_conditions.stop_token_ids, Some(vec![576]));
     }
 }
diff --git a/lib/llm/src/protocols/openai/validate.rs b/lib/llm/src/protocols/openai/validate.rs
index 19707ec667a5..5d5dc9263929 100644
--- a/lib/llm/src/protocols/openai/validate.rs
+++ b/lib/llm/src/protocols/openai/validate.rs
@@ -99,7 +99,7 @@ pub const MAX_REPETITION_PENALTY: f32 = 2.0;
 
 /// Root-level fields accepted for compatibility with token-in clients even
 /// though Dynamo has not modeled them as first-class OpenAI fields.
-pub const PASSTHROUGH_EXTRA_FIELDS: &[&str] = &["cache_salt"];
+pub const PASSTHROUGH_EXTRA_FIELDS: &[&str] = &["cache_salt", "stop_token_ids"];
 
 /// Validates that no unsupported fields are present in the request
 pub fn validate_no_unsupported_fields(