ai-dynamo · AmeenP · May 13, 2026 · May 15, 2026 · May 15, 2026 · May 15, 2026
@@ -1925,6 +1925,11 @@ def _build_prompt_from_request(
             prompt_token_ids=request["token_ids"],
             multi_modal_data=multi_modal_data,
         )
+        nvext_args = extra_args.get("nvext") if isinstance(extra_args, dict) else None
+        if isinstance(nvext_args, dict):
+            cache_salt = nvext_args.get("cache_salt")
+            if cache_salt is not None:
+                prompt_kwargs["cache_salt"] = cache_salt
         if mm_uuids is not None:
             prompt_kwargs["multi_modal_uuids"] = mm_uuids
         if mm_processor_kwargs is not None:

diff --git a/docs/components/frontend/nvext.md b/docs/components/frontend/nvext.md
@@ -33,20 +33,23 @@ Include `nvext` as a top-level field alongside standard OpenAI-compatible fields
 | `use_raw_prompt` | `bool` | `None` | Preprocessor | Bypasses the prompt template and passes the prompt directly to the tokenizer. |
 | `annotations` | `string[]` | `None` | Preprocessor | Triggers out-of-band information in the SSE stream via the `event:` field. |
 | `backend_instance_id` | `u64` | `None` | Router | Routes the request to a specific backend instance. |
-| `token_data` | `u32[]` | `None` | Preprocessor | Pre-tokenized prompt tokens. When provided with `backend_instance_id`, tokenization is skipped. |
+| `token_data` | `u32[]` | `None` | Preprocessor | Pre-tokenized prompt tokens. When provided, tokenization is skipped. `backend_instance_id` remains an independent routing hint. |
 | `max_thinking_tokens` | `u32` | `None` | Backend | Maximum thinking tokens allowed (passed through to backends). |
-| `extra_fields` | `string[]` | `None` | Response builder | Fields to include in the response `nvext`. Supported: `"worker_id"`, `"timing"`, `"routed_experts"`, `"engine_data"`, `"stop_reason"`. |
+| `cache_salt` | `string` | `None` | Backend | Prefix-cache isolation hint for token-in clients. The top-level `cache_salt` request field is also accepted for renderer compatibility. |
+| `extra_fields` | `string[]` | `None` | Response builder | Fields to include in the response `nvext`. Supported: `"worker_id"`, `"timing"`, `"routed_experts"`, `"engine_data"`, `"stop_reason"`, `"completion_token_ids"`. |
 | `prefill_worker_id` | `u64` | `None` | Router | Routes the request to a specific prefill worker (disaggregated serving). |
 | `decode_worker_id` | `u64` | `None` | Router | Routes the request to a specific decode worker (disaggregated serving). |
 | `agent_context` | object | `None` | Preprocessor | Passive session and trajectory identity for agent traces. See [Agent Context](#agent-context) below and [Agent Tracing](../../agents/agent-tracing.md). |
 | `agent_hints` | object | `None` | Router | Per-request hints for scheduling and load balancing. See [Agent Hints](#agent-hints). |
 | `session_control` | object | `None` | Router | Session lifecycle and sticky routing for subagent KV isolation. See [Session Control](#session-control). |
 
-Related root-level Dynamo output option:
+Related root-level Dynamo compatibility fields:
 
 | Field | Type | Default | Consumed By | Description |
 |-------|------|---------|-------------|-------------|
 | `return_tokens_as_token_ids` | `bool` | `false` | Response builder | Formats logprob token strings as `token_id:<id>` instead of decoded text. |
+| `cache_salt` | `string` | `None` | Backend | Compatibility alias for `nvext.cache_salt`; `nvext.cache_salt` takes precedence when both are present. |
+| `stop_token_ids` | `u32[]` | `None` | Preprocessor | Compatibility alias for integer stop token IDs, equivalent to passing token IDs in the normal `stop` array. It is used only when `stop` carries no integer token IDs; the two sources are not merged. |
 
 `return_tokens_as_token_ids` only changes returned logprob token display. To stop on
 token IDs, pass integer IDs in the normal `stop` array, for example
@@ -206,8 +209,9 @@ When the client requests response metadata via `extra_fields`, the response incl
 | `worker_id` | `extra_fields: ["worker_id"]` | Prefill/decode worker IDs and data parallel ranks that processed the request. |
 | `timing` | `extra_fields: ["timing"]` | Per-request timing information (TTFT, ITL, queue time, etc.). |
 | `routed_experts` | `extra_fields: ["routed_experts"]` | Routed expert capture payload returned by SGLang-backed requests. |
-| `engine_data` | `extra_fields: ["engine_data"]` | Opaque backend-provided engine metadata. |
+| `engine_data` | `extra_fields: ["engine_data"]` | Opaque backend-provided engine metadata. For chat token-in requests, Dynamo also includes generated `completion_token_ids` and, when available, `completion_logprobs` under this object for compatibility with rl-sdk token clients. |
 | `stop_reason` | `extra_fields: ["stop_reason"]` | Backend-specific matched stop condition, returned under `nvext` because it is not part of the OpenAI completions schema. Dynamo currently serves this as a response-level field for single-choice requests; supporting `n > 1` will require an indexed per-choice shape. |
+| `completion_token_ids` | `extra_fields: ["completion_token_ids"]` | Generated token IDs accumulated across the chat-completions response and emitted on the final chunk. Supported only for single-choice requests (`n <= 1`). |
 | `token_ids` | Automatic (GAIE Stage 1) | Tokenized prompt for reuse in Stage 2 query-only mode. |
 
 ### Example response `nvext`

@@ -556,12 +556,36 @@ impl OpenAIPreprocessor {
             }));
         }
 
+        if let Some(extra_args) = Self::nvext_passthrough_extra_args(request) {
+            builder.extra_args(Some(extra_args));
+        }
+
         // Forward mm_processor_kwargs (e.g. use_audio_in_video) to the backend.
         builder.mm_processor_kwargs(request.mm_processor_kwargs().cloned());
 
         Ok(builder)
     }
 
+    fn nvext_passthrough_extra_args<R: NvExtProvider>(request: &R) -> Option<serde_json::Value> {
+        let mut nvext_args = serde_json::Map::new();
+
+        if let Some(fields) = request.nvext_extra_fields()
+            && !fields.is_empty()
+        {
+            nvext_args.insert("extra_fields".to_string(), serde_json::json!(fields));
+        }
+
+        if let Some(cache_salt) = request.cache_salt() {
+            nvext_args.insert("cache_salt".to_string(), serde_json::json!(cache_salt));
+        }
+
+        if nvext_args.is_empty() {
+            None
+        } else {
+            Some(serde_json::json!({ "nvext": serde_json::Value::Object(nvext_args) }))
+        }
+    }
+
     pub fn apply_template<
         R: OAIChatLikeRequest
             + AnnotationsProvider
@@ -623,7 +647,7 @@ impl OpenAIPreprocessor {
         }
     }
 
-    pub async fn gather_multi_modal_data<R: OAIChatLikeRequest>(
+    pub async fn gather_multi_modal_data<R: OAIChatLikeRequest + NvExtProvider>(
         &self,
         request: &R,
         builder: &mut PreprocessedRequestBuilder,
@@ -847,6 +871,11 @@ impl OpenAIPreprocessor {
             let mut extra_args = serde_json::json!({
                 "messages": messages_json
             });
+            if let Some(nvext_passthrough) = Self::nvext_passthrough_extra_args(request)
+                && let Some(nvext) = nvext_passthrough.get("nvext")
+            {
+                extra_args["nvext"] = nvext.clone();
+            }
 
             // Strip redundant inline data: URLs only when frontend decoding is active
             // (media_loader decoded the images into RDMA descriptors). TRT-LLM and

@@ -101,6 +101,17 @@ impl NvExtProvider for NvCreateChatCompletionRequest {
     fn raw_prompt(&self) -> Option<String> {
         None
     }
+
+    fn cache_salt(&self) -> Option<&str> {
+        self.nvext
+            .as_ref()
+            .and_then(|nvext| nvext.cache_salt.as_deref())
+            .or_else(|| {
+                self.unsupported_fields
+                    .get("cache_salt")
+                    .and_then(|value| value.as_str())
+            })
+    }
 }
 
 /// Implements `AnnotationsProvider` for `NvCreateChatCompletionRequest`,
@@ -288,7 +299,14 @@ impl OpenAIStopConditionsProvider for NvCreateChatCompletionRequest {
     }
 
     fn get_stop_token_ids(&self) -> Option<Vec<crate::types::TokenIdType>> {
-        self.inner.stop.as_ref().and_then(|stop| stop.token_ids())
+        if let Some(ids) = self.inner.stop.as_ref().and_then(|stop| stop.token_ids()) {
+            return Some(ids);
+        }
+        self.unsupported_fields
+            .get("stop_token_ids")
+            .and_then(|value| {
+                serde_json::from_value::<Vec<crate::types::TokenIdType>>(value.clone()).ok()
+            })
     }
 
     /// Returns a reference to the optional `NvExt` extension, if available.
@@ -353,6 +371,15 @@ impl ValidateRequest for NvCreateChatCompletionRequest {
         // validate::validate_max_tokens(self.inner.max_tokens)?; // warning depricated field
         validate::validate_max_completion_tokens(self.inner.max_completion_tokens)?;
         validate::validate_n(self.inner.n)?;
+        if self.inner.n.unwrap_or(1) > 1
+            && self
+                .nvext
+                .as_ref()
+                .and_then(|nvext| nvext.extra_fields.as_ref())
+                .is_some_and(|fields| fields.iter().any(|field| field == "completion_token_ids"))
+        {
+            anyhow::bail!("`nvext.extra_fields=[\"completion_token_ids\"]` requires `n <= 1`");
+        }
         // none for modalities
         // none for prediction
         // none for audio
@@ -504,14 +531,81 @@ mod tests {
             serde_json::from_value(scalar_token_id_stop);
         assert!(result.is_err());
 
-        let unsupported_stop_token_ids = json!({
+        let passthrough_stop_token_ids = json!({
             "model": "test-model",
             "messages": [{"role": "user", "content": "Hello"}],
             "stop_token_ids": [576]
         });
         let request: NvCreateChatCompletionRequest =
-            serde_json::from_value(unsupported_stop_token_ids)
+            serde_json::from_value(passthrough_stop_token_ids)
                 .expect("Failed to deserialize request");
-        assert!(ValidateRequest::validate(&request).is_err());
+        ValidateRequest::validate(&request).expect("stop_token_ids should be accepted");
+        assert_eq!(request.get_stop_token_ids(), Some(vec![576]));
+
+        let stop_conditions = request
+            .extract_stop_conditions()
+            .expect("extract stop conditions");
+        assert_eq!(stop_conditions.stop, None);
+        assert_eq!(stop_conditions.stop_token_ids, Some(vec![576]));
+    }
+
+    #[test]
+    fn test_cache_salt_accepts_renderer_top_level_shape() {
+        let request_json = json!({
+            "model": "test-model",
+            "messages": [{"role": "user", "content": "(token-in mode)"}],
+            "nvext": {
+                "token_data": [1, 2, 3],
+                "extra_fields": ["completion_token_ids"]
+            },
+            "cache_salt": "ckpt-42"
+        });
+        let request: NvCreateChatCompletionRequest =
+            serde_json::from_value(request_json).expect("Failed to deserialize request");
+
+        ValidateRequest::validate(&request).expect("cache_salt should be accepted");
+        assert_eq!(
+            <NvCreateChatCompletionRequest as NvExtProvider>::cache_salt(&request),
+            Some("ckpt-42")
+        );
+    }
+
+    #[test]
+    fn test_nvext_cache_salt_takes_precedence() {
+        let request_json = json!({
+            "model": "test-model",
+            "messages": [{"role": "user", "content": "(token-in mode)"}],
+            "cache_salt": "top-level",
+            "nvext": {
+                "cache_salt": "nvext-level"
+            }
+        });
+        let request: NvCreateChatCompletionRequest =
+            serde_json::from_value(request_json).expect("Failed to deserialize request");
+
+        assert_eq!(
+            <NvCreateChatCompletionRequest as NvExtProvider>::cache_salt(&request),
+            Some("nvext-level")
+        );
+    }
+
+    #[test]
+    fn test_completion_token_ids_rejects_multiple_choices() {
+        let request_json = json!({
+            "model": "test-model",
+            "messages": [{"role": "user", "content": "(token-in mode)"}],
+            "n": 2,
+            "nvext": {
+                "extra_fields": ["completion_token_ids"]
+            }
+        });
+        let request: NvCreateChatCompletionRequest =
+            serde_json::from_value(request_json).expect("Failed to deserialize request");
+
+        let err = ValidateRequest::validate(&request).expect_err("n > 1 should be rejected");
+        assert!(
+            err.to_string().contains("completion_token_ids"),
+            "unexpected error: {err}"
+        );
     }
 }