Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions components/src/dynamo/vllm/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1925,6 +1925,11 @@ def _build_prompt_from_request(
prompt_token_ids=request["token_ids"],
multi_modal_data=multi_modal_data,
)
nvext_args = extra_args.get("nvext") if isinstance(extra_args, dict) else None
if isinstance(nvext_args, dict):
cache_salt = nvext_args.get("cache_salt")
if cache_salt is not None:
prompt_kwargs["cache_salt"] = cache_salt
if mm_uuids is not None:
prompt_kwargs["multi_modal_uuids"] = mm_uuids
if mm_processor_kwargs is not None:
Expand Down
12 changes: 8 additions & 4 deletions docs/components/frontend/nvext.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,20 +33,23 @@ Include `nvext` as a top-level field alongside standard OpenAI-compatible fields
| `use_raw_prompt` | `bool` | `None` | Preprocessor | Bypasses the prompt template and passes the prompt directly to the tokenizer. |
| `annotations` | `string[]` | `None` | Preprocessor | Triggers out-of-band information in the SSE stream via the `event:` field. |
| `backend_instance_id` | `u64` | `None` | Router | Routes the request to a specific backend instance. |
| `token_data` | `u32[]` | `None` | Preprocessor | Pre-tokenized prompt tokens. When provided with `backend_instance_id`, tokenization is skipped. |
| `token_data` | `u32[]` | `None` | Preprocessor | Pre-tokenized prompt tokens. When provided, tokenization is skipped. `backend_instance_id` remains an independent routing hint. |
| `max_thinking_tokens` | `u32` | `None` | Backend | Maximum thinking tokens allowed (passed through to backends). |
| `extra_fields` | `string[]` | `None` | Response builder | Fields to include in the response `nvext`. Supported: `"worker_id"`, `"timing"`, `"routed_experts"`, `"engine_data"`, `"stop_reason"`. |
| `cache_salt` | `string` | `None` | Backend | Prefix-cache isolation hint for token-in clients. The top-level `cache_salt` request field is also accepted for renderer compatibility. |
| `extra_fields` | `string[]` | `None` | Response builder | Fields to include in the response `nvext`. Supported: `"worker_id"`, `"timing"`, `"routed_experts"`, `"engine_data"`, `"stop_reason"`, `"completion_token_ids"`. |
| `prefill_worker_id` | `u64` | `None` | Router | Routes the request to a specific prefill worker (disaggregated serving). |
| `decode_worker_id` | `u64` | `None` | Router | Routes the request to a specific decode worker (disaggregated serving). |
| `agent_context` | object | `None` | Preprocessor | Passive session and trajectory identity for agent traces. See [Agent Context](#agent-context) below and [Agent Tracing](../../agents/agent-tracing.md). |
| `agent_hints` | object | `None` | Router | Per-request hints for scheduling and load balancing. See [Agent Hints](#agent-hints). |
| `session_control` | object | `None` | Router | Session lifecycle and sticky routing for subagent KV isolation. See [Session Control](#session-control). |

Related root-level Dynamo output option:
Related root-level Dynamo compatibility fields:

| Field | Type | Default | Consumed By | Description |
|-------|------|---------|-------------|-------------|
| `return_tokens_as_token_ids` | `bool` | `false` | Response builder | Formats logprob token strings as `token_id:<id>` instead of decoded text. |
| `cache_salt` | `string` | `None` | Backend | Compatibility alias for `nvext.cache_salt`; `nvext.cache_salt` takes precedence when both are present. |
| `stop_token_ids` | `u32[]` | `None` | Preprocessor | Compatibility alias for integer token stop IDs, equivalent to passing token IDs in the normal `stop` array. |

`return_tokens_as_token_ids` only changes returned logprob token display. To stop on
token IDs, pass integer IDs in the normal `stop` array, for example
Expand Down Expand Up @@ -206,8 +209,9 @@ When the client requests response metadata via `extra_fields`, the response incl
| `worker_id` | `extra_fields: ["worker_id"]` | Prefill/decode worker IDs and data parallel ranks that processed the request. |
| `timing` | `extra_fields: ["timing"]` | Per-request timing information (TTFT, ITL, queue time, etc.). |
| `routed_experts` | `extra_fields: ["routed_experts"]` | Routed expert capture payload returned by SGLang-backed requests. |
| `engine_data` | `extra_fields: ["engine_data"]` | Opaque backend-provided engine metadata. |
| `engine_data` | `extra_fields: ["engine_data"]` | Opaque backend-provided engine metadata. For chat token-in requests, Dynamo also includes generated `completion_token_ids` and, when available, `completion_logprobs` under this object for compatibility with rl-sdk token clients. |
| `stop_reason` | `extra_fields: ["stop_reason"]` | Backend-specific matched stop condition, returned under `nvext` because it is not part of the OpenAI completions schema. Dynamo currently serves this as a response-level field for single-choice requests; supporting `n > 1` will require an indexed per-choice shape. |
| `completion_token_ids` | `extra_fields: ["completion_token_ids"]` | Generated token IDs accumulated across the chat-completions response and emitted on the final chunk. Supported only for single-choice requests (`n <= 1`). |
| `token_ids` | Automatic (GAIE Stage 1) | Tokenized prompt for reuse in Stage 2 query-only mode. |

### Example response `nvext`
Expand Down
31 changes: 30 additions & 1 deletion lib/llm/src/preprocessor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -556,12 +556,36 @@ impl OpenAIPreprocessor {
}));
}

if let Some(extra_args) = Self::nvext_passthrough_extra_args(request) {
builder.extra_args(Some(extra_args));
}

// Forward mm_processor_kwargs (e.g. use_audio_in_video) to the backend.
builder.mm_processor_kwargs(request.mm_processor_kwargs().cloned());

Ok(builder)
}

/// Builds the `{"nvext": {...}}` object carried in the request's `extra_args`.
///
/// Two values are copied from the request when present:
/// - `extra_fields`: only when the request supplies a non-empty list.
/// - `cache_salt`: whenever the request reports one.
///
/// Returns `None` when neither value is available, so callers can skip
/// setting `extra_args` entirely.
fn nvext_passthrough_extra_args<R: NvExtProvider>(request: &R) -> Option<serde_json::Value> {
    let mut passthrough = serde_json::Map::new();

    // An empty `extra_fields` list carries no information; leave it out.
    match request.nvext_extra_fields() {
        Some(extra_fields) if !extra_fields.is_empty() => {
            passthrough.insert("extra_fields".to_string(), serde_json::json!(extra_fields));
        }
        _ => {}
    }

    if let Some(salt) = request.cache_salt() {
        passthrough.insert("cache_salt".to_string(), serde_json::json!(salt));
    }

    if passthrough.is_empty() {
        return None;
    }

    Some(serde_json::json!({ "nvext": serde_json::Value::Object(passthrough) }))
}

pub fn apply_template<
R: OAIChatLikeRequest
+ AnnotationsProvider
Expand Down Expand Up @@ -623,7 +647,7 @@ impl OpenAIPreprocessor {
}
}

pub async fn gather_multi_modal_data<R: OAIChatLikeRequest>(
pub async fn gather_multi_modal_data<R: OAIChatLikeRequest + NvExtProvider>(
&self,
request: &R,
builder: &mut PreprocessedRequestBuilder,
Expand Down Expand Up @@ -847,6 +871,11 @@ impl OpenAIPreprocessor {
let mut extra_args = serde_json::json!({
"messages": messages_json
});
if let Some(nvext_passthrough) = Self::nvext_passthrough_extra_args(request)
&& let Some(nvext) = nvext_passthrough.get("nvext")
{
extra_args["nvext"] = nvext.clone();
}

// Strip redundant inline data: URLs only when frontend decoding is active
// (media_loader decoded the images into RDMA descriptors). TRT-LLM and
Expand Down
102 changes: 98 additions & 4 deletions lib/llm/src/protocols/openai/chat_completions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,17 @@ impl NvExtProvider for NvCreateChatCompletionRequest {
fn raw_prompt(&self) -> Option<String> {
None
}

/// Returns the cache salt attached to this request, if any.
///
/// `nvext.cache_salt` is consulted first. When it is absent, a string-valued
/// `cache_salt` entry in `unsupported_fields` is used instead; a non-string
/// entry there yields `None`.
fn cache_salt(&self) -> Option<&str> {
    let from_nvext = self
        .nvext
        .as_ref()
        .and_then(|ext| ext.cache_salt.as_deref());
    if let Some(salt) = from_nvext {
        return Some(salt);
    }

    // Fall back to the `cache_salt` entry kept in `unsupported_fields`.
    self.unsupported_fields.get("cache_salt")?.as_str()
}
}

/// Implements `AnnotationsProvider` for `NvCreateChatCompletionRequest`,
Expand Down Expand Up @@ -288,7 +299,14 @@ impl OpenAIStopConditionsProvider for NvCreateChatCompletionRequest {
}

/// Returns the stop token IDs requested by the client, if any.
///
/// Resolution order:
/// 1. Token IDs reported by the standard `stop` field (`stop.token_ids()`).
/// 2. Otherwise, the `stop_token_ids` entry kept in `unsupported_fields`,
///    decoded as a list of token IDs.
///
/// A `stop_token_ids` value that does not deserialize into a list of token
/// IDs is ignored and yields `None`.
fn get_stop_token_ids(&self) -> Option<Vec<crate::types::TokenIdType>> {
    self.inner
        .stop
        .as_ref()
        .and_then(|stop| stop.token_ids())
        .or_else(|| {
            // Compatibility alias: only consulted when `stop` carries no token IDs.
            self.unsupported_fields
                .get("stop_token_ids")
                .and_then(|value| {
                    serde_json::from_value::<Vec<crate::types::TokenIdType>>(value.clone()).ok()
                })
        })
}

/// Returns a reference to the optional `NvExt` extension, if available.
Expand Down Expand Up @@ -353,6 +371,15 @@ impl ValidateRequest for NvCreateChatCompletionRequest {
// validate::validate_max_tokens(self.inner.max_tokens)?; // warning deprecated field
validate::validate_max_completion_tokens(self.inner.max_completion_tokens)?;
validate::validate_n(self.inner.n)?;
if self.inner.n.unwrap_or(1) > 1
&& self
.nvext
.as_ref()
.and_then(|nvext| nvext.extra_fields.as_ref())
.is_some_and(|fields| fields.iter().any(|field| field == "completion_token_ids"))
{
anyhow::bail!("`nvext.extra_fields=[\"completion_token_ids\"]` requires `n <= 1`");
}
// none for modalities
// none for prediction
// none for audio
Expand Down Expand Up @@ -504,14 +531,81 @@ mod tests {
serde_json::from_value(scalar_token_id_stop);
assert!(result.is_err());

let unsupported_stop_token_ids = json!({
let passthrough_stop_token_ids = json!({
"model": "test-model",
"messages": [{"role": "user", "content": "Hello"}],
"stop_token_ids": [576]
});
let request: NvCreateChatCompletionRequest =
serde_json::from_value(unsupported_stop_token_ids)
serde_json::from_value(passthrough_stop_token_ids)
.expect("Failed to deserialize request");
assert!(ValidateRequest::validate(&request).is_err());
ValidateRequest::validate(&request).expect("stop_token_ids should be accepted");
assert_eq!(request.get_stop_token_ids(), Some(vec![576]));

let stop_conditions = request
.extract_stop_conditions()
.expect("extract stop conditions");
assert_eq!(stop_conditions.stop, None);
assert_eq!(stop_conditions.stop_token_ids, Some(vec![576]));
}

// A root-level `cache_salt` (outside `nvext`) must pass validation and be
// reported by `NvExtProvider::cache_salt`.
#[test]
fn test_cache_salt_accepts_renderer_top_level_shape() {
    let payload = json!({
        "model": "test-model",
        "messages": [{"role": "user", "content": "(token-in mode)"}],
        "nvext": {
            "token_data": [1, 2, 3],
            "extra_fields": ["completion_token_ids"]
        },
        "cache_salt": "ckpt-42"
    });
    let parsed = serde_json::from_value::<NvCreateChatCompletionRequest>(payload)
        .expect("Failed to deserialize request");

    ValidateRequest::validate(&parsed).expect("cache_salt should be accepted");

    let salt = NvExtProvider::cache_salt(&parsed);
    assert_eq!(salt, Some("ckpt-42"));
}

// When both the root-level and the `nvext` salt are supplied, the `nvext`
// value is the one reported.
#[test]
fn test_nvext_cache_salt_takes_precedence() {
    let payload = json!({
        "model": "test-model",
        "messages": [{"role": "user", "content": "(token-in mode)"}],
        "cache_salt": "top-level",
        "nvext": {
            "cache_salt": "nvext-level"
        }
    });
    let parsed = serde_json::from_value::<NvCreateChatCompletionRequest>(payload)
        .expect("Failed to deserialize request");

    let salt = NvExtProvider::cache_salt(&parsed);
    assert_eq!(salt, Some("nvext-level"));
}

// Requesting `completion_token_ids` together with `n = 2` must fail
// validation, and the error must name the offending extra field.
#[test]
fn test_completion_token_ids_rejects_multiple_choices() {
    let payload = json!({
        "model": "test-model",
        "messages": [{"role": "user", "content": "(token-in mode)"}],
        "n": 2,
        "nvext": {
            "extra_fields": ["completion_token_ids"]
        }
    });
    let parsed = serde_json::from_value::<NvCreateChatCompletionRequest>(payload)
        .expect("Failed to deserialize request");

    let failure = ValidateRequest::validate(&parsed).expect_err("n > 1 should be rejected");
    let message = failure.to_string();
    assert!(
        message.contains("completion_token_ids"),
        "unexpected error: {failure}"
    );
}
}
Loading
Loading