Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
4895316
feat: wire r3 v3 routed experts stack
S1ro1 May 13, 2026
721a874
feat: reset routing caches on policy update
S1ro1 May 13, 2026
baa6935
fix: rely on native vllm routed experts
S1ro1 May 13, 2026
18e9a7a
fix: pin routed experts dependencies for ci
S1ro1 May 13, 2026
90d2e3a
fix: update scheduler tests for prefix reset
S1ro1 May 13, 2026
1fea38e
fix: clean routed experts replay integration
S1ro1 May 14, 2026
2c019e1
fix: keep routed experts transport first class
S1ro1 May 14, 2026
803b4ae
fix: keep routed experts on samples
S1ro1 May 14, 2026
9092eca
fix: use upstream vllm nightly wheel
S1ro1 May 14, 2026
f49caec
fix: pin latest routed experts verifiers
S1ro1 May 14, 2026
9438623
Merge branch 'main' into feat/r3-v3-routed-experts
S1ro1 May 14, 2026
61a0388
fix: pin routed experts dependencies
S1ro1 May 14, 2026
094d233
fix: allow routed experts with nixl
S1ro1 May 14, 2026
d6d06b4
style: format nixl patch
S1ro1 May 14, 2026
9317cef
Use raw uint8 routed experts payloads
S1ro1 May 15, 2026
3cb8345
Remove unrelated rlm-swe dependency
S1ro1 May 15, 2026
66a2984
Pin vllm-router 0.1.25 wheel
S1ro1 May 15, 2026
777aae7
Keep verifiers routed experts opaque
S1ro1 May 16, 2026
a74e7f5
Forward renderer thinking preservation config
samsja May 16, 2026
a723ac0
Avoid duplicate routed experts in token responses
samsja May 16, 2026
b9d09bd
[codex] Guard checkpoint disk metrics mkdir (#2523)
samsja May 18, 2026
e2cffa1
Pin verifiers routed experts sidecar
S1ro1 May 19, 2026
0edc0c5
Pin cleaned verifiers routed experts handling
S1ro1 May 19, 2026
4402d7e
Pin rebased verifiers routed experts handling
S1ro1 May 19, 2026
7076bb1
fix: remove unrelated prime-rl changes
S1ro1 May 21, 2026
de71036
Merge remote-tracking branch 'origin/main' into feat/r3-v3-routed-exp…
S1ro1 May 21, 2026
aa1fc36
Merge branch 'main' into feat/r3-v3-routed-experts
S1ro1 May 21, 2026
62cc96b
fix: pack routed experts as typed payloads
S1ro1 May 21, 2026
ae6b8b3
refactor: inline routed experts trajectory packing
S1ro1 May 21, 2026
6de7fd1
fix: restore trajectory tokenization helpers
S1ro1 May 21, 2026
f50ad90
refactor: simplify routed experts packing
S1ro1 May 22, 2026
48fc98c
Merge branch 'main' into feat/r3-v3-routed-experts
S1ro1 May 22, 2026
c13b0b3
Merge branch 'main' into feat/r3-v3-routed-experts
S1ro1 May 22, 2026
97c65a2
chore: pin vllm router wheel
S1ro1 May 22, 2026
17c1508
Merge branch 'main' into feat/r3-v3-routed-experts
S1ro1 May 22, 2026
fbf16de
chore: update renderers and verifiers
S1ro1 May 22, 2026
2630ce9
Pin vLLM PR39568 backport wheel
S1ro1 May 22, 2026
c3ffa15
Pin vLLM revert42434 PR39568 wheel
S1ro1 May 22, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion deps/verifiers
Submodule verifiers updated 134 files
13 changes: 13 additions & 0 deletions packages/prime-rl-configs/src/prime_rl/configs/rl.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,19 @@ def auto_setup_router_replay(self):
)
return self

@model_validator(mode="after")
def validate_router_replay_without_kv_offload(self):
    """Reject configs that combine router replay with KV cache offload.

    Returns:
        ``self`` unchanged when router replay is disabled, when no
        inference section is configured, or when
        ``inference.kv_cache_offload`` is unset.

    Raises:
        ValueError: If ``trainer.enable_router_replay`` is truthy while
            ``inference.kv_cache_offload`` is set.
    """
    # Nothing to validate unless the trainer asked for router replay.
    if not self.trainer.enable_router_replay:
        return self

    inference = self.inference
    if inference is None or inference.kv_cache_offload is None:
        return self

    raise ValueError(
        "Router replay with inference.kv_cache_offload is not supported. "
        "External KV cache hits do not carry routed-expert decisions."
    )

@model_validator(mode="after")
def auto_setup_deployment(self):
if self.deployment.type == "single_node": # single-node
Expand Down
6 changes: 4 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ dependencies = [
"tilelang>=0.1.8",
"flash-linear-attention",
"nvidia-ml-py>=12.575.51",
"pybase64>=1.4.2",
]

[project.scripts]
Expand Down Expand Up @@ -178,6 +179,7 @@ override-dependencies = [
# we want latest vllm, remove next patch
vllm = false
tokenspeed-mla = false
fastokens = false
flash_attn_3 = false
# PrimeIntellect-published on PyPI (trusted publisher)
prime = false
Expand Down Expand Up @@ -230,9 +232,9 @@ dion = { git = "https://github.com/samsja/dion.git", rev = "d891eeb" }
transformers = { git = "https://github.com/huggingface/transformers.git", rev = "c1c3424" }
flash-attn-4 = { git = "https://github.com/Dao-AILab/flash-attention.git", subdirectory = "flash_attn/cute", rev = "96bd151" }
prime-pydantic-config = { workspace = true }
vllm-router = { url = "https://github.com/PrimeIntellect-ai/router/releases/download/v0.1.22/vllm_router-0.1.22-cp38-abi3-manylinux_2_28_x86_64.whl" }
vllm-router = { url = "https://github.com/PrimeIntellect-ai/router/releases/download/v0.1.25/vllm_router-0.1.25-cp38-abi3-manylinux_2_28_x86_64.whl" }
vllm = [
{ url = "https://github.com/vllm-project/vllm/releases/download/v0.21.0/vllm-0.21.0+cu129-cp38-abi3-manylinux_2_34_x86_64.whl", marker = "platform_machine == 'x86_64'" },
{ url = "https://github.com/PrimeIntellect-ai/prime-rl/releases/download/v0.5.0/vllm-0.21.0+cu129.r42434.pr39568.a106aa6-cp38-abi3-manylinux_2_24_x86_64.whl", marker = "platform_machine == 'x86_64'" },
{ url = "https://github.com/vllm-project/vllm/releases/download/v0.21.0/vllm-0.21.0+cu129-cp38-abi3-manylinux_2_34_aarch64.whl", marker = "platform_machine == 'aarch64'" },
]
deep-ep = { url = "https://github.com/PrimeIntellect-ai/prime-rl/releases/download/v0.5.0/deep_ep-1.2.1+29d31c0-cp312-cp312-linux_x86_64.whl" }
Expand Down
45 changes: 45 additions & 0 deletions src/prime_rl/inference/patches.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,51 @@ def transformers_v5_compat():
monkey_patch_dp_engine_core_pause_resume_deadlock()
monkey_patch_vllm_layerwise_reload_alias_buffers()
monkey_patch_vllm_padded_input_scrub()
monkey_patch_return_routed_experts_with_nixl_connector()


def monkey_patch_return_routed_experts_with_nixl_connector():
    """Wrap ``VllmConfig.__post_init__`` so routed-experts capture works with NIXL.

    For configs that request ``enable_return_routed_experts`` on a
    ``NixlConnector`` KV-transfer instance, the flag is switched off while
    the original ``__post_init__`` runs and switched back on afterwards.
    Every other config is passed straight to the original ``__post_init__``.

    The wrapper carries a marker attribute, so calling this function again
    after the patch is installed is a no-op.
    """
    from vllm import envs
    from vllm.config.vllm import VllmConfig
    from vllm.logger import init_logger

    logger = init_logger(__name__)
    original_post_init = VllmConfig.__post_init__

    already_patched = getattr(original_post_init, "_prime_rl_allows_nixl_routed_experts", False)
    if already_patched:
        return

    def _wants_routed_experts_over_nixl(config: VllmConfig) -> bool:
        # True only for a NIXL KV-transfer instance that also asks for
        # routed-experts capture.
        transfer = config.kv_transfer_config
        model = config.model_config
        if model is None or not model.enable_return_routed_experts:
            return False
        if transfer is None or transfer.kv_connector != "NixlConnector":
            return False
        return bool(transfer.is_kv_transfer_instance)

    def _post_init(config: VllmConfig):
        if _wants_routed_experts_over_nixl(config):
            # These two checks are enforced here by hand; presumably the
            # original __post_init__ cannot apply them while the flag is
            # hidden from it below -- TODO confirm against vLLM.
            if config.parallel_config.pipeline_parallel_size > 1:
                raise ValueError("--enable-return-routed-experts is incompatible with pipeline parallelism (PP > 1).")
            if envs.VLLM_USE_V2_MODEL_RUNNER:
                raise ValueError("VLLM_USE_V2_MODEL_RUNNER does not yet support: routed experts capture")

            # vLLM rejects every KV connector when routed experts are
            # requested, but our P/D path uses NIXL and stitches the
            # prefill/decode routed experts together in the router. CPU KV
            # offload is still rejected by prime-rl config validation.
            config.model_config.enable_return_routed_experts = False
            try:
                return original_post_init(config)
            finally:
                # Restore the flag whether or not the original raised.
                config.model_config.enable_return_routed_experts = True

        return original_post_init(config)

    # Marker read by the idempotency guard above on later calls.
    _post_init._prime_rl_allows_nixl_routed_experts = True
    VllmConfig.__post_init__ = _post_init
    logger.warning("Enabled vLLM routed-experts capture with NIXL connector patch.")


def monkey_patch_vllm_layerwise_reload_alias_buffers():
Expand Down
40 changes: 40 additions & 0 deletions src/prime_rl/inference/vllm/routed_experts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from __future__ import annotations

from collections.abc import AsyncIterator
from typing import Any

import numpy as np
import pybase64
from vllm.outputs import RequestOutput


def serialize_routed_experts(routed_experts: Any) -> dict[str, Any] | None:
    """Encode routed-expert ids as a compact base64 payload.

    Validation uses explicit exceptions rather than ``assert`` so that it
    still runs under ``python -O``; without it the ``uint8`` cast below
    would silently wrap out-of-range ids.

    Args:
        routed_experts: ``None``, or anything ``np.asarray`` can turn into
            a 3-D integer array whose values fit in ``uint8``.
            # NOTE(review): the meaning of the three axes is not visible
            # here -- confirm against the vLLM producer.

    Returns:
        ``None`` when ``routed_experts`` is ``None``; otherwise a dict with
        ``"data"`` (base64 text of the contiguous ``uint8`` bytes) and
        ``"shape"`` (the array shape as a list).

    Raises:
        ValueError: If the array is not 3-D or holds values outside
            ``[0, 255]``.
        TypeError: If the array dtype is not an integer type.
    """
    if routed_experts is None:
        return None

    array = np.asarray(routed_experts)
    if array.ndim != 3:
        raise ValueError(f"routed_experts must be a 3-D array, got shape {array.shape}")
    if not np.issubdtype(array.dtype, np.integer):
        raise TypeError(f"routed_experts must have an integer dtype, got {array.dtype}")

    # min()/max() raise on empty arrays, so only range-check when non-empty.
    if array.size:
        lowest = int(array.min())
        highest = int(array.max())
        limit = int(np.iinfo(np.uint8).max)
        if lowest < 0 or highest > limit:
            raise ValueError(
                f"routed_experts values must be within [0, {limit}] to fit uint8, "
                f"got range [{lowest}, {highest}]"
            )

    compact = np.ascontiguousarray(array.astype(np.uint8, copy=False))
    return {
        "data": pybase64.b64encode(memoryview(compact)).decode("ascii"),
        "shape": list(compact.shape),
    }


class RoutedExpertsCapture:
    """Pass-through async iterator that records routed experts per output.

    Iterating an instance re-yields every item of the wrapped generator
    unchanged. Before each item is yielded, every entry in its ``outputs``
    is inspected: if it has a non-``None`` ``routed_experts`` attribute, the
    serialized payload is stored in ``self.routed_experts`` under that
    output's ``index``. A later payload for the same index replaces the
    earlier one.
    """

    def __init__(self, generator: AsyncIterator[RequestOutput]):
        self._generator = generator
        # output index -> most recent serialized routed-experts payload
        self.routed_experts: dict[int, dict[str, Any]] = {}

    async def __aiter__(self):
        async for item in self._generator:
            for completion in item.outputs:
                # Outputs without the attribute are treated like ``None``.
                raw = getattr(completion, "routed_experts", None)
                payload = serialize_routed_experts(raw)
                if payload is None:
                    continue
                self.routed_experts[completion.index] = payload
            yield item
75 changes: 18 additions & 57 deletions src/prime_rl/inference/vllm/serving_tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
header and forwarded to ``engine_client.generate``. The DP-replicated
inference servers prime-RL runs need this to target a specific replica.

2. ``routed_experts`` per-token export — when the engine emits routing
decisions (``enable_return_routed_experts``), surface them on each choice.
This is what the trainer's router-replay path consumes.
2. Compact ``routed_experts`` export — when the engine emits routing
decisions, surface them as base64 raw-byte payloads without requiring a vLLM
source fork.

3. Server-side ``max_tokens`` defaulting — ``ServingTokens`` hands the
client-supplied ``SamplingParams`` to the engine verbatim, and
Expand All @@ -30,13 +30,11 @@

from __future__ import annotations

import base64
from collections.abc import AsyncGenerator
from functools import cached_property
from typing import Any

import numpy as np
from fastapi import Request
from pydantic import Field
from vllm.entrypoints.openai.engine.protocol import ErrorResponse, RequestResponseMetadata
from vllm.entrypoints.serve.disagg.protocol import (
GenerateRequest,
Expand All @@ -48,64 +46,29 @@
from vllm.outputs import RequestOutput
from vllm.sampling_params import RequestOutputKind, SamplingParams

from prime_rl.inference.vllm.routed_experts import RoutedExpertsCapture


class PrimeRlGenerateResponseChoice(GenerateResponseChoice):
routed_experts: dict | None = Field(
default=None,
description=(
"Per-token expert routing decisions (base85-encoded int32 array + shape). "
"Populated only when the engine was launched with "
"``enable_return_routed_experts=True``; otherwise ``None``."
),
)
routed_experts: dict[str, Any] | None = None


class PrimeRlGenerateResponse(GenerateResponse):
choices: list[PrimeRlGenerateResponseChoice]


def encode_routed_experts(arr: np.ndarray) -> dict:
return {
"data": base64.b85encode(arr.tobytes()).decode("ascii"),
"shape": list(arr.shape),
}


class _RoutedExpertsCaptureBase:
"""Wraps the engine result generator and accumulates a
``{output_index: encoded_experts}`` map as outputs stream. Subclasses
implement ``post_process`` to fold the captured map into the response
in whatever shape the endpoint returns (in-place vs rebuilt)."""

def __init__(self, generator: AsyncGenerator[RequestOutput, None]):
self._generator = generator
self.routed_experts: dict[int, dict] = {}

async def __aiter__(self):
async for request_output in self._generator:
for output in request_output.outputs:
if output.routed_experts is not None:
self.routed_experts[output.index] = encode_routed_experts(output.routed_experts)
yield request_output


class _RoutedExpertsCapture(_RoutedExpertsCaptureBase):
"""Generate-endpoint variant: rebuilds the response with
``PrimeRlGenerateResponseChoice`` because upstream's
``GenerateResponseChoice`` isn't ``extra='allow'``, so an attribute
set after construction wouldn't survive serialization."""

class _GenerateRoutedExpertsCapture(RoutedExpertsCapture):
def post_process(self, response: GenerateResponse) -> PrimeRlGenerateResponse:
new_choices = [
choices = [
PrimeRlGenerateResponseChoice(
**choice.model_dump(),
**choice.model_dump(exclude={"routed_experts"}),
routed_experts=self.routed_experts.get(choice.index),
)
for choice in response.choices
]
return PrimeRlGenerateResponse(
request_id=response.request_id,
choices=new_choices,
choices=choices,
prompt_logprobs=response.prompt_logprobs,
kv_transfer_params=response.kv_transfer_params,
)
Expand Down Expand Up @@ -135,7 +98,7 @@ async def _client_set_max_tokens(raw_request: Request | None) -> bool:


class PrimeRlServingTokens(ServingTokens):
"""ServingTokens + DP-rank routing + routed_experts export + max_tokens defaulting."""
"""ServingTokens + DP-rank routing + compact routed experts + max_tokens defaulting."""

@cached_property
def _max_tokens_defaults(self) -> tuple[dict, int | None]:
Expand Down Expand Up @@ -298,15 +261,13 @@ async def serve_tokens_full_generator( # type: ignore[override]
model_name: str,
request_metadata: RequestResponseMetadata,
) -> ErrorResponse | GenerateResponse:
# Wrap the result generator to capture routed_experts as it streams,
# defer the rest to upstream, then post-process the response into our
# PrimeRlGenerateResponse subclass so the encoded experts surface in
# the JSON. Skipping the wrapper when the engine isn't producing routed
# experts keeps us a no-op subclass on the common path.
capture: _RoutedExpertsCapture | None = None
# Capture routed_experts as vLLM streams request outputs, then post-process
# the final response into our GenerateResponse subclass so the encoded
# experts surface in the JSON.
capture: _GenerateRoutedExpertsCapture | None = None
if self.model_config.enable_return_routed_experts:
capture = _RoutedExpertsCapture(result_generator)
result_generator = capture # type: ignore[assignment]
capture = _GenerateRoutedExpertsCapture(result_generator)
result_generator = capture

response = await super().serve_tokens_full_generator(
request, result_generator, request_id, model_name, request_metadata
Expand Down
Loading
Loading