diff --git a/configs/experimental/dynamo/hendrycks_math_qwen4b_aime25.toml b/configs/experimental/dynamo/hendrycks_math_qwen4b_aime25.toml new file mode 100644 index 0000000000..eb62945a1f --- /dev/null +++ b/configs/experimental/dynamo/hendrycks_math_qwen4b_aime25.toml @@ -0,0 +1,63 @@ +max_steps = 500 +seq_len = 2048 +max_async_level = 1 + +[wandb] +project = "hendrycks-math-qwen4b" +group = "dynamo-vllm-qwen4b-aime25-500-bs64" +name = "dynamo-qwen4b-aime25-500-bs64" + +[model] +name = "Qwen/Qwen3-4B-Instruct-2507" + +[orchestrator] +batch_size = 64 +max_inflight_rollouts = 64 +rollouts_per_example = 4 +num_train_workers = 8 +max_off_policy_steps = 8 +use_token_client = false + +[orchestrator.train.sampling] +temperature = 1.0 +repetition_penalty = 1.0 +max_completion_tokens = 1024 +min_tokens = 0 +extra_body = { top_k = -1, min_p = 0.0 } + +[[orchestrator.train.env]] +id = "math-env" +name = "hendrycks-math" +args = { dataset_name = "PrimeIntellect/Hendrycks-Math", dataset_subset = "default", math_verify_max_workers = 128, math_verify_timeout = 60 } + +[orchestrator.buffer] +easy_threshold = 1.0 +hard_threshold = 0.0 + +[orchestrator.eval] +interval = 100 +eval_base_model = false +skip_eval_on_resume = true + +[orchestrator.eval.sampling] +max_completion_tokens = 1024 + +[[orchestrator.eval.env]] +id = "aime2025" +name = "aime2025" +num_examples = 30 +rollouts_per_example = 4 +interval = 100 + +[trainer] + +[inference] +backend = "dynamo" +vllm_extra = { max_num_seqs = 64 } + +[inference.dynamo] +deploy_router = true +router_mode = "round-robin" +discovery_backend = "file" +request_plane = "tcp" +system_port = 8081 diff --git a/packages/prime-rl-configs/src/prime_rl/configs/inference.py b/packages/prime-rl-configs/src/prime_rl/configs/inference.py index 805481de76..b638aa3867 100644 --- a/packages/prime-rl-configs/src/prime_rl/configs/inference.py +++ b/packages/prime-rl-configs/src/prime_rl/configs/inference.py @@ -10,6 +10,17 @@ # TODO: Set thinking/ solution budget +InferenceBackend: TypeAlias = Literal["vllm", "dynamo"] +DynamoRouterMode: TypeAlias = Literal[ + "round-robin", + "random", + "kv", + "direct", + "power-of-two", + "least-loaded", + "device-aware-weighted", +] + class ServerConfig(BaseConfig): """Configures the inference server.""" @@ -120,6 +131,78 @@ class WeightBroadcastConfig(BaseConfig): ) +class DynamoConfig(BaseConfig): + """Configures the experimental Dynamo backend.""" + + deploy_router: Annotated[ + bool, + Field( + description=( + "Launch Dynamo's frontend/router from the inference entrypoint. Disable only when an " + "external Dynamo frontend/router is already deployed and orchestrator.client.base_url points at it." + ), + ), + ] = True + + namespace: Annotated[ + str, + Field(description="Dynamo namespace used by the frontend and worker for service discovery."), + ] = "dynamo" + + discovery_backend: Annotated[ + Literal["file", "etcd", "kubernetes", "mem"], + Field(description="Dynamo discovery backend. Use 'file' for local single-node runs without etcd/NATS."), + ] = "file" + + request_plane: Annotated[ + Literal["tcp", "nats", "http"], + Field(description="Dynamo request plane used between the frontend and worker."), + ] = "tcp" + + router_mode: Annotated[ + DynamoRouterMode, + Field(description="Dynamo frontend router mode used for OpenAI chat-completions traffic."), + ] = "round-robin" + + min_initial_workers: Annotated[ + int | None, + Field( + ge=0, + description="Optional Dynamo frontend minimum worker count before the router starts serving requests.", + ), + ] = None + + event_plane: Annotated[ + Literal["nats", "zmq"] | None, + Field(description="Dynamo event plane. If None, Dynamo derives it from discovery_backend."), + ] = None + + system_port: Annotated[ + int, + Field(ge=1, le=65535, description="Dynamo worker system-server port for /health and /engine routes."), + ] = 8081 + + use_vllm_tokenizer: Annotated[ + bool, + Field( + description=( + "Use Dynamo's text-in/text-out vLLM worker path. prime-rl defaults this off so the " + "Dynamo frontend tokenizes OpenAI requests before dispatching them to the worker." + ), + ), + ] = False + + frontend_extra: Annotated[ + dict[str, Any], + Field(description="Extra CLI arguments for python -m dynamo.frontend."), + ] = {} + + worker_extra: Annotated[ + dict[str, Any], + Field(description="Extra CLI arguments for the Dynamo vLLM worker."), + ] = {} + + class KVCacheOffloadConfig(BaseModel): """CPU KV cache offloading for vLLM inference workers.""" @@ -252,6 +335,16 @@ class InferenceExperimentalConfig(BaseConfig): class InferenceConfig(BaseConfig): """Configures inference.""" + backend: Annotated[ + InferenceBackend, + Field( + description=( + "Inference server backend to launch. vLLM is the default; Dynamo is an experimental " + "backend for OpenAI-compatible rollouts with prime-rl weight updates." + ) + ), + ] = "vllm" + # The server configuration server: ServerConfig = ServerConfig() @@ -384,6 +477,11 @@ class InferenceConfig(BaseConfig): WeightBroadcastConfig() ) + dynamo: Annotated[ + DynamoConfig, + Field(description="Experimental Dynamo-specific settings used when backend='dynamo'."), + ] = DynamoConfig() + kv_cache_offload: Annotated[ KVCacheOffloadConfig | None, Field( @@ -447,6 +545,17 @@ def validate_multi_node_requires_slurm(self): raise ValueError("Must use SLURM for multi-node deployment.") return self + @model_validator(mode="after") + def validate_backend_support(self): + if self.backend != "dynamo": + return self + + if self.deployment.type != "single_node": + raise ValueError("inference.backend='dynamo' currently supports only single_node deployment.") + if self.kv_cache_offload is not None: + raise ValueError("inference.backend='dynamo' does not support prime-rl kv_cache_offload plumbing yet.") + return self + @model_validator(mode="after") def auto_setup_kv_cache_offload(self): if self.kv_cache_offload is not None: @@ -603,3 +712,110 @@ def to_vllm(self) -> Namespace: delattr(namespace, "rope_scaling") return namespace + + def to_dynamo_frontend(self) -> Namespace: + """Convert InferenceConfig to Dynamo frontend CLI namespace.""" + namespace = Namespace() + to_frontend = { + "server.host": "http_host", + "server.port": "http_port", + "dynamo.namespace": "namespace", + "dynamo.discovery_backend": "discovery_backend", + "dynamo.request_plane": "request_plane", + "dynamo.router_mode": "router_mode", + "dynamo.min_initial_workers": "min_initial_workers", + "dynamo.event_plane": "event_plane", + "model.name": "model_name", + } + + for config_key, frontend_key in to_frontend.items(): + value = rgetattr(self, config_key.replace("-", "_")) + rsetattr(namespace, frontend_key, value) + + namespace.dyn_chat_processor = "vllm" + + for key, value in self.dynamo.frontend_extra.items(): + setattr(namespace, key, value) + + for optional_key in ("http_host", "event_plane", "model_name", "min_initial_workers"): + value = getattr(namespace, optional_key, None) + if value is None: + delattr(namespace, optional_key) + + return namespace + + def to_dynamo_vllm(self) -> Namespace: + """Convert InferenceConfig to Dynamo vLLM worker CLI namespace.""" + namespace = Namespace() + to_dynamo_vllm = { + "dynamo.namespace": "namespace", + "dynamo.discovery_backend": "discovery_backend", + "dynamo.request_plane": "request_plane", + "dynamo.event_plane": "event_plane", + "dynamo.use_vllm_tokenizer": "use_vllm_tokenizer", + "model.name": "model", + "model.dtype": "dtype", + "model.max_model_len": "max_model_len", + "model.enforce_eager": "enforce_eager", + "model.trust_remote_code": "trust_remote_code", + "model.rope_scaling": "rope_scaling", + "parallel.tp": "tensor_parallel_size", + "parallel.dp": "data_parallel_size", + "data_parallel_size_local": "data_parallel_size_local", + "data_parallel_rpc_port": "data_parallel_rpc_port", + "enable_lora": "enable_lora", + "enable_prefix_caching": "enable_prefix_caching", + "max_loras": "max_loras", + "max_cpu_loras": "max_cpu_loras", + "max_lora_rank": "max_lora_rank", + "lora_target_modules": "lora_target_modules", + "gpu_memory_utilization": "gpu_memory_utilization", + "enable_return_routed_experts": "enable_return_routed_experts", + "enable_expert_parallel": "enable_expert_parallel", + "all2all_backend": "all2all_backend", + "enable_eplb": "enable_eplb", + "enable_dbo": "enable_dbo", + "seed": "seed", + } + + for config_key, dynamo_key in to_dynamo_vllm.items(): + value = rgetattr(self, config_key.replace("-", "_")) + rsetattr(namespace, dynamo_key, value) + + namespace.served_model_name = self.model.name + + if self.model.tool_call_parser not in (None, "auto"): + namespace.dyn_tool_call_parser = self.model.tool_call_parser + if self.model.reasoning_parser is not None: + namespace.dyn_reasoning_parser = self.model.reasoning_parser + + namespace.logprobs_mode = "processed_logprobs" + + if self.enable_fp32_lm_head: + existing = getattr(namespace, "additional_config", None) or {} + existing["fp32_lm_head"] = True + rsetattr(namespace, "additional_config", existing) + + for key, value in self.vllm_extra.items(): + setattr(namespace, key, value) + for key, value in self.dynamo.worker_extra.items(): + setattr(namespace, key, value) + + for optional_key in ( + "event_plane", + "rope_scaling", + "data_parallel_size_local", + "data_parallel_rpc_port", + "enable_prefix_caching", + "max_lora_rank", + "lora_target_modules", + "additional_config", + ): + if not hasattr(namespace, optional_key): + continue + + value = getattr(namespace, optional_key) + if value is None: + delattr(namespace, optional_key) + + return namespace diff --git a/packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py b/packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py index 3911816227..a5753af8c6 100644 --- a/packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py +++ b/packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py @@ -1339,5 +1339,6 @@ def resolve_env_config(self): if is_vllm: env.sampling.extra_body.setdefault("top_k", -1) env.sampling.extra_body.setdefault("min_p", 0.0) - env.sampling.extra_body.setdefault("return_token_ids", True) + if self.use_token_client: + env.sampling.extra_body.setdefault("return_token_ids", True) return self diff --git a/packages/prime-rl-configs/src/prime_rl/configs/rl.py b/packages/prime-rl-configs/src/prime_rl/configs/rl.py index a160af2c9f..e403dd4c8d 100644 --- a/packages/prime-rl-configs/src/prime_rl/configs/rl.py +++ b/packages/prime-rl-configs/src/prime_rl/configs/rl.py @@ -681,6 +681,35 @@ def auto_setup_weight_broadcast(self): return self + @model_validator(mode="after") + def auto_setup_dynamo_backend(self): + """Configure orchestrator compatibility for the experimental Dynamo backend.""" + if self.inference is None or self.inference.backend != "dynamo": + return self + + self.orchestrator.client.admin_backend = "dynamo" + if self.orchestrator.client.admin_base_url is None: + self.orchestrator.client.admin_base_url = [f"http://localhost:{self.inference.dynamo.system_port}"] + + if self.orchestrator.use_renderer: + raise ValueError( + "inference.backend='dynamo' does not support orchestrator.use_renderer because prime-rl's " + "renderer client targets the vLLM-only /v1/generate endpoint." + ) + + if self.orchestrator.use_token_client: + if "use_token_client" in self.orchestrator.model_fields_set: + raise ValueError( + "inference.backend='dynamo' does not support orchestrator.use_token_client because Dynamo " + "does not expose prime-rl's vLLM-only /v1/chat/completions/tokens endpoint." + ) + self.orchestrator.use_token_client = False + + for env in self.orchestrator.train.env: + env.sampling.extra_body.pop("return_token_ids", None) + + return self + @model_validator(mode="after") def validate_eplb_requires_quantized_weight_transfer(self): if self.inference is None or not self.inference.enable_eplb: diff --git a/packages/prime-rl-configs/src/prime_rl/configs/shared.py b/packages/prime-rl-configs/src/prime_rl/configs/shared.py index d26c33d9a9..f999d1bf54 100644 --- a/packages/prime-rl-configs/src/prime_rl/configs/shared.py +++ b/packages/prime-rl-configs/src/prime_rl/configs/shared.py @@ -79,6 +79,7 @@ def resolve_project_dir(self): ServerType = Literal["vllm", "openai"] +AdminBackend = Literal["vllm", "dynamo"] class VLMConfig(BaseConfig): @@ -310,6 +311,11 @@ class ClientConfig(BaseConfig): ), ] = None + admin_backend: Annotated[ + AdminBackend, + Field(description="Backend API exposed by admin_base_url/base_url for weight updates and health checks."), + ] = "vllm" + elastic: Annotated[ ElasticConfig | None, Field( diff --git a/pyproject.toml b/pyproject.toml index e130ba0f9c..a6a162d64e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,6 +86,10 @@ gpt-oss = [ quack = [ "quack-kernels>=0.3.3", ] +dynamo = [ + "ai-dynamo==1.1.1", + "blake3>=1.0.0,<2.0.0", +] all = [ "prime-rl[flash-attn]", "prime-rl[disagg]", @@ -129,6 +133,8 @@ override-dependencies = [ [tool.uv.exclude-newer-package] # we want latest vllm, remove next patch vllm = false +ai-dynamo = false +ai-dynamo-runtime = false flash_attn_3 = false # Self-vendored packages on our primeintellect index reverse-text = false diff --git a/skills/config/SKILL.md b/skills/config/SKILL.md index e8dc13216c..b9afa50528 100644 --- a/skills/config/SKILL.md +++ b/skills/config/SKILL.md @@ -132,6 +132,25 @@ On the CLI, pass as a JSON string: uv run inference --vllm-extra '{"key1": "value1", "key2": 123}' ``` +### Experimental Dynamo backend + +Dynamo support is intentionally isolated behind `inference.backend = "dynamo"` and requires installing the optional Dynamo extra. The launcher starts Dynamo's frontend/router and a Dynamo vLLM worker from `prime_rl.experimental.dynamo`: + +```toml +[inference] +backend = "dynamo" + +[inference.dynamo] +deploy_router = true +router_mode = "round-robin" +system_port = 8081 +discovery_backend = "file" +``` + +The public rollout endpoint is Dynamo's OpenAI-compatible frontend on `inference.server.port`. Admin calls for health, NCCL setup, and weight updates use the worker system server on `inference.dynamo.system_port`. The RL config switches rollout training to the standard OpenAI chat-completions client and points admin weight updates at the Dynamo worker system server. + +Set `inference.dynamo.deploy_router = false` only when an external Dynamo frontend/router is already deployed and `orchestrator.client.base_url` points at it. With `discovery_backend = "file"`, also set `DYN_FILE_KV` to a discovery directory shared by the external router and worker. + ### Discriminated unions Some config fields use discriminated unions (e.g. loss type, data type). Set the `type` field to select the variant: diff --git a/skills/entrypoints/SKILL.md b/skills/entrypoints/SKILL.md index f6589aca97..deb38dd2e2 100644 --- a/skills/entrypoints/SKILL.md +++ b/skills/entrypoints/SKILL.md @@ -39,7 +39,7 @@ The entrypoint launches torchrun internally — no need to call torchrun directl ## `inference` — Standalone inference server -Launches a vLLM-based inference server with OpenAI-compatible API. +Launches a vLLM-based inference server with OpenAI-compatible API. The experimental Dynamo backend is available with `inference.backend = "dynamo"` after installing the optional Dynamo extra. ```bash uv run inference @ configs/debug/infer.toml @@ -48,10 +48,20 @@ uv run inference --model.name Qwen/Qwen3-0.6B --model.enforce-eager Always use the `inference` entrypoint — never `vllm serve` directly. +For Dynamo: + +```bash +uv run --extra dynamo inference @ configs/debug/infer.toml --backend dynamo +``` + +Dynamo code lives under `prime_rl.experimental.dynamo`; the launcher exposes Dynamo's OpenAI frontend/router on `inference.server.port` and uses `inference.dynamo.system_port` for weight-update/admin routes on the worker. `inference.dynamo.deploy_router` defaults to true for local `rl` testing; set it false only when `orchestrator.client.base_url` points at an externally deployed Dynamo router. + Custom endpoints beyond standard OpenAI API: - `/v1/chat/completions/tokens` — accepts token IDs as prompt input - `/update_weights` — hot-reload model weights from the trainer - `/load_lora_adapter` — load LoRA adapters at runtime + +For Dynamo, generation should go through Dynamo's standard `/v1/chat/completions` frontend route. prime-rl adds only worker admin routes under `/engine/*` for liveness, NCCL broadcaster setup, and weight updates. - `/init_broadcaster` — initialize weight broadcast for distributed training Check health with: diff --git a/src/prime_rl/entrypoints/inference.py b/src/prime_rl/entrypoints/inference.py index a14ae26e63..576294f9ea 100644 --- a/src/prime_rl/entrypoints/inference.py +++ b/src/prime_rl/entrypoints/inference.py @@ -131,9 +131,16 @@ def inference_local(config: InferenceConfig): setup_vllm_env(config) - from prime_rl.inference.vllm.server import server # pyright: ignore + if config.backend == "vllm": + from prime_rl.inference.vllm.server import server # pyright: ignore - server(config, vllm_extra=config.vllm_extra) + server(config, vllm_extra=config.vllm_extra) + elif config.backend == "dynamo": + from prime_rl.experimental.dynamo.server import server + + server(config) + else: + raise ValueError(f"Unsupported inference backend: {config.backend}") def inference(config: InferenceConfig): diff --git a/src/prime_rl/entrypoints/rl.py b/src/prime_rl/entrypoints/rl.py index b740f58ba2..4622640630 100644 --- a/src/prime_rl/entrypoints/rl.py +++ b/src/prime_rl/entrypoints/rl.py @@ -153,7 +153,12 @@ def rl_local(config: RLConfig): check_gpus_available(all_gpu_ids) # Validate client port matches inference server port - if config.inference is not None and not config.orchestrator.client.is_elastic: + local_router_deployed = not ( + config.inference is not None + and config.inference.backend == "dynamo" + and not config.inference.dynamo.deploy_router + ) + if config.inference is not None and not config.orchestrator.client.is_elastic and local_router_deployed: from urllib.parse import urlparse base_url = config.orchestrator.client.base_url[0] diff --git a/src/prime_rl/experimental/__init__.py b/src/prime_rl/experimental/__init__.py new file mode 100644 index 0000000000..ffcf35181b --- /dev/null +++ b/src/prime_rl/experimental/__init__.py @@ -0,0 +1 @@ +"""Experimental integrations for prime-rl.""" diff --git a/src/prime_rl/experimental/dynamo/__init__.py b/src/prime_rl/experimental/dynamo/__init__.py new file mode 100644 index 0000000000..32368aab91 --- /dev/null +++ b/src/prime_rl/experimental/dynamo/__init__.py @@ -0,0 +1 @@ +"""Dynamo inference backend integration.""" diff --git a/src/prime_rl/experimental/dynamo/server.py b/src/prime_rl/experimental/dynamo/server.py new file mode 100644 index 0000000000..19e9af0da8 --- /dev/null +++ b/src/prime_rl/experimental/dynamo/server.py @@ -0,0 +1,152 @@ +from __future__ import annotations + +import json +import os +import signal +import subprocess +import sys +import tempfile +import time +from argparse import Namespace +from pathlib import Path +from typing import Any + +from prime_rl.configs.inference import InferenceConfig +from prime_rl.inference.vllm.server import WORKER_EXTENSION_CLS +from prime_rl.utils.logger import get_logger +from prime_rl.utils.nccl import disable_nccl_p2p_if_unavailable + + +def _format_cli_value(value: Any) -> str: + if isinstance(value, Path): + return value.as_posix() + if isinstance(value, (dict, list, tuple)): + return json.dumps(value) + return str(value) + + +def _namespace_to_cli_args(namespace: Namespace) -> list[str]: + args: list[str] = [] + for key, value in vars(namespace).items(): + if value is None: + continue + + flag = f"--{key.replace('_', '-')}" + if value is True: + args.append(flag) + elif value is False: + continue + else: + args.extend([flag, _format_cli_value(value)]) + return args + + +def _terminate(processes: list[subprocess.Popen]) -> None: + for process in processes: + if process.poll() is None: + process.terminate() + for process in processes: + try: + process.wait(timeout=30) + except subprocess.TimeoutExpired: + process.kill() + + +def _build_worker_namespace(config: InferenceConfig) -> Namespace: + namespace = config.to_dynamo_vllm() + namespace.worker_extension_cls = WORKER_EXTENSION_CLS[config.weight_broadcast.type] + return namespace + + +def server(config: InferenceConfig) -> None: + """Launch a Dynamo vLLM worker, optionally with Dynamo's frontend/router.""" + logger = get_logger() + disable_nccl_p2p_if_unavailable() + + worker_command = [ + sys.executable, + "-m", + "prime_rl.experimental.dynamo.worker", + *_namespace_to_cli_args(_build_worker_namespace(config)), + ] + + base_env = os.environ.copy() + base_env.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn") + if config.enable_lora: + base_env["VLLM_ALLOW_RUNTIME_LORA_UPDATING"] = "True" + + discovery_dir = None + if ( + not config.dynamo.deploy_router + and config.dynamo.discovery_backend == "file" + and "DYN_FILE_KV" not in base_env + ): + raise ValueError( + "inference.dynamo.deploy_router=false with discovery_backend='file' requires DYN_FILE_KV to point " + "at a file discovery directory shared with the external Dynamo frontend/router." + ) + if not config.dynamo.deploy_router and config.dynamo.discovery_backend == "mem": + raise ValueError( + "inference.dynamo.deploy_router=false cannot use discovery_backend='mem' because the external " + "Dynamo frontend/router cannot share in-process discovery state with this worker." + ) + if config.dynamo.deploy_router and config.dynamo.discovery_backend == "file" and "DYN_FILE_KV" not in base_env: + discovery_dir = tempfile.TemporaryDirectory(prefix="prime_rl_dynamo_") + base_env["DYN_FILE_KV"] = discovery_dir.name + if "DYN_EVENT_PLANE" not in base_env: + if config.dynamo.event_plane is not None: + base_env["DYN_EVENT_PLANE"] = config.dynamo.event_plane + elif config.dynamo.discovery_backend in ("file", "mem"): + base_env["DYN_EVENT_PLANE"] = "zmq" + + frontend_env = base_env.copy() + frontend_env.pop("DYN_SYSTEM_PORT", None) + + worker_env = base_env.copy() + worker_env["DYN_SYSTEM_PORT"] = str(config.dynamo.system_port) + + process_specs: list[tuple[str, list[str], dict[str, str]]] = [] + if config.dynamo.deploy_router: + frontend_namespace = config.to_dynamo_frontend() + frontend_command = [ + sys.executable, + "-m", + "dynamo.frontend", + *_namespace_to_cli_args(frontend_namespace), + ] + process_specs.append(("Dynamo frontend/router", frontend_command, frontend_env)) + else: + logger.info("Skipping Dynamo frontend/router launch; using external router deployment.") + + process_specs.append(("Dynamo vLLM worker", worker_command, worker_env)) + + for name, command, _env in process_specs: + logger.info(f"Starting {name}: {' '.join(command)}") + + processes: list[subprocess.Popen] = [] + + def handle_signal(signum, _frame): + logger.warning(f"Received signal {signum}, terminating Dynamo processes") + _terminate(processes) + raise SystemExit(128 + signum) + + previous_sigterm = signal.signal(signal.SIGTERM, handle_signal) + previous_sigint = signal.signal(signal.SIGINT, handle_signal) + + try: + for _name, command, env in process_specs: + processes.append(subprocess.Popen(command, env=env)) + + while True: + for process in processes: + return_code = process.poll() + if return_code is not None: + _terminate(processes) + raise SystemExit(return_code) + time.sleep(1) + finally: + signal.signal(signal.SIGTERM, previous_sigterm) + signal.signal(signal.SIGINT, previous_sigint) + _terminate(processes) + if discovery_dir is not None: + discovery_dir.cleanup() diff --git a/src/prime_rl/experimental/dynamo/worker.py b/src/prime_rl/experimental/dynamo/worker.py new file mode 100644 index 0000000000..0b668f68af --- /dev/null +++ b/src/prime_rl/experimental/dynamo/worker.py @@ -0,0 +1,102 @@ +"""Dynamo vLLM worker entrypoint with prime-rl admin routes.""" + +from __future__ import annotations + +from typing import Any, Callable + +from prime_rl.utils.logger import get_logger + +logger = get_logger() + + +async def _pause_generation(handler, body: dict[str, Any]) -> dict[str, str]: + mode = body.get("mode", "keep") + clear_cache = bool(body.get("clear_cache", False)) + await handler.engine_client.pause_generation(mode=mode, clear_cache=clear_cache) + return {"status": "ok"} + + +async def _resume_generation(handler, _body: dict[str, Any]) -> dict[str, str]: + await handler.engine_client.resume_generation() + return {"status": "ok"} + + +async def _update_weights(handler, body: dict[str, Any]) -> dict[str, str]: + weight_dir = body.get("weight_dir") + await handler.engine_client.pause_generation(mode="keep", clear_cache=False) + try: + await handler.engine_client.collective_rpc("update_weights_from_path", args=(weight_dir,)) + reset_prefix_cache = bool(body.get("reset_prefix_cache", True)) + if reset_prefix_cache and hasattr(handler.engine_client, "reset_prefix_cache"): + await handler.engine_client.reset_prefix_cache() + finally: + await handler.engine_client.resume_generation() + return {"status": "ok"} + + +async def _init_broadcaster(handler, body: dict[str, Any]) -> dict[str, str]: + await handler.engine_client.collective_rpc( + "init_broadcaster", + args=( + body.get("host"), + body.get("port"), + body.get("rank_offset"), + body.get("inference_world_size"), + body.get("timeout"), + body.get("quantize_in_weight_transfer", False), + ), + ) + return {"status": "ok"} + + +async def _liveness_probe(handler, _body: dict[str, Any]) -> dict[str, str]: + await handler.engine_client.collective_rpc("liveness_probe") + return {"status": "ok"} + + +def _bind(handler, callback: Callable[[Any, dict[str, Any]], Any]): + async def route(body: dict[str, Any] | None = None): + return await callback(handler, body or {}) + + return route + + +def patch_dynamo_vllm_worker() -> None: + """Register prime-rl admin routes on Dynamo's worker system server.""" + from dynamo.vllm.handlers import BaseWorkerHandler + + if getattr(BaseWorkerHandler, "_prime_rl_admin_routes_patched", False): + return + + original_init = BaseWorkerHandler.__init__ + + def patched_init(self, runtime, *args, **kwargs) -> None: + original_init(self, runtime, *args, **kwargs) + if getattr(self, "_prime_rl_admin_routes_registered", False): + return + runtime.register_engine_route("pause", _bind(self, _pause_generation)) + runtime.register_engine_route("resume", _bind(self, _resume_generation)) + runtime.register_engine_route("update_weights", _bind(self, _update_weights)) + runtime.register_engine_route("init_broadcaster", _bind(self, _init_broadcaster)) + runtime.register_engine_route("liveness", _bind(self, _liveness_probe)) + self._prime_rl_admin_routes_registered = True + logger.info( + "Registered prime-rl Dynamo admin routes: " + "/engine/pause, /engine/resume, /engine/update_weights, /engine/init_broadcaster, " + "/engine/liveness" + ) + + BaseWorkerHandler.__init__ = patched_init + BaseWorkerHandler._prime_rl_admin_routes_patched = True + + +def main() -> None: + patch_dynamo_vllm_worker() + + from dynamo.vllm.main import main as dynamo_vllm_main + + dynamo_vllm_main() + + +if __name__ == "__main__": + main() diff --git a/src/prime_rl/inference/server.py b/src/prime_rl/inference/server.py index 322f0145eb..9558a006b1 100644 --- a/src/prime_rl/inference/server.py +++ b/src/prime_rl/inference/server.py @@ -19,9 +19,16 @@ def main(): setup_vllm_env(config) # We import here to be able to set environment variables before importing vLLM - from prime_rl.inference.vllm.server import server # pyright: ignore + if config.backend == "vllm": + from prime_rl.inference.vllm.server import server # pyright: ignore - server(config, vllm_extra=config.vllm_extra) + server(config, vllm_extra=config.vllm_extra) + elif config.backend == "dynamo": + from prime_rl.experimental.dynamo.server import server + + server(config) + else: + raise ValueError(f"Unsupported inference backend: {config.backend}") if __name__ == "__main__": diff --git a/src/prime_rl/orchestrator/envs.py b/src/prime_rl/orchestrator/envs.py index 1de52994a9..c5d9047e0a 100644 --- a/src/prime_rl/orchestrator/envs.py +++ b/src/prime_rl/orchestrator/envs.py @@ -104,10 +104,14 @@ def _spawn( self._env_server_process = process return address - def _sampling_args_with_salt(self, cache_salt: str) -> dict: + def _sampling_args_with_salt(self, cache_salt: str | None) -> dict: sampling_args = {**self.sampling_args} - extra_body = {**sampling_args.get("extra_body", {}), "cache_salt": cache_salt} - sampling_args["extra_body"] = extra_body + if cache_salt is not None: + extra_body = {**sampling_args.get("extra_body", {}), "cache_salt": cache_salt} + sampling_args["extra_body"] = extra_body + else: + sampling_args.pop("logprobs", None) + sampling_args.pop("top_logprobs", None) return sampling_args async def run_rollout( @@ -115,7 +119,7 @@ async def run_rollout( client: vf.ClientConfig, example: dict, model_name: str, - cache_salt: str, + cache_salt: str | None, ) -> vf.RolloutOutput: """Run a single rollout for an example.""" return await self.env.run_rollout( @@ -134,7 +138,7 @@ async def run_group( example: dict, model_name: str, rollouts_per_example: int, - cache_salt: str, + cache_salt: str | None, ) -> list[vf.RolloutOutput]: """Run a group of rollouts for an example. Required for group-scoring envs.""" return await self.env.run_group( @@ -179,7 +183,7 @@ async def evaluate( get_client: Callable[[], Awaitable[vf.ClientConfig]], ckpt_step: int, step: int, - cache_salt: str, + cache_salt: str | None, ) -> list[vf.RolloutOutput]: num_examples = len(self.examples) rollouts_per_example = self.config.rollouts_per_example diff --git a/src/prime_rl/orchestrator/orchestrator.py b/src/prime_rl/orchestrator/orchestrator.py index bc1128ebc7..689b691288 100644 --- a/src/prime_rl/orchestrator/orchestrator.py +++ b/src/prime_rl/orchestrator/orchestrator.py @@ -394,7 +394,7 @@ async def orchestrate(config: OrchestratorConfig): get_client=inference_pool.get_eval_client, ckpt_step=ckpt_step, step=progress.step, - cache_salt=str(ckpt_step), + cache_salt=str(ckpt_step) if scheduler.supports_cache_salt else None, ) for eval_env in envs_to_eval ] @@ -813,7 +813,7 @@ def compute_solve_rates(df): get_client=inference_pool.get_eval_client, ckpt_step=ckpt_step, step=progress.step, - cache_salt=str(ckpt_step), + cache_salt=str(ckpt_step) if scheduler.supports_cache_salt else None, ) for eval_env in eval_envs ] diff --git a/src/prime_rl/orchestrator/scheduler.py b/src/prime_rl/orchestrator/scheduler.py index a8427b69f7..7e02ccd147 100644 --- a/src/prime_rl/orchestrator/scheduler.py +++ b/src/prime_rl/orchestrator/scheduler.py @@ -88,6 +88,7 @@ def __init__( self.lora_name = lora_name self.model_name = self.config.model.name self.json_logging = config.log.json_logging + self.supports_cache_salt = config.client.admin_backend != "dynamo" # Inference pool - used for admin operations (adapter sync) and metrics self.inference_pool = inference_pool @@ -199,7 +200,7 @@ async def schedule_rollout(self, group_id: int): env_name = group.example["env_name"] env = self.train_envs.get(env_name) - cache_salt = str(self.ckpt_step) + cache_salt = str(self.ckpt_step) if self.supports_cache_salt else None if env.requires_group_scoring: rollout_count = group.rollouts_to_schedule group.rollouts_to_schedule = 0 diff --git a/src/prime_rl/utils/client.py b/src/prime_rl/utils/client.py index 21659dfc46..201117a429 100644 --- a/src/prime_rl/utils/client.py +++ b/src/prime_rl/utils/client.py @@ -15,6 +15,8 @@ from prime_rl.configs.shared import ClientConfig from prime_rl.utils.logger import get_logger +ADMIN_BACKEND_ATTR = "prime_rl_admin_backend" + @runtime_checkable class InferencePool(Protocol): @@ -81,6 +83,11 @@ def __init__( ) self._eval_clients = setup_clients(client_config, client_type=eval_client_type) self._admin_clients = setup_admin_clients(client_config) + self._model_clients = ( + setup_admin_clients(client_config, use_admin_base_url=False) + if client_config.admin_base_url + else self._admin_clients + ) self._skip_model_check = client_config.skip_model_check self._wait_for_ready_timeout = client_config.wait_for_ready_timeout self._eval_cycle = cycle(self._eval_clients) @@ -105,10 +112,14 @@ async def get_eval_client(self) -> vf.ClientConfig: return next(self._eval_cycle) async def wait_for_ready(self, model_name: str, timeout: int | None = None) -> None: - await check_health( - self._admin_clients, timeout=timeout if timeout is not None else self._wait_for_ready_timeout + wait_timeout = timeout if timeout is not None else self._wait_for_ready_timeout + await check_health(self._admin_clients, timeout=wait_timeout) + await maybe_check_has_model( + self._model_clients, + model_name, + skip_model_check=self._skip_model_check, + timeout=wait_timeout, ) - await maybe_check_has_model(self._admin_clients, model_name, skip_model_check=self._skip_model_check) async def update_weights(self, weight_dir: Path | None, lora_name: str | None = None, step: int = 0) -> None: await update_weights(self._admin_clients, weight_dir, lora_name=lora_name, step=step) @@ -211,14 +222,19 @@ def setup_clients( return clients -def setup_admin_clients(client_config: ClientConfig) -> list[AsyncClient]: +def setup_admin_clients(client_config: ClientConfig, *, use_admin_base_url: bool = True) -> list[AsyncClient]: """Create dedicated admin clients for weight update operations. Uses a separate connection pool to avoid queueing behind streaming requests. - When admin_base_url is set, uses those URLs instead of base_url, allowing - weight updates to bypass routers in disaggregated P/D deployments. + When admin_base_url is set and use_admin_base_url is true, uses those URLs + instead of base_url, allowing weight updates to bypass routers in + disaggregated P/D deployments. """ - urls = client_config.admin_base_url if client_config.admin_base_url else client_config.base_url + urls = ( + client_config.admin_base_url + if use_admin_base_url and client_config.admin_base_url + else client_config.base_url + ) def _setup_admin_client(base_url: str) -> httpx.AsyncClient: headers = client_config.headers.copy() # avoid mutating config @@ -229,29 +245,62 @@ def _setup_admin_client(base_url: str) -> httpx.AsyncClient: # Strip /v1 suffix since admin endpoints are at root level base_url = base_url.rstrip("/").removesuffix("/v1") - return AsyncClient( + client = AsyncClient( base_url=base_url, headers=headers, limits=httpx.Limits(max_connections=4, max_keepalive_connections=1), timeout=httpx.Timeout(None), ) + setattr(client, ADMIN_BACKEND_ATTR, client_config.admin_backend) + return client return [_setup_admin_client(base_url) for base_url in urls] +def get_admin_backend(admin_client: AsyncClient) -> str: + return getattr(admin_client, ADMIN_BACKEND_ATTR, "vllm") + + async def maybe_check_has_model( - admin_clients: list[AsyncClient], model_name: str, skip_model_check: bool = False + admin_clients: list[AsyncClient], + model_name: str, + skip_model_check: bool = False, + interval: int = 1, + log_interval: int = 10, + timeout: int = 1800, ) -> None: if skip_model_check: return logger = get_logger() logger.debug(f"Checking if model {model_name} is in the inference pool") - results = await asyncio.gather(*[admin_client.get("/v1/models") for admin_client in admin_clients]) - for admin_client, result in zip(admin_clients, results): - models = result.json()["data"] - if not any(model["id"] == model_name for model in models): - raise ValueError(f"Model {model_name} was not found in the inference pool on {admin_client.base_url}") - logger.debug(f"Model {model_name} was found in the inference pool") + wait_time = 0 + last_error: Exception | None = None + while wait_time < timeout: + try: + results = await asyncio.gather(*[admin_client.get("/v1/models") for admin_client in admin_clients]) + missing_urls = [] + for admin_client, result in zip(admin_clients, results): + result.raise_for_status() + models = result.json()["data"] + if not any(model["id"] == model_name for model in models): + missing_urls.append(str(admin_client.base_url)) + if not missing_urls: + logger.debug(f"Model {model_name} was found in the inference pool") + return + last_error = ValueError(f"Model {model_name} was not found on {', '.join(missing_urls)}") + except Exception as e: + last_error = e + + if wait_time % log_interval == 0 and wait_time > 0: + logger.warning( + f"Model {model_name} was not found in the inference pool after {wait_time} seconds " + f"(Error: {last_error})" + ) + await asyncio.sleep(interval) + wait_time += interval + + msg = f"Model {model_name} was not found in the inference pool after {wait_time} (>{timeout}) seconds." + raise TimeoutError(msg) from last_error async def check_health( @@ -261,10 +310,18 @@ async def check_health( async def _check_health(admin_client: AsyncClient) -> None: wait_time = 0 - logger.debug("Starting pinging /health to check health") + admin_backend = get_admin_backend(admin_client) + logger.debug("Starting pinging health endpoint to check health") while wait_time < timeout: try: - await admin_client.get("/health") + if admin_backend == "dynamo": + response = await admin_client.post("/engine/liveness", json={}) + else: + response = await admin_client.get("/health") + if response.status_code == 404: + logger.warning("The route /health does not exist. Skipping health check.") + return + response.raise_for_status() logger.debug(f"Inference pool is ready after {wait_time} seconds") return except NotFoundError: @@ -332,15 +389,29 @@ async def update_weights( weight_dir_posix = weight_dir.as_posix() if weight_dir is not None else None if lora_name is not None and weight_dir is not None: + dynamo_clients = [client for client in admin_clients if get_admin_backend(client) == "dynamo"] + if dynamo_clients: + raise ValueError("Dynamo backend does not support prime-rl LoRA adapter weight updates yet.") await load_lora_adapter(admin_clients, lora_name, weight_dir) else: + vllm_clients = [client for client in admin_clients if get_admin_backend(client) == "vllm"] async def _update_weights(admin_client: AsyncClient, weight_dir: str | None) -> None: + if get_admin_backend(admin_client) == "dynamo": + response = await admin_client.post("/engine/update_weights", json={"weight_dir": weight_dir}) + response.raise_for_status() + result = response.json() + if result.get("status") == "error": + raise RuntimeError(result.get("message", "Dynamo weight update failed")) + return + response = await admin_client.post("/update_weights", json={"weight_dir": weight_dir}) response.raise_for_status() - # Pause engines so all DP workers drain in-flight work and can join the NCCL broadcast - await _pause_engines(admin_clients) + # Pause vLLM engines so all DP workers drain in-flight work and can join the NCCL broadcast. + # Dynamo's update route pauses/resumes its engine internally. + if vllm_clients: + await _pause_engines(vllm_clients) try: # Create ready marker before servers enter receive path (used by NCCL broadcast) @@ -352,7 +423,8 @@ async def _update_weights(admin_client: AsyncClient, weight_dir: str | None) -> await asyncio.gather(*[_update_weights(admin_client, weight_dir_posix) for admin_client in admin_clients]) finally: - await _resume_engines(admin_clients) + if vllm_clients: + await _resume_engines(vllm_clients) def _is_retryable_lora_error(exception: BaseException) -> bool: @@ -452,6 +524,24 @@ async def init_nccl_broadcast( ) async def _init_nccl_broadcast(admin_client: AsyncClient, rank_offset: int) -> None: + if get_admin_backend(admin_client) == "dynamo": + response = await admin_client.post( + "/engine/init_broadcaster", + json={ + "host": host, + "port": port, + "rank_offset": rank_offset, + "inference_world_size": inference_world_size, + "timeout": timeout, + "quantize_in_weight_transfer": quantize_in_weight_transfer, + }, + ) + response.raise_for_status() + result = response.json() + if result.get("status") == "error": + raise RuntimeError(result.get("message", "Dynamo NCCL initialization failed")) + return + try: response = await admin_client.post( "/init_broadcaster", diff --git a/tests/unit/test_configs.py b/tests/unit/test_configs.py index ffcc18f270..30efe20cd9 100644 --- a/tests/unit/test_configs.py +++ b/tests/unit/test_configs.py @@ -159,3 +159,50 @@ def test_removed_fused_lm_head_chunk_size_field_is_rejected(): def test_selective_activation_checkpointing_requires_custom_impl(): with pytest.raises(ValidationError, match="Selective activation checkpointing requires model.impl='custom'"): TrainerModelConfig.model_validate({"impl": "hf", "ac": {"mode": "selective"}}) + + +def test_inference_config_translates_dynamo_args(): + config = InferenceConfig.model_validate( + { + "backend": "dynamo", + "server": {"host": "127.0.0.1", "port": 8123}, + "model": {"name": "Qwen/Qwen3-4B", "dtype": "bfloat16", "max_model_len": 2048}, + "parallel": {"tp": 2, "dp": 1}, + "dynamo": { + "system_port": 9001, + "discovery_backend": "file", + "router_mode": "least-loaded", + "min_initial_workers": 1, + "worker_extra": {"block_size": 64}, + }, + "vllm_extra": {"max_num_seqs": 32}, + } + ) + + frontend = config.to_dynamo_frontend() + worker = config.to_dynamo_vllm() + + assert frontend.http_host == "127.0.0.1" + assert frontend.http_port == 8123 + assert frontend.namespace == "dynamo" + assert frontend.discovery_backend == "file" + assert frontend.router_mode == "least-loaded" + assert frontend.min_initial_workers == 1 + assert frontend.dyn_chat_processor == "vllm" + + assert worker.model == "Qwen/Qwen3-4B" + assert worker.dtype == "bfloat16" + assert worker.max_model_len == 2048 + assert worker.tensor_parallel_size == 2 + assert worker.block_size == 64 + assert worker.max_num_seqs == 32 + assert worker.logprobs_mode == "processed_logprobs" + + +def test_rl_config_auto_selects_openai_client_for_dynamo(): + config = RLConfig.model_validate({"trainer": {}, "orchestrator": {}, "inference": {"backend": "dynamo"}}) + + assert config.orchestrator.use_token_client is False + assert config.orchestrator.client.admin_backend == "dynamo" + assert config.orchestrator.client.admin_base_url == ["http://localhost:8081"] + assert config.orchestrator.client.skip_model_check is False diff --git a/uv.lock b/uv.lock index 002a9313d6..b254a36436 100644 --- a/uv.lock +++ b/uv.lock @@ -11,37 +11,39 @@ supported-markers = [ ] [options] -exclude-newer = "2026-05-03T13:45:58.065909043Z" +exclude-newer = "0001-01-01T00:00:00Z" # This has no effect and is included for backwards compatibility when using relative exclude-newer values. exclude-newer-span = "P7D" [options.exclude-newer-package] -vllm = false verifiers = false -vllm-router = false dion = false alphabet-sort = false science-env = false -color-codeword = false -nixl-cu12 = false -flash-attn-3 = false -prime-tunnel = false -prime = false -deep-gemm = false -aime2024 = false prime-evals = false deepdive = false reverse-text = false +ai-dynamo-runtime = false code-env = false mini-swe-agent-plus = false deep-ep = false pydantic-config = false -math-env = false -logic-env = false +ai-dynamo = false wiki-search = false math-python = false math500 = false aime2025 = false +vllm = false +vllm-router = false +color-codeword = false +nixl-cu12 = false +flash-attn-3 = false +prime-tunnel = false +deep-gemm = false +aime2024 = false +math-env = false prime-sandboxes = false +logic-env = false +prime = false [manifest] members = [ @@ -63,6 +65,36 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8f/aa/ba0014cc4659328dc818a28827be78e6d97312ab0cb98105a770924dc11e/absl_py-2.3.1-py3-none-any.whl", hash = "sha256:eeecf07f0c2a93ace0772c92e596ace6d3d3996c042b2128459aaae2a76de11d", size = 135811, upload-time = "2025-07-03T09:31:42.253Z" }, ] +[[package]] +name = "ai-dynamo" +version = "1.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ai-dynamo-runtime", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "kubernetes", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "msgpack", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "msgspec", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "prometheus-client", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "pyzmq", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "transformers", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/65/7d/b99564ff88262e8ba377bb592c17ba84db9ae1b4993cbdeb532484dc034e/ai_dynamo-1.1.1-py3-none-any.whl", hash = "sha256:fd6a811a6d5ef36537a5f32af88dda722c0cf6b7c4f3e31a7671e1e59dc36777", size = 1762603, upload-time = "2026-05-09T02:43:51.382Z" }, +] + +[[package]] +name = "ai-dynamo-runtime" +version = "1.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "uvloop", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/0a/e2/47cc654088fc4b17f53fbf147020ca497afb9fdd3955997d667c285ba1bb/ai_dynamo_runtime-1.1.1-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:2c96c93195c87be5377f42e99151536cde9629bf7e8d5af8574d30e9fbcb8bb3", size = 34753711, upload-time = "2026-05-09T02:41:45.993Z" }, + { url = "https://files.pythonhosted.org/packages/00/68/acfdb9d6b0b9a4d25f991bae781992893b845fcd0981907189fbb27aa81b/ai_dynamo_runtime-1.1.1-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a7378079c81f2d81ca3363345111f04d9292aa158271b23e0236670f501e2b1d", size = 35042939, upload-time = "2026-05-09T02:40:41.554Z" }, +] + [[package]] name = "aime2024" version = "0.1.18" @@ -1137,6 +1169,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/01/61/d4b89fec821f72385526e1b9d9a3a0385dda4a72b206d28049e2c7cd39b8/gitpython-3.1.45-py3-none-any.whl", hash = "sha256:8908cb2e02fb3b93b7eb0f2827125cb699869470432cc885f019b8fd0fccff77", size = 208168, upload-time = "2025-07-24T03:45:52.517Z" }, ] +[[package]] +name = "google-auth" +version = "2.50.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cryptography", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "pyasn1-modules", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5f/18/238d7021d151bdab868f23433817b027dd759135202f4dfce0670d1230ca/google_auth-2.50.0.tar.gz", hash = "sha256:f35eafb191195328e8ce10a7883970877e7aeb49c2bfaa54aa0e394316d353d0", size = 336523, upload-time = "2026-04-30T21:19:29.659Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/37/cf/4880c2137c14280b2f59975cdf12cc442bc0ae1f9ea473a26eaa0c146786/google_auth-2.50.0-py3-none-any.whl", hash = "sha256:04382175e28b94f49694977f0a792688b59a668def1499e9d8de996dc9ce5b15", size = 246495, upload-time = "2026-04-30T21:19:27.664Z" }, +] + [[package]] name = "googleapis-common-protos" version = "1.72.0" @@ -1588,11 +1633,13 @@ wheels = [ [[package]] name = "kubernetes" -version = "35.0.0" +version = "32.0.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "certifi", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "durationpy", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "google-auth", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "oauthlib", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "python-dateutil", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "pyyaml", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "requests", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, @@ -1601,9 +1648,9 @@ dependencies = [ { name = "urllib3", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "websocket-client", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/2c/8f/85bf51ad4150f64e8c665daf0d9dfe9787ae92005efb9a4d1cba592bd79d/kubernetes-35.0.0.tar.gz", hash = "sha256:3d00d344944239821458b9efd484d6df9f011da367ecb155dadf9513f05f09ee", size = 1094642, upload-time = "2026-01-16T01:05:27.76Z" } +sdist = { url = "https://files.pythonhosted.org/packages/b7/e8/0598f0e8b4af37cd9b10d8b87386cf3173cb8045d834ab5f6ec347a758b3/kubernetes-32.0.1.tar.gz", hash = "sha256:42f43d49abd437ada79a79a16bd48a604d3471a117a8347e87db693f2ba0ba28", size = 946691, upload-time = "2025-02-18T21:06:34.148Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/0c/70/05b685ea2dffcb2adbf3cdcea5d8865b7bc66f67249084cf845012a0ff13/kubernetes-35.0.0-py2.py3-none-any.whl", hash = "sha256:39e2b33b46e5834ef6c3985ebfe2047ab39135d41de51ce7641a7ca5b372a13d", size = 2017602, upload-time = "2026-01-16T01:05:25.991Z" }, + { url = "https://files.pythonhosted.org/packages/08/10/9f8af3e6f569685ce3af7faab51c8dd9d93b9c38eba339ca31c746119447/kubernetes-32.0.1-py2.py3-none-any.whl", hash = "sha256:35282ab8493b938b08ab5526c7ce66588232df00ef5e1dbe88a419107dc10998", size = 1988070, upload-time = "2025-02-18T21:06:31.391Z" }, ] [[package]] @@ -2802,6 +2849,10 @@ disagg = [ { name = "nixl-cu12", version = "0.10.1", source = { url = "https://github.com/PrimeIntellect-ai/prime-rl/releases/download/v0.5.0/nixl_cu12-0.10.1-cp312-cp312-linux_x86_64.whl" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "vllm-router", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, ] +dynamo = [ + { name = "ai-dynamo", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "blake3", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] envs = [ { name = "aime2024", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "aime2025", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, @@ -2848,11 +2899,13 @@ mamba-ssm = [ [package.metadata] requires-dist = [ + { name = "ai-dynamo", marker = "extra == 'dynamo'", specifier = "==1.1.1" }, { name = "aime2024", marker = "extra == 'envs'", index = "https://hub.primeintellect.ai/primeintellect/simple/" }, { name = "aime2025", marker = "extra == 'envs'", index = "https://hub.primeintellect.ai/primeintellect/simple/" }, { name = "aiolimiter", specifier = ">=1.2.1" }, { name = "alphabet-sort", marker = "extra == 'envs'", index = "https://hub.primeintellect.ai/primeintellect/simple/" }, { name = "beartype", specifier = ">=0.21.0" }, + { name = "blake3", marker = "extra == 'dynamo'", specifier = ">=1.0.0,<2.0.0" }, { name = "code-env", marker = "extra == 'envs'", index = "https://hub.primeintellect.ai/primeintellect/simple/" }, { name = "color-codeword", marker = "extra == 'envs'", index = "https://hub.primeintellect.ai/primeintellect/simple/" }, { name = "datasets", specifier = ">=4.0.0" }, @@ -2910,7 +2963,7 @@ requires-dist = [ { name = "wandb", specifier = ">=0.26.1" }, { name = "wiki-search", marker = "extra == 'envs'", index = "https://hub.primeintellect.ai/primeintellect/simple/" }, ] -provides-extras = ["flash-attn", "flash-attn-3", "flash-attn-cute", "envs", "disagg", "gpt-oss", "quack", "all"] +provides-extras = ["flash-attn", "flash-attn-3", "flash-attn-cute", "envs", "disagg", "gpt-oss", "quack", "dynamo", "all"] [package.metadata.requires-dev] dev = [ @@ -2974,11 +3027,11 @@ wheels = [ [[package]] name = "prometheus-client" -version = "0.22.1" +version = "0.25.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/5e/cf/40dde0a2be27cc1eb41e333d1a674a74ce8b8b0457269cc640fd42b07cf7/prometheus_client-0.22.1.tar.gz", hash = "sha256:190f1331e783cf21eb60bca559354e0a4d4378facecf78f5428c39b675d20d28", size = 69746, upload-time = "2025-06-02T14:29:01.152Z" } +sdist = { url = "https://files.pythonhosted.org/packages/1b/fb/d9aa83ffe43ce1f19e557c0971d04b90561b0cfd50762aafb01968285553/prometheus_client-0.25.0.tar.gz", hash = "sha256:5e373b75c31afb3c86f1a52fa1ad470c9aace18082d39ec0d2f918d11cc9ba28", size = 86035, upload-time = "2026-04-09T19:53:42.359Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/32/ae/ec06af4fe3ee72d16973474f122541746196aaa16cea6f66d18b963c6177/prometheus_client-0.22.1-py3-none-any.whl", hash = "sha256:cca895342e308174341b2cbf99a56bef291fbc0ef7b9e5412a0f26d653ba7094", size = 58694, upload-time = "2025-06-02T14:29:00.068Z" }, + { url = "https://files.pythonhosted.org/packages/8d/9b/d4b1e644385499c8346fa9b622a3f030dce14cd6ef8a1871c221a17a67e7/prometheus_client-0.25.0-py3-none-any.whl", hash = "sha256:d5aec89e349a6ec230805d0df882f3807f74fd6c1a2fa86864e3c2279059fed1", size = 64154, upload-time = "2026-04-09T19:53:41.324Z" }, ] [[package]] @@ -3079,6 +3132,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/05/d9/4d09d919f35d599bc05c6950095e358c3e15148ead26292dfca1fb659b0c/pyarrow-21.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:072116f65604b822a7f22945a7a6e581cfa28e3454fdcc6939d4ff6090126623", size = 45133802, upload-time = "2025-07-18T00:55:57.714Z" }, ] +[[package]] +name = "pyasn1" +version = "0.6.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5c/5f/6583902b6f79b399c9c40674ac384fd9cd77805f9e6205075f828ef11fb2/pyasn1-0.6.3.tar.gz", hash = "sha256:697a8ecd6d98891189184ca1fa05d1bb00e2f84b5977c481452050549c8a72cf", size = 148685, upload-time = "2026-03-17T01:06:53.382Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5d/a0/7d793dce3fa811fe047d6ae2431c672364b462850c6235ae306c0efd025f/pyasn1-0.6.3-py3-none-any.whl", hash = "sha256:a80184d120f0864a52a073acc6fc642847d0be408e7c7252f31390c0f4eadcde", size = 83997, upload-time = "2026-03-17T01:06:52.036Z" }, +] + +[[package]] +name = "pyasn1-modules" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyasn1", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e9/e6/78ebbb10a8c8e4b61a59249394a4a594c1a7af95593dc933a349c8d00964/pyasn1_modules-0.4.2.tar.gz", hash = "sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6", size = 307892, upload-time = "2025-03-28T02:41:22.17Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259, upload-time = "2025-03-28T02:41:19.028Z" }, +] + [[package]] name = "pybase64" version = "1.4.2"