pytorch · felipemello1 · May 18, 2026 · May 18, 2026 · May 19, 2026 · May 19, 2026
diff --git a/torchtitan/experiments/rl/README.md b/torchtitan/experiments/rl/README.md
@@ -27,11 +27,12 @@ uv venv --python 3.12 titan-rl
 source titan-rl/bin/activate
 ```
 
-1. Install Monarch and TorchStore from main:
+1. Install Monarch (pinned release), plus TorchStore and Renderers from main:
 ```bash
 uv pip install torchmonarch==0.4.1
 uv pip install --no-deps "git+https://github.com/meta-pytorch/torchstore.git@main"
 uv pip install pygtrie portpicker
+uv pip install "git+https://github.com/PrimeIntellect-ai/renderers.git@main"
 ```
 
 2. Install Flash Attention 3 kernels:

diff --git a/torchtitan/experiments/rl/actors/generator.py b/torchtitan/experiments/rl/actors/generator.py
@@ -163,6 +163,9 @@ class SamplingConfig:
     max_tokens: int = 100
     """Maximum number of tokens to generate per completion."""
 
+    stop_token_ids: list[int] = field(default_factory=list)
+    """Role-boundary stop tokens from the renderer (e.g. Qwen3 `<|im_end|>`)."""
+
 
 class VLLMGenerator(Actor, Configurable):
     """
@@ -411,6 +414,7 @@ async def generate(
                 top_p=_sampling_config.top_p,
                 max_tokens=_sampling_config.max_tokens,
                 n=_sampling_config.n,
+                stop_token_ids=_sampling_config.stop_token_ids or None,
                 seed=self.config.debug.seed,
                 logprobs=1,
                 output_kind=RequestOutputKind.FINAL_ONLY,
@@ -436,14 +440,14 @@ async def generate(
                     all_outputs.extend(self._engine.step())
 
             # vLLM may return requests out of order; sort by the integer
-            # request_id we assigned so prompt_idx lines up with the input.
+            # request_id we assigned so request_idx lines up with the input.
             all_outputs.sort(key=lambda o: int(o.request_id))
 
             completions: list[Completion] = []
             generation_metrics: list[m.Metric] = []
             output_token_counts: list[int] = []
             for output in all_outputs:
-                prompt_idx = int(output.request_id)
+                request_idx = int(output.request_id)
                 generation_metrics.extend(
                     _prepare_generation_request_metrics(output, prefix=metrics_prefix)
                 )
@@ -456,8 +460,7 @@ async def generate(
                     completions.append(
                         Completion(
                             policy_version=self.policy_version,
-                            prompt_idx=prompt_idx,
-                            text=sample.text,
+                            request_idx=request_idx,
                             token_ids=sample.token_ids,
                             token_logprobs=per_token_logprobs,
                             finish_reason=sample.finish_reason,

diff --git a/torchtitan/experiments/rl/actors/trainer.py b/torchtitan/experiments/rl/actors/trainer.py
@@ -455,7 +455,7 @@ async def forward_backward(
     async def optim_step(self) -> OptimStepOutput:
         """Clip gradients, step optimizer + LR scheduler, return updated state."""
         # TODO: Accept optional optimizer params (e.g. learning rate)
-        # to allow controller-owned schedules (see Tinker API).
+        # to allow controller-owned schedules.
 
         # capture LR before step
         current_lrs = self.lr_schedulers.schedulers[0].get_last_lr()

diff --git a/torchtitan/experiments/rl/config_registry.py b/torchtitan/experiments/rl/config_registry.py
@@ -25,7 +25,8 @@
 from torchtitan.experiments.rl.batcher import BatchConfig, Batcher
 from torchtitan.experiments.rl.grpo import GRPOLoss, RLTrainer
 from torchtitan.experiments.rl.observability.metrics import MetricsProcessor
-from torchtitan.experiments.rl.sum_digits import SumDigitsEnv
+from torchtitan.experiments.rl.renderer import RendererConfig
+from torchtitan.experiments.rl.tasks.sum_digits import SumDigitsDataset, SumDigitsTask
 from torchtitan.models.qwen3 import model_registry
 
 
@@ -39,10 +40,14 @@ def rl_grpo_qwen3_0_6b() -> RLTrainer.Config:
         num_prompts_per_step=5,
         num_validation_samples=20,
         compile=CompileConfig(enable=True, backend="aot_eager"),
-        env=SumDigitsEnv.Config(seed=42, correctness_reward=1.0, format_reward=0.3),
-        validation_env=SumDigitsEnv.Config(
-            seed=99, correctness_reward=1.0, format_reward=0.3
-        ),
+        tasks={
+            "sum_digits": SumDigitsTask.Config(
+                train_dataset=SumDigitsDataset.Config(seed=42),
+                val_dataset=SumDigitsDataset.Config(seed=99),
+            )
+        },
+        group_size=group_size,
+        renderer=RendererConfig(name="qwen3", enable_thinking=True),
         metrics=MetricsProcessor.Config(enable_wandb=True),
         batcher=Batcher.Config(
             batch=BatchConfig(local_batch_size=2, global_batch_size=8, seq_len=2048),
@@ -77,10 +82,9 @@ def rl_grpo_qwen3_0_6b() -> RLTrainer.Config:
             ),
             checkpoint=CheckpointManager.Config(enable=False),
             sampling=SamplingConfig(
-                n=group_size,
                 temperature=0.8,
                 top_p=0.95,
-                max_tokens=100,
+                max_tokens=700,
             ),
         ),
     )
@@ -96,10 +100,14 @@ def rl_grpo_qwen3_1_7b() -> RLTrainer.Config:
         num_prompts_per_step=5,
         num_validation_samples=20,
         compile=CompileConfig(enable=True, backend="aot_eager"),
-        env=SumDigitsEnv.Config(seed=42, correctness_reward=1.0, format_reward=0.3),
-        validation_env=SumDigitsEnv.Config(
-            seed=99, correctness_reward=1.0, format_reward=0.3
-        ),
+        tasks={
+            "sum_digits": SumDigitsTask.Config(
+                train_dataset=SumDigitsDataset.Config(seed=42),
+                val_dataset=SumDigitsDataset.Config(seed=99),
+            )
+        },
+        group_size=group_size,
+        renderer=RendererConfig(name="qwen3", enable_thinking=True),
         metrics=MetricsProcessor.Config(enable_wandb=True),
         batcher=Batcher.Config(
             batch=BatchConfig(local_batch_size=2, global_batch_size=8, seq_len=2048),
@@ -135,10 +143,9 @@ def rl_grpo_qwen3_1_7b() -> RLTrainer.Config:
             ),
             checkpoint=CheckpointManager.Config(enable=False),
             sampling=SamplingConfig(
-                n=group_size,
                 temperature=0.8,
                 top_p=0.95,
-                max_tokens=100,
+                max_tokens=700,
             ),
         ),
     )
@@ -154,10 +161,14 @@ def rl_grpo_qwen3_14b() -> RLTrainer.Config:
         num_prompts_per_step=5,
         num_validation_samples=20,
         compile=CompileConfig(enable=True, backend="aot_eager"),
-        env=SumDigitsEnv.Config(seed=42, correctness_reward=1.0, format_reward=0.3),
-        validation_env=SumDigitsEnv.Config(
-            seed=99, correctness_reward=1.0, format_reward=0.3
-        ),
+        tasks={
+            "sum_digits": SumDigitsTask.Config(
+                train_dataset=SumDigitsDataset.Config(seed=42),
+                val_dataset=SumDigitsDataset.Config(seed=99),
+            )
+        },
+        group_size=group_size,
+        renderer=RendererConfig(name="qwen3", enable_thinking=True),
         metrics=MetricsProcessor.Config(enable_wandb=True),
         batcher=Batcher.Config(
             batch=BatchConfig(local_batch_size=2, global_batch_size=8, seq_len=2048),
@@ -192,17 +203,16 @@ def rl_grpo_qwen3_14b() -> RLTrainer.Config:
             ),
             checkpoint=CheckpointManager.Config(enable=False),
             sampling=SamplingConfig(
-                n=group_size,
                 temperature=0.8,
                 top_p=0.95,
-                max_tokens=100,
+                max_tokens=700,
             ),
         ),
     )
 
 
 def rl_grpo_qwen3_0_6b_batch_invariant() -> RLTrainer.Config:
-    """On-policy GRPO config for Qwen3-0.6B under same parallelism (4 GPUs: 2 gen + 2 train).
+    """On-policy GRPO config for Qwen3-0.6B (4 GPUs: 2 gen + 2 train).
 
     Enables deterministic + batch-invariant mode for true on-policy RL training.
     """
@@ -215,10 +225,14 @@ def rl_grpo_qwen3_0_6b_batch_invariant() -> RLTrainer.Config:
         num_prompts_per_step=5,
         num_validation_samples=20,
         compile=CompileConfig(enable=True, backend="aot_eager"),
-        env=SumDigitsEnv.Config(seed=42, correctness_reward=1.0, format_reward=0.3),
-        validation_env=SumDigitsEnv.Config(
-            seed=99, correctness_reward=1.0, format_reward=0.3
-        ),
+        tasks={
+            "sum_digits": SumDigitsTask.Config(
+                train_dataset=SumDigitsDataset.Config(seed=42),
+                val_dataset=SumDigitsDataset.Config(seed=99),
+            )
+        },
+        group_size=group_size,
+        renderer=RendererConfig(name="qwen3", enable_thinking=True),
         metrics=MetricsProcessor.Config(enable_wandb=True),
         batcher=Batcher.Config(
             batch=BatchConfig(local_batch_size=2, global_batch_size=8, seq_len=2048),
@@ -257,10 +271,9 @@ def rl_grpo_qwen3_0_6b_batch_invariant() -> RLTrainer.Config:
             ),
             checkpoint=CheckpointManager.Config(enable=False),
             sampling=SamplingConfig(
-                n=group_size,
                 temperature=0.8,
                 top_p=0.95,
-                max_tokens=100,
+                max_tokens=700,
             ),
             debug=batch_invariant_config,
         ),

diff --git a/torchtitan/experiments/rl/env_types/__init__.py b/torchtitan/experiments/rl/env_types/__init__.py
@@ -0,0 +1,25 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from torchtitan.experiments.rl.env_types.message_env import (
+    MessageEnv,
+    MessageResetOutput,
+    MessageStepOutput,
+)
+from torchtitan.experiments.rl.env_types.renderer_env import (
+    RendererWrapperEnv,
+    TokenizedStepOutput,
+    TurnMessages,
+)
+
+__all__ = [
+    "MessageEnv",
+    "MessageResetOutput",
+    "MessageStepOutput",
+    "RendererWrapperEnv",
+    "TokenizedStepOutput",
+    "TurnMessages",
+]
diff --git a/torchtitan/experiments/rl/env_types/message_env.py b/torchtitan/experiments/rl/env_types/message_env.py
@@ -0,0 +1,96 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from __future__ import annotations
+
+import abc
+from dataclasses import dataclass, field
+
+from renderers import Message, ToolSpec
+
+
+@dataclass(kw_only=True, slots=True)
+class MessageResetOutput:
+    """Initial prompt messages + tool specs from `MessageEnv.reset`."""
+
+    prompt_messages: list[Message]  # [M_prompt]
+    """The messages that form the initial prompt (e.g. [system, user])."""
+
+    tools: list[ToolSpec] = field(default_factory=list)  # [K_tools]
+    """Tool schemas exposed to the assistant. Empty for tool-less envs."""
+
+
+@dataclass(kw_only=True, slots=True)
+class MessageStepOutput:
+    """The env's reply to the assistant's turn."""
+
+    env_messages: list[Message] = field(default_factory=list)  # [M_env]
+    """The env's reply messages (tool / user). Empty when the rollout terminates
+    with no follow-up."""
+
+    done: bool = False
+    """`True` ends the rollout."""
+
+    env_rewards: dict[str, float] = field(default_factory=dict)
+    """Optional reward signal the env provides for this step; the rubric decides
+    whether and how to use it. Empty if the env scores nothing."""
+
+    def __post_init__(self) -> None:
+        # env replies are tool/user turns; the assistant turn comes from the generator
+        if any(m.get("role") == "assistant" for m in self.env_messages):
+            raise ValueError(
+                "MessageStepOutput.env_messages may not contain assistant messages"
+            )
+
+
+class MessageEnv(abc.ABC):
+    """User-written env in message space. Implement `reset` + `step`.
+
+    Tip: `MessageEnv` works in messages and never sees token ids; you can have `RendererWrapperEnv`
+    wrap it and use a `Renderer` to convert messages <-> token ids for the generator.
+
+    Example:
+        # A one-tool calculator env. It is multi-turn — the env answers the
+        # assistant's tool call, then ends once the assistant replies without one.
+
+        class CalculatorEnv(MessageEnv):
+            async def reset(self) -> MessageResetOutput:
+                return MessageResetOutput(
+                    prompt_messages=[{"role": "user", "content": "What is 12 * 7?"}],
+                    tools=[CALCULATOR_TOOL],
+                )
+
+            async def step(self, assistant_message: Message) -> MessageStepOutput:
+                tool_calls = assistant_message.get("tool_calls")
+                if not tool_calls:
+                    return MessageStepOutput(done=True)  # assistant gave its final answer
+                result = run_calculator(tool_calls[0])
+                return MessageStepOutput(
+                    env_messages=[{"role": "tool", "content": result}]
+                )
+    """
+
+    @abc.abstractmethod
+    async def reset(self) -> MessageResetOutput:
+        """Return the initial conversation + tools for prompt rendering."""
+
+    @abc.abstractmethod
+    async def step(self, assistant_message: Message) -> MessageStepOutput:
+        """Advance the env one turn given the assistant's latest message.
+
+        `RendererWrapperEnv` parses the completion and handles
+        finish_reason / length / parse / timeout failures before calling this,
+        so the env only sees a well-formed assistant message.
+
+        Args:
+            assistant_message: the assistant's parsed turn.
+
+        Returns:
+            `MessageStepOutput` with the env's reply messages.
+        """
+
+    async def close(self) -> None:
+        """Release env-owned resources. Default no-op; idempotent."""