PrimeIntellect-ai · mikasenghaas · May 22, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 Documenting **breaking** configuration changes — renamed, removed, or moved fields that require users to update existing configs.
 
+- **`rollouts_per_example` → `group_size`**: The orchestrator-level field, the group-level `[orchestrator.eval]` field, and the per-env `[[orchestrator.eval.env]]` field have all been renamed (e.g. `rollouts_per_example = 8` becomes `group_size = 8`). The old name is still accepted as a validation alias (in both TOML and CLI), so existing configs keep working without changes; new configs should prefer `group_size`. (2026-05-22)
 - **`AdvantageInputs` / `AdvantageOutputs` are now per-group, and `AdvantageOutputs.advantages` is a plain `list[float]`** (second breaking change to this API in three weeks). `AdvantageInputs.rollouts` is now `list[vf.RolloutOutput]` (a single group) instead of `list[list[vf.RolloutOutput]]`, and `AdvantageOutputs.advantages` is now `list[float]` instead of a 2D `Float[Tensor, "num_examples rollouts_per_example"]`. `compute_advantages` calls `advantage_fn` once per group, which lets partial-group training (groups smaller than `rollouts_per_example` after rollout errors) round-trip without the previous bucket-by-size workaround. Custom advantage functions must drop the outer list dimension and return a list of floats — e.g. `AdvantageOutputs(advantages=(rewards - rewards.mean(dim=1, keepdim=True)).tolist())` becomes `AdvantageOutputs(advantages=[r - mean for r in rewards])` (or `.tolist()` if you keep torch internally). (2026-05-22)
 - **`[model.vlm]` requires `orchestrator.use_renderer = true`**: VLMs must go through the renderer path; the `vlm_requires_renderer` validator rejects `use_renderer = false` when `[model.vlm]` is set. The renderer owns the HF processor per-slot and ships generic `mm_kwargs` keyed by the model's forward signature. Since `use_renderer` already defaults to `true`, most VLM configs need no change. (2026-05-19)
 - **First-class `training_mode` + batch-driven loss dispatch** (collection of removals/renames). Loss selection is now driven by `TrainingSample.training_mode` (`rl` / `opd` / `sft`), set under `[orchestrator]`. The trainer is mode-agnostic and dispatches per batch.

diff --git a/configs/acereason_math/stage1.toml b/configs/acereason_math/stage1.toml
@@ -17,7 +17,7 @@ name = "stage1"
 
 [orchestrator]
 batch_size = 1024
-rollouts_per_example = 8
+group_size = 8
 
 [orchestrator.train.sampling]
 temperature = 0.6
@@ -32,15 +32,15 @@ interval = 50
 
 [[orchestrator.eval.env]]
 id = "math500"
-rollouts_per_example = 1
+group_size = 1
 
 [[orchestrator.eval.env]]
 id = "aime2024"
-rollouts_per_example = 32
+group_size = 32
 
 [[orchestrator.eval.env]]
 id = "aime2025"
-rollouts_per_example = 32
+group_size = 32
 
 [trainer.model.ac]
 

diff --git a/configs/acereason_math/stage2.toml b/configs/acereason_math/stage2.toml
@@ -18,7 +18,7 @@ name = "stage2"
 
 [orchestrator]
 batch_size = 2048
-rollouts_per_example = 16
+group_size = 16
 
 [orchestrator.train.sampling]
 temperature = 0.6
@@ -33,15 +33,15 @@ interval = 50
 
 [[orchestrator.eval.env]]
 id = "math500"
-rollouts_per_example = 1
+group_size = 1
 
 [[orchestrator.eval.env]]
 id = "aime2024"
-rollouts_per_example = 32
+group_size = 32
 
 [[orchestrator.eval.env]]
 id = "aime2025"
-rollouts_per_example = 32
+group_size = 32
 
 [trainer.model.ac]
 

diff --git a/configs/alphabet_sort/rl.toml b/configs/alphabet_sort/rl.toml
@@ -11,7 +11,7 @@ name = "Qwen/Qwen3-4B-Instruct-2507"
 
 [orchestrator]
 batch_size = 512
-rollouts_per_example = 16
+group_size = 16
 
 [orchestrator.train.sampling]
 max_completion_tokens = 512
@@ -28,7 +28,7 @@ interval = 50
 id = "alphabet-sort"
 name = "alphabet-sort"
 num_examples = 50
-rollouts_per_example = 4
+group_size = 4
 args = { min_turns = 2, max_turns = 2 }
 
 [trainer]

diff --git a/configs/ci/integration/alphabet_sort.toml b/configs/ci/integration/alphabet_sort.toml
@@ -11,7 +11,7 @@ lr = 1e-5
 
 [orchestrator]
 batch_size = 128
-rollouts_per_example = 8
+group_size = 8
 
 [orchestrator.train.sampling]
 max_completion_tokens = 384

diff --git a/configs/ci/integration/reverse_text/resume.toml b/configs/ci/integration/reverse_text/resume.toml
@@ -12,7 +12,7 @@ lr = 3e-6
 
 [orchestrator]
 batch_size = 128
-rollouts_per_example = 16
+group_size = 16
 
 [orchestrator.train.sampling]
 max_completion_tokens = 128

diff --git a/configs/ci/integration/reverse_text/start.toml b/configs/ci/integration/reverse_text/start.toml
@@ -11,7 +11,7 @@ lr = 3e-6
 
 [orchestrator]
 batch_size = 128
-rollouts_per_example = 16
+group_size = 16
 
 [orchestrator.train.sampling]
 max_completion_tokens = 128

diff --git a/configs/ci/integration/reverse_text_lora/resume.toml b/configs/ci/integration/reverse_text_lora/resume.toml
@@ -18,7 +18,7 @@ save_adapter_separately = true
 
 [orchestrator]
 batch_size = 128
-rollouts_per_example = 16
+group_size = 16
 
 [orchestrator.model.lora]
 name = "r8-1e-4"

diff --git a/configs/ci/integration/reverse_text_lora/start.toml b/configs/ci/integration/reverse_text_lora/start.toml
@@ -17,7 +17,7 @@ save_adapter_separately = true
 
 [orchestrator]
 batch_size = 128
-rollouts_per_example = 16
+group_size = 16
 
 [orchestrator.model.lora]
 name = "r8-1e-4"

diff --git a/configs/ci/integration/reverse_text_moe/start.toml b/configs/ci/integration/reverse_text_moe/start.toml
@@ -14,7 +14,7 @@ impl = "custom"
 
 [orchestrator]
 batch_size = 128
-rollouts_per_example = 16
+group_size = 16
 
 [orchestrator.train.sampling]
 max_completion_tokens = 128

diff --git a/configs/ci/integration/reverse_text_multi_run/orchestrator.toml b/configs/ci/integration/reverse_text_multi_run/orchestrator.toml
@@ -1,7 +1,7 @@
 # Orchestrator config for multi-run RL integration test
 # model.lora.name and output_dir are set via CLI
 batch_size = 128
-rollouts_per_example = 16
+group_size = 16
 seq_len = 2048
 max_steps = 20
 

diff --git a/configs/ci/integration/reverse_text_rl_opd/start.toml b/configs/ci/integration/reverse_text_rl_opd/start.toml
@@ -17,7 +17,7 @@ name = "ci-rl-opd"
 [orchestrator]
 training_mode = "opd"
 batch_size = 128
-rollouts_per_example = 16
+group_size = 16
 
 [orchestrator.renderer]
 name = "qwen3"

diff --git a/configs/ci/integration/reverse_text_rl_sft/start.toml b/configs/ci/integration/reverse_text_rl_sft/start.toml
@@ -17,7 +17,7 @@ name = "ci-rl-sft"
 [orchestrator]
 training_mode = "sft"
 batch_size = 128
-rollouts_per_example = 16
+group_size = 16
 
 [orchestrator.train.sampling]
 max_completion_tokens = 128

diff --git a/configs/ci/nightly/acereason_math.toml b/configs/ci/nightly/acereason_math.toml
@@ -15,7 +15,7 @@ name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
 
 [orchestrator]
 batch_size = 1024
-rollouts_per_example = 8
+group_size = 8
 
 [orchestrator.train.sampling]
 temperature = 0.6

diff --git a/configs/ci/nightly/multimodal_color_codeword.toml b/configs/ci/nightly/multimodal_color_codeword.toml
@@ -15,7 +15,7 @@ language_model_attr = "model.language_model"
 
 [orchestrator]
 batch_size = 256
-rollouts_per_example = 16
+group_size = 16
 
 [orchestrator.train.sampling]
 max_completion_tokens = 64

diff --git a/configs/debug/training_modes/opd.toml b/configs/debug/training_modes/opd.toml
@@ -18,7 +18,7 @@ name = "debug-opd"
 [orchestrator]
 training_mode = "opd"
 batch_size = 128
-rollouts_per_example = 16
+group_size = 16
 
 [orchestrator.renderer]
 name = "qwen3"

diff --git a/configs/debug/training_modes/opd_lora.toml b/configs/debug/training_modes/opd_lora.toml
@@ -18,7 +18,7 @@ name = "debug-opd-lora"
 [orchestrator]
 training_mode = "opd"
 batch_size = 128
-rollouts_per_example = 16
+group_size = 16
 
 [orchestrator.renderer]
 name = "qwen3"

diff --git a/configs/debug/training_modes/rl.toml b/configs/debug/training_modes/rl.toml
@@ -11,7 +11,7 @@ name = "debug-rl"
 [orchestrator]
 training_mode = "rl"
 batch_size = 128
-rollouts_per_example = 16
+group_size = 16
 
 [orchestrator.renderer]
 name = "qwen3"

diff --git a/configs/debug/training_modes/sft.toml b/configs/debug/training_modes/sft.toml
@@ -18,7 +18,7 @@ name = "debug-sft"
 [orchestrator]
 training_mode = "sft"
 batch_size = 128
-rollouts_per_example = 4
+group_size = 4
 
 [orchestrator.train.sampling]
 max_completion_tokens = 128

diff --git a/configs/debug/training_modes/sft_external.toml b/configs/debug/training_modes/sft_external.toml
@@ -17,7 +17,7 @@ name = "debug-sft-external"
 [orchestrator]
 training_mode = "sft"
 batch_size = 128
-rollouts_per_example = 4
+group_size = 4
 
 [orchestrator.train.sampling]
 max_completion_tokens = 2048

diff --git a/configs/debug/training_modes/sft_lora.toml b/configs/debug/training_modes/sft_lora.toml
@@ -18,7 +18,7 @@ name = "debug-sft-lora"
 [orchestrator]
 training_mode = "sft"
 batch_size = 128
-rollouts_per_example = 4
+group_size = 4
 
 [orchestrator.train.sampling]
 max_completion_tokens = 128

diff --git a/configs/deepscaler/stage1.toml b/configs/deepscaler/stage1.toml
@@ -17,7 +17,7 @@ interval = 100
 
 [orchestrator]
 batch_size = 1024
-rollouts_per_example = 8
+group_size = 8
 
 [orchestrator.train.sampling]
 temperature = 0.6

diff --git a/configs/deepscaler/stage2.toml b/configs/deepscaler/stage2.toml
@@ -18,7 +18,7 @@ resume_step = 500
 
 [orchestrator]
 batch_size = 1024
-rollouts_per_example = 8
+group_size = 8
 
 [orchestrator.train.sampling]
 temperature = 0.6

diff --git a/configs/deepscaler/stage3.toml b/configs/deepscaler/stage3.toml
@@ -18,7 +18,7 @@ resume_step = 1000
 
 [orchestrator]
 batch_size = 1024
-rollouts_per_example = 8
+group_size = 8
 
 [orchestrator.train.sampling]
 temperature = 0.6

diff --git a/configs/elastic/rl.toml b/configs/elastic/rl.toml
@@ -30,7 +30,7 @@ lr = 1e-5
 
 [orchestrator]
 batch_size = 512
-rollouts_per_example = 8
+group_size = 8
 
 [orchestrator.train.sampling]
 max_completion_tokens = 768

diff --git a/configs/env_mix/env_mix.toml b/configs/env_mix/env_mix.toml
@@ -14,7 +14,7 @@ name = "Qwen/Qwen3-4B-Instruct-2507"
 
 [orchestrator]
 batch_size = 512
-rollouts_per_example = 16
+group_size = 16
 
 [[orchestrator.train.env]]
 id = "math-env"

diff --git a/configs/gsm8k/rl.toml b/configs/gsm8k/rl.toml
@@ -10,7 +10,7 @@ name = "Qwen/Qwen3-0.6B"
 
 [orchestrator]
 batch_size = 512
-rollouts_per_example = 16
+group_size = 16
 
 [orchestrator.train.sampling]
 max_completion_tokens = 2048

diff --git a/configs/hendrycks_math/rl.toml b/configs/hendrycks_math/rl.toml
@@ -10,7 +10,7 @@ name = "Qwen/Qwen3-4B-Instruct-2507"
 
 [orchestrator]
 batch_size = 512
-rollouts_per_example = 16
+group_size = 16
 
 [orchestrator.train.sampling]
 max_completion_tokens = 2048
@@ -33,7 +33,7 @@ max_completion_tokens = 2048
 [[orchestrator.eval.env]]
 id = "math500"
 num_examples = 30
-rollouts_per_example = 4
+group_size = 4
 
 [trainer] # Default trainer config
 

diff --git a/configs/hendrycks_math/sanity.toml b/configs/hendrycks_math/sanity.toml
@@ -10,7 +10,7 @@ name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 
 [orchestrator]
 batch_size = 512
-rollouts_per_example = 8
+group_size = 8
 
 [[orchestrator.train.env]]
 id = "math-env"
@@ -23,7 +23,7 @@ interval = 50
 [[orchestrator.eval.env]]
 id = "aime2024"
 name = "aime2024"
-rollouts_per_example = 16
+group_size = 16
 
 [trainer.model.ac]
 

diff --git a/configs/math_group/rl.toml b/configs/math_group/rl.toml
@@ -9,7 +9,7 @@ name = "Qwen/Qwen3-4B-Instruct-2507"
 
 [orchestrator]
 batch_size = 256
-rollouts_per_example = 8
+group_size = 8
 
 [[orchestrator.train.env]]
 id = "math-env"
@@ -34,7 +34,7 @@ interval = 50
 id = "aime2024"
 name = "aime2024"
 num_examples = 30
-rollouts_per_example = 4
+group_size = 4
 
 [trainer.model]
 seq_len = 4096

diff --git a/configs/math_python/math_python.toml b/configs/math_python/math_python.toml
@@ -10,7 +10,7 @@ name = "Qwen/Qwen3-4B-Instruct-2507"
 
 [orchestrator]
 batch_size = 512
-rollouts_per_example = 16
+group_size = 16
 
 [orchestrator.train.sampling]
 max_completion_tokens = 512

diff --git a/configs/multi_reverse_text/rl.toml b/configs/multi_reverse_text/rl.toml
@@ -6,7 +6,7 @@ name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-SFT"
 
 [orchestrator]
 batch_size = 128
-rollouts_per_example = 16
+group_size = 16
 
 # -- train envs --
 
@@ -36,13 +36,13 @@ max_completion_tokens = 512
 id = "reverse-text"
 name = "eval-default"
 num_examples = 32
-rollouts_per_example = 4
+group_size = 4
 
 [[orchestrator.eval.env]]
 id = "reverse-text"
 name = "eval-custom"
 num_examples = 16
-rollouts_per_example = 2
+group_size = 2
 interval = 10
 
 [orchestrator.eval.env.sampling]

diff --git a/configs/multimodal/rl_color_codeword.toml b/configs/multimodal/rl_color_codeword.toml
@@ -10,7 +10,7 @@ language_model_attr = "model.language_model"
 
 [orchestrator]
 batch_size = 256
-rollouts_per_example = 16
+group_size = 16
 
 [orchestrator.train.sampling]
 max_completion_tokens = 64