Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

Documenting **breaking** configuration changes — renamed, removed, or moved fields that require users to update existing configs.

- **`rollouts_per_example` → `group_size`**: The orchestrator-level field, the group-level `[orchestrator.eval]` field, and the per-env `[[orchestrator.eval.env]]` field have all been renamed. The old name still parses as a validation alias (in both TOML and CLI), so existing configs keep working without changes; new configs should prefer `group_size`. (2026-05-22)
- **`AdvantageInputs` / `AdvantageOutputs` are now per-group, and `AdvantageOutputs.advantages` is a plain `list[float]`** (second breaking change to this API in three weeks). `AdvantageInputs.rollouts` is now `list[vf.RolloutOutput]` (a single group) instead of `list[list[vf.RolloutOutput]]`, and `AdvantageOutputs.advantages` is now `list[float]` instead of a 2D `Float[Tensor, "num_examples rollouts_per_example"]`. `compute_advantages` calls `advantage_fn` once per group, which lets partial-group training (groups smaller than `group_size`, formerly `rollouts_per_example`, after rollout errors) round-trip without the previous bucket-by-size workaround. Custom advantage functions must drop the outer list dimension and return a list of floats — e.g. `AdvantageOutputs(advantages=(rewards - rewards.mean(dim=1, keepdim=True)).tolist())` becomes `AdvantageOutputs(advantages=[r - mean for r in rewards])` (or `.tolist()` if you keep torch internally). (2026-05-22)
- **`[model.vlm]` requires `orchestrator.use_renderer = true`**: VLMs must go through the renderer path; the `vlm_requires_renderer` validator rejects `use_renderer = false` when `[model.vlm]` is set. The renderer owns the HF processor per-slot and ships generic `mm_kwargs` keyed by the model's forward signature. Since `use_renderer` already defaults to `true`, most VLM configs need no change. (2026-05-19)
- **First-class `training_mode` + batch-driven loss dispatch** (collection of removals/renames). Loss selection is now driven by `TrainingSample.training_mode` (`rl` / `opd` / `sft`), set under `[orchestrator]`. The trainer is mode-agnostic and dispatches per batch.
Expand Down
8 changes: 4 additions & 4 deletions configs/acereason_math/stage1.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ name = "stage1"

[orchestrator]
batch_size = 1024
rollouts_per_example = 8
group_size = 8

[orchestrator.train.sampling]
temperature = 0.6
Expand All @@ -32,15 +32,15 @@ interval = 50

[[orchestrator.eval.env]]
id = "math500"
rollouts_per_example = 1
group_size = 1

[[orchestrator.eval.env]]
id = "aime2024"
rollouts_per_example = 32
group_size = 32

[[orchestrator.eval.env]]
id = "aime2025"
rollouts_per_example = 32
group_size = 32

[trainer.model.ac]

Expand Down
8 changes: 4 additions & 4 deletions configs/acereason_math/stage2.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ name = "stage2"

[orchestrator]
batch_size = 2048
rollouts_per_example = 16
group_size = 16

[orchestrator.train.sampling]
temperature = 0.6
Expand All @@ -33,15 +33,15 @@ interval = 50

[[orchestrator.eval.env]]
id = "math500"
rollouts_per_example = 1
group_size = 1

[[orchestrator.eval.env]]
id = "aime2024"
rollouts_per_example = 32
group_size = 32

[[orchestrator.eval.env]]
id = "aime2025"
rollouts_per_example = 32
group_size = 32

[trainer.model.ac]

Expand Down
4 changes: 2 additions & 2 deletions configs/alphabet_sort/rl.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ name = "Qwen/Qwen3-4B-Instruct-2507"

[orchestrator]
batch_size = 512
rollouts_per_example = 16
group_size = 16

[orchestrator.train.sampling]
max_completion_tokens = 512
Expand All @@ -28,7 +28,7 @@ interval = 50
id = "alphabet-sort"
name = "alphabet-sort"
num_examples = 50
rollouts_per_example = 4
group_size = 4
args = { min_turns = 2, max_turns = 2 }

[trainer]
Expand Down
2 changes: 1 addition & 1 deletion configs/ci/integration/alphabet_sort.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ lr = 1e-5

[orchestrator]
batch_size = 128
rollouts_per_example = 8
group_size = 8

[orchestrator.train.sampling]
max_completion_tokens = 384
Expand Down
2 changes: 1 addition & 1 deletion configs/ci/integration/reverse_text/resume.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ lr = 3e-6

[orchestrator]
batch_size = 128
rollouts_per_example = 16
group_size = 16

[orchestrator.train.sampling]
max_completion_tokens = 128
Expand Down
2 changes: 1 addition & 1 deletion configs/ci/integration/reverse_text/start.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ lr = 3e-6

[orchestrator]
batch_size = 128
rollouts_per_example = 16
group_size = 16

[orchestrator.train.sampling]
max_completion_tokens = 128
Expand Down
2 changes: 1 addition & 1 deletion configs/ci/integration/reverse_text_lora/resume.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ save_adapter_separately = true

[orchestrator]
batch_size = 128
rollouts_per_example = 16
group_size = 16

[orchestrator.model.lora]
name = "r8-1e-4"
Expand Down
2 changes: 1 addition & 1 deletion configs/ci/integration/reverse_text_lora/start.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ save_adapter_separately = true

[orchestrator]
batch_size = 128
rollouts_per_example = 16
group_size = 16

[orchestrator.model.lora]
name = "r8-1e-4"
Expand Down
2 changes: 1 addition & 1 deletion configs/ci/integration/reverse_text_moe/start.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ impl = "custom"

[orchestrator]
batch_size = 128
rollouts_per_example = 16
group_size = 16

[orchestrator.train.sampling]
max_completion_tokens = 128
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Orchestrator config for multi-run RL integration test
# model.lora.name and output_dir are set via CLI
batch_size = 128
rollouts_per_example = 16
group_size = 16
seq_len = 2048
max_steps = 20

Expand Down
2 changes: 1 addition & 1 deletion configs/ci/integration/reverse_text_rl_opd/start.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ name = "ci-rl-opd"
[orchestrator]
training_mode = "opd"
batch_size = 128
rollouts_per_example = 16
group_size = 16

[orchestrator.renderer]
name = "qwen3"
Expand Down
2 changes: 1 addition & 1 deletion configs/ci/integration/reverse_text_rl_sft/start.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ name = "ci-rl-sft"
[orchestrator]
training_mode = "sft"
batch_size = 128
rollouts_per_example = 16
group_size = 16

[orchestrator.train.sampling]
max_completion_tokens = 128
Expand Down
2 changes: 1 addition & 1 deletion configs/ci/nightly/acereason_math.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"

[orchestrator]
batch_size = 1024
rollouts_per_example = 8
group_size = 8

[orchestrator.train.sampling]
temperature = 0.6
Expand Down
2 changes: 1 addition & 1 deletion configs/ci/nightly/multimodal_color_codeword.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ language_model_attr = "model.language_model"

[orchestrator]
batch_size = 256
rollouts_per_example = 16
group_size = 16

[orchestrator.train.sampling]
max_completion_tokens = 64
Expand Down
2 changes: 1 addition & 1 deletion configs/debug/training_modes/opd.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ name = "debug-opd"
[orchestrator]
training_mode = "opd"
batch_size = 128
rollouts_per_example = 16
group_size = 16

[orchestrator.renderer]
name = "qwen3"
Expand Down
2 changes: 1 addition & 1 deletion configs/debug/training_modes/opd_lora.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ name = "debug-opd-lora"
[orchestrator]
training_mode = "opd"
batch_size = 128
rollouts_per_example = 16
group_size = 16

[orchestrator.renderer]
name = "qwen3"
Expand Down
2 changes: 1 addition & 1 deletion configs/debug/training_modes/rl.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ name = "debug-rl"
[orchestrator]
training_mode = "rl"
batch_size = 128
rollouts_per_example = 16
group_size = 16

[orchestrator.renderer]
name = "qwen3"
Expand Down
2 changes: 1 addition & 1 deletion configs/debug/training_modes/sft.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ name = "debug-sft"
[orchestrator]
training_mode = "sft"
batch_size = 128
rollouts_per_example = 4
group_size = 4

[orchestrator.train.sampling]
max_completion_tokens = 128
Expand Down
2 changes: 1 addition & 1 deletion configs/debug/training_modes/sft_external.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ name = "debug-sft-external"
[orchestrator]
training_mode = "sft"
batch_size = 128
rollouts_per_example = 4
group_size = 4

[orchestrator.train.sampling]
max_completion_tokens = 2048
Expand Down
2 changes: 1 addition & 1 deletion configs/debug/training_modes/sft_lora.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ name = "debug-sft-lora"
[orchestrator]
training_mode = "sft"
batch_size = 128
rollouts_per_example = 4
group_size = 4

[orchestrator.train.sampling]
max_completion_tokens = 128
Expand Down
2 changes: 1 addition & 1 deletion configs/deepscaler/stage1.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ interval = 100

[orchestrator]
batch_size = 1024
rollouts_per_example = 8
group_size = 8

[orchestrator.train.sampling]
temperature = 0.6
Expand Down
2 changes: 1 addition & 1 deletion configs/deepscaler/stage2.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ resume_step = 500

[orchestrator]
batch_size = 1024
rollouts_per_example = 8
group_size = 8

[orchestrator.train.sampling]
temperature = 0.6
Expand Down
2 changes: 1 addition & 1 deletion configs/deepscaler/stage3.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ resume_step = 1000

[orchestrator]
batch_size = 1024
rollouts_per_example = 8
group_size = 8

[orchestrator.train.sampling]
temperature = 0.6
Expand Down
2 changes: 1 addition & 1 deletion configs/elastic/rl.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ lr = 1e-5

[orchestrator]
batch_size = 512
rollouts_per_example = 8
group_size = 8

[orchestrator.train.sampling]
max_completion_tokens = 768
Expand Down
2 changes: 1 addition & 1 deletion configs/env_mix/env_mix.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ name = "Qwen/Qwen3-4B-Instruct-2507"

[orchestrator]
batch_size = 512
rollouts_per_example = 16
group_size = 16

[[orchestrator.train.env]]
id = "math-env"
Expand Down
2 changes: 1 addition & 1 deletion configs/gsm8k/rl.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ name = "Qwen/Qwen3-0.6B"

[orchestrator]
batch_size = 512
rollouts_per_example = 16
group_size = 16

[orchestrator.train.sampling]
max_completion_tokens = 2048
Expand Down
4 changes: 2 additions & 2 deletions configs/hendrycks_math/rl.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ name = "Qwen/Qwen3-4B-Instruct-2507"

[orchestrator]
batch_size = 512
rollouts_per_example = 16
group_size = 16

[orchestrator.train.sampling]
max_completion_tokens = 2048
Expand All @@ -33,7 +33,7 @@ max_completion_tokens = 2048
[[orchestrator.eval.env]]
id = "math500"
num_examples = 30
rollouts_per_example = 4
group_size = 4

[trainer] # Default trainer config

Expand Down
4 changes: 2 additions & 2 deletions configs/hendrycks_math/sanity.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

[orchestrator]
batch_size = 512
rollouts_per_example = 8
group_size = 8

[[orchestrator.train.env]]
id = "math-env"
Expand All @@ -23,7 +23,7 @@ interval = 50
[[orchestrator.eval.env]]
id = "aime2024"
name = "aime2024"
rollouts_per_example = 16
group_size = 16

[trainer.model.ac]

Expand Down
4 changes: 2 additions & 2 deletions configs/math_group/rl.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ name = "Qwen/Qwen3-4B-Instruct-2507"

[orchestrator]
batch_size = 256
rollouts_per_example = 8
group_size = 8

[[orchestrator.train.env]]
id = "math-env"
Expand All @@ -34,7 +34,7 @@ interval = 50
id = "aime2024"
name = "aime2024"
num_examples = 30
rollouts_per_example = 4
group_size = 4

[trainer.model]
seq_len = 4096
Expand Down
2 changes: 1 addition & 1 deletion configs/math_python/math_python.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ name = "Qwen/Qwen3-4B-Instruct-2507"

[orchestrator]
batch_size = 512
rollouts_per_example = 16
group_size = 16

[orchestrator.train.sampling]
max_completion_tokens = 512
Expand Down
6 changes: 3 additions & 3 deletions configs/multi_reverse_text/rl.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-SFT"

[orchestrator]
batch_size = 128
rollouts_per_example = 16
group_size = 16

# -- train envs --

Expand Down Expand Up @@ -36,13 +36,13 @@ max_completion_tokens = 512
id = "reverse-text"
name = "eval-default"
num_examples = 32
rollouts_per_example = 4
group_size = 4

[[orchestrator.eval.env]]
id = "reverse-text"
name = "eval-custom"
num_examples = 16
rollouts_per_example = 2
group_size = 2
interval = 10

[orchestrator.eval.env.sampling]
Expand Down
2 changes: 1 addition & 1 deletion configs/multimodal/rl_color_codeword.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ language_model_attr = "model.language_model"

[orchestrator]
batch_size = 256
rollouts_per_example = 16
group_size = 16

[orchestrator.train.sampling]
max_completion_tokens = 64
Expand Down
Loading
Loading