diff --git a/CHANGELOG.md b/CHANGELOG.md index 058bbd0ba7..484a5ea4ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ Documenting **breaking** configuration changes — renamed, removed, or moved fields that require users to update existing configs. +- **`rollouts_per_example` → `group_size`**: The orchestrator-level field, the group-level `[orchestrator.eval]` field, and the per-env `[[orchestrator.eval.env]]` field have all been renamed. The old name still parses as a validation alias (in both TOML and CLI), so existing configs keep working without changes; new configs should prefer `group_size`. (2026-05-22) - **`AdvantageInputs` / `AdvantageOutputs` are now per-group, and `AdvantageOutputs.advantages` is a plain `list[float]`** (second breaking change to this API in three weeks). `AdvantageInputs.rollouts` is now `list[vf.RolloutOutput]` (a single group) instead of `list[list[vf.RolloutOutput]]`, and `AdvantageOutputs.advantages` is now `list[float]` instead of a 2D `Float[Tensor, "num_examples rollouts_per_example"]`. `compute_advantages` calls `advantage_fn` once per group, which lets partial-group training (groups smaller than `rollouts_per_example` after rollout errors) round-trip without the previous bucket-by-size workaround. Custom advantage functions must drop the outer list dimension and return a list of floats — e.g. `AdvantageOutputs(advantages=(rewards - rewards.mean(dim=1, keepdim=True)).tolist())` becomes `AdvantageOutputs(advantages=[r - mean for r in rewards])` (or `.tolist()` if you keep torch internally). (2026-05-22) - **`[model.vlm]` requires `orchestrator.use_renderer = true`**: VLMs must go through the renderer path; the `vlm_requires_renderer` validator rejects `use_renderer = false` when `[model.vlm]` is set. The renderer owns the HF processor per-slot and ships generic `mm_kwargs` keyed by the model's forward signature. Since `use_renderer` already defaults to `true`, most VLM configs need no change. 
(2026-05-19) - **First-class `training_mode` + batch-driven loss dispatch** (collection of removals/renames). Loss selection is now driven by `TrainingSample.training_mode` (`rl` / `opd` / `sft`), set under `[orchestrator]`. The trainer is mode-agnostic and dispatches per batch. diff --git a/configs/acereason_math/stage1.toml b/configs/acereason_math/stage1.toml index 6c102b4a63..7afbda5526 100644 --- a/configs/acereason_math/stage1.toml +++ b/configs/acereason_math/stage1.toml @@ -17,7 +17,7 @@ name = "stage1" [orchestrator] batch_size = 1024 -rollouts_per_example = 8 +group_size = 8 [orchestrator.train.sampling] temperature = 0.6 @@ -32,15 +32,15 @@ interval = 50 [[orchestrator.eval.env]] id = "math500" -rollouts_per_example = 1 +group_size = 1 [[orchestrator.eval.env]] id = "aime2024" -rollouts_per_example = 32 +group_size = 32 [[orchestrator.eval.env]] id = "aime2025" -rollouts_per_example = 32 +group_size = 32 [trainer.model.ac] diff --git a/configs/acereason_math/stage2.toml b/configs/acereason_math/stage2.toml index 7745b9ff05..57eebcd204 100644 --- a/configs/acereason_math/stage2.toml +++ b/configs/acereason_math/stage2.toml @@ -18,7 +18,7 @@ name = "stage2" [orchestrator] batch_size = 2048 -rollouts_per_example = 16 +group_size = 16 [orchestrator.train.sampling] temperature = 0.6 @@ -33,15 +33,15 @@ interval = 50 [[orchestrator.eval.env]] id = "math500" -rollouts_per_example = 1 +group_size = 1 [[orchestrator.eval.env]] id = "aime2024" -rollouts_per_example = 32 +group_size = 32 [[orchestrator.eval.env]] id = "aime2025" -rollouts_per_example = 32 +group_size = 32 [trainer.model.ac] diff --git a/configs/alphabet_sort/rl.toml b/configs/alphabet_sort/rl.toml index 22daacdc5f..6c54274b64 100644 --- a/configs/alphabet_sort/rl.toml +++ b/configs/alphabet_sort/rl.toml @@ -11,7 +11,7 @@ name = "Qwen/Qwen3-4B-Instruct-2507" [orchestrator] batch_size = 512 -rollouts_per_example = 16 +group_size = 16 [orchestrator.train.sampling] max_completion_tokens = 512 @@ -28,7 
+28,7 @@ interval = 50 id = "alphabet-sort" name = "alphabet-sort" num_examples = 50 -rollouts_per_example = 4 +group_size = 4 args = { min_turns = 2, max_turns = 2 } [trainer] diff --git a/configs/ci/integration/alphabet_sort.toml b/configs/ci/integration/alphabet_sort.toml index 33ce009b34..adc7a8215b 100644 --- a/configs/ci/integration/alphabet_sort.toml +++ b/configs/ci/integration/alphabet_sort.toml @@ -11,7 +11,7 @@ lr = 1e-5 [orchestrator] batch_size = 128 -rollouts_per_example = 8 +group_size = 8 [orchestrator.train.sampling] max_completion_tokens = 384 diff --git a/configs/ci/integration/reverse_text/resume.toml b/configs/ci/integration/reverse_text/resume.toml index b9ca6b8515..aa446b4b35 100644 --- a/configs/ci/integration/reverse_text/resume.toml +++ b/configs/ci/integration/reverse_text/resume.toml @@ -12,7 +12,7 @@ lr = 3e-6 [orchestrator] batch_size = 128 -rollouts_per_example = 16 +group_size = 16 [orchestrator.train.sampling] max_completion_tokens = 128 diff --git a/configs/ci/integration/reverse_text/start.toml b/configs/ci/integration/reverse_text/start.toml index e0d8ed659a..190250e048 100644 --- a/configs/ci/integration/reverse_text/start.toml +++ b/configs/ci/integration/reverse_text/start.toml @@ -11,7 +11,7 @@ lr = 3e-6 [orchestrator] batch_size = 128 -rollouts_per_example = 16 +group_size = 16 [orchestrator.train.sampling] max_completion_tokens = 128 diff --git a/configs/ci/integration/reverse_text_lora/resume.toml b/configs/ci/integration/reverse_text_lora/resume.toml index e7a96edcac..7c431ef0d2 100644 --- a/configs/ci/integration/reverse_text_lora/resume.toml +++ b/configs/ci/integration/reverse_text_lora/resume.toml @@ -18,7 +18,7 @@ save_adapter_separately = true [orchestrator] batch_size = 128 -rollouts_per_example = 16 +group_size = 16 [orchestrator.model.lora] name = "r8-1e-4" diff --git a/configs/ci/integration/reverse_text_lora/start.toml b/configs/ci/integration/reverse_text_lora/start.toml index 0e32533259..08740163f7 100644 --- 
a/configs/ci/integration/reverse_text_lora/start.toml +++ b/configs/ci/integration/reverse_text_lora/start.toml @@ -17,7 +17,7 @@ save_adapter_separately = true [orchestrator] batch_size = 128 -rollouts_per_example = 16 +group_size = 16 [orchestrator.model.lora] name = "r8-1e-4" diff --git a/configs/ci/integration/reverse_text_moe/start.toml b/configs/ci/integration/reverse_text_moe/start.toml index 62b1d286fb..209f9fb5c8 100644 --- a/configs/ci/integration/reverse_text_moe/start.toml +++ b/configs/ci/integration/reverse_text_moe/start.toml @@ -14,7 +14,7 @@ impl = "custom" [orchestrator] batch_size = 128 -rollouts_per_example = 16 +group_size = 16 [orchestrator.train.sampling] max_completion_tokens = 128 diff --git a/configs/ci/integration/reverse_text_multi_run/orchestrator.toml b/configs/ci/integration/reverse_text_multi_run/orchestrator.toml index 6be85cc776..2c43dcf8a0 100644 --- a/configs/ci/integration/reverse_text_multi_run/orchestrator.toml +++ b/configs/ci/integration/reverse_text_multi_run/orchestrator.toml @@ -1,7 +1,7 @@ # Orchestrator config for multi-run RL integration test # model.lora.name and output_dir are set via CLI batch_size = 128 -rollouts_per_example = 16 +group_size = 16 seq_len = 2048 max_steps = 20 diff --git a/configs/ci/integration/reverse_text_rl_opd/start.toml b/configs/ci/integration/reverse_text_rl_opd/start.toml index bfc9d6f1d4..edd23df9c7 100644 --- a/configs/ci/integration/reverse_text_rl_opd/start.toml +++ b/configs/ci/integration/reverse_text_rl_opd/start.toml @@ -17,7 +17,7 @@ name = "ci-rl-opd" [orchestrator] training_mode = "opd" batch_size = 128 -rollouts_per_example = 16 +group_size = 16 [orchestrator.renderer] name = "qwen3" diff --git a/configs/ci/integration/reverse_text_rl_sft/start.toml b/configs/ci/integration/reverse_text_rl_sft/start.toml index f3fe4448f4..6b26bb3335 100644 --- a/configs/ci/integration/reverse_text_rl_sft/start.toml +++ b/configs/ci/integration/reverse_text_rl_sft/start.toml @@ -17,7 +17,7 @@ 
name = "ci-rl-sft" [orchestrator] training_mode = "sft" batch_size = 128 -rollouts_per_example = 16 +group_size = 16 [orchestrator.train.sampling] max_completion_tokens = 128 diff --git a/configs/ci/nightly/acereason_math.toml b/configs/ci/nightly/acereason_math.toml index 0553e17ff3..4239ecac8a 100644 --- a/configs/ci/nightly/acereason_math.toml +++ b/configs/ci/nightly/acereason_math.toml @@ -15,7 +15,7 @@ name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" [orchestrator] batch_size = 1024 -rollouts_per_example = 8 +group_size = 8 [orchestrator.train.sampling] temperature = 0.6 diff --git a/configs/ci/nightly/multimodal_color_codeword.toml b/configs/ci/nightly/multimodal_color_codeword.toml index a90fdfb454..2ef83a63f0 100644 --- a/configs/ci/nightly/multimodal_color_codeword.toml +++ b/configs/ci/nightly/multimodal_color_codeword.toml @@ -15,7 +15,7 @@ language_model_attr = "model.language_model" [orchestrator] batch_size = 256 -rollouts_per_example = 16 +group_size = 16 [orchestrator.train.sampling] max_completion_tokens = 64 diff --git a/configs/debug/training_modes/opd.toml b/configs/debug/training_modes/opd.toml index b24cf3fe91..39cbf6a604 100644 --- a/configs/debug/training_modes/opd.toml +++ b/configs/debug/training_modes/opd.toml @@ -18,7 +18,7 @@ name = "debug-opd" [orchestrator] training_mode = "opd" batch_size = 128 -rollouts_per_example = 16 +group_size = 16 [orchestrator.renderer] name = "qwen3" diff --git a/configs/debug/training_modes/opd_lora.toml b/configs/debug/training_modes/opd_lora.toml index 135f083936..ba56ffea5c 100644 --- a/configs/debug/training_modes/opd_lora.toml +++ b/configs/debug/training_modes/opd_lora.toml @@ -18,7 +18,7 @@ name = "debug-opd-lora" [orchestrator] training_mode = "opd" batch_size = 128 -rollouts_per_example = 16 +group_size = 16 [orchestrator.renderer] name = "qwen3" diff --git a/configs/debug/training_modes/rl.toml b/configs/debug/training_modes/rl.toml index bbc3ff27f9..27838809b3 100644 --- 
a/configs/debug/training_modes/rl.toml +++ b/configs/debug/training_modes/rl.toml @@ -11,7 +11,7 @@ name = "debug-rl" [orchestrator] training_mode = "rl" batch_size = 128 -rollouts_per_example = 16 +group_size = 16 [orchestrator.renderer] name = "qwen3" diff --git a/configs/debug/training_modes/sft.toml b/configs/debug/training_modes/sft.toml index 3d583e6185..aed5b30cb3 100644 --- a/configs/debug/training_modes/sft.toml +++ b/configs/debug/training_modes/sft.toml @@ -18,7 +18,7 @@ name = "debug-sft" [orchestrator] training_mode = "sft" batch_size = 128 -rollouts_per_example = 4 +group_size = 4 [orchestrator.train.sampling] max_completion_tokens = 128 diff --git a/configs/debug/training_modes/sft_external.toml b/configs/debug/training_modes/sft_external.toml index 7fa5a478d9..cb9ea8d09e 100644 --- a/configs/debug/training_modes/sft_external.toml +++ b/configs/debug/training_modes/sft_external.toml @@ -17,7 +17,7 @@ name = "debug-sft-external" [orchestrator] training_mode = "sft" batch_size = 128 -rollouts_per_example = 4 +group_size = 4 [orchestrator.train.sampling] max_completion_tokens = 2048 diff --git a/configs/debug/training_modes/sft_lora.toml b/configs/debug/training_modes/sft_lora.toml index 560f94a321..687b45bbe3 100644 --- a/configs/debug/training_modes/sft_lora.toml +++ b/configs/debug/training_modes/sft_lora.toml @@ -18,7 +18,7 @@ name = "debug-sft-lora" [orchestrator] training_mode = "sft" batch_size = 128 -rollouts_per_example = 4 +group_size = 4 [orchestrator.train.sampling] max_completion_tokens = 128 diff --git a/configs/deepscaler/stage1.toml b/configs/deepscaler/stage1.toml index 68c95fdf26..09b5e16179 100644 --- a/configs/deepscaler/stage1.toml +++ b/configs/deepscaler/stage1.toml @@ -17,7 +17,7 @@ interval = 100 [orchestrator] batch_size = 1024 -rollouts_per_example = 8 +group_size = 8 [orchestrator.train.sampling] temperature = 0.6 diff --git a/configs/deepscaler/stage2.toml b/configs/deepscaler/stage2.toml index 77c51a0340..2bb3f9feea 100644 
--- a/configs/deepscaler/stage2.toml +++ b/configs/deepscaler/stage2.toml @@ -18,7 +18,7 @@ resume_step = 500 [orchestrator] batch_size = 1024 -rollouts_per_example = 8 +group_size = 8 [orchestrator.train.sampling] temperature = 0.6 diff --git a/configs/deepscaler/stage3.toml b/configs/deepscaler/stage3.toml index db42eddcf3..42ed754ae8 100644 --- a/configs/deepscaler/stage3.toml +++ b/configs/deepscaler/stage3.toml @@ -18,7 +18,7 @@ resume_step = 1000 [orchestrator] batch_size = 1024 -rollouts_per_example = 8 +group_size = 8 [orchestrator.train.sampling] temperature = 0.6 diff --git a/configs/elastic/rl.toml b/configs/elastic/rl.toml index 387bfddbf2..3073dc4537 100644 --- a/configs/elastic/rl.toml +++ b/configs/elastic/rl.toml @@ -30,7 +30,7 @@ lr = 1e-5 [orchestrator] batch_size = 512 -rollouts_per_example = 8 +group_size = 8 [orchestrator.train.sampling] max_completion_tokens = 768 diff --git a/configs/env_mix/env_mix.toml b/configs/env_mix/env_mix.toml index 64d8283fff..79429a3d9f 100644 --- a/configs/env_mix/env_mix.toml +++ b/configs/env_mix/env_mix.toml @@ -14,7 +14,7 @@ name = "Qwen/Qwen3-4B-Instruct-2507" [orchestrator] batch_size = 512 -rollouts_per_example = 16 +group_size = 16 [[orchestrator.train.env]] id = "math-env" diff --git a/configs/gsm8k/rl.toml b/configs/gsm8k/rl.toml index ae73b2c586..e3839c8770 100644 --- a/configs/gsm8k/rl.toml +++ b/configs/gsm8k/rl.toml @@ -10,7 +10,7 @@ name = "Qwen/Qwen3-0.6B" [orchestrator] batch_size = 512 -rollouts_per_example = 16 +group_size = 16 [orchestrator.train.sampling] max_completion_tokens = 2048 diff --git a/configs/hendrycks_math/rl.toml b/configs/hendrycks_math/rl.toml index acc1434ea9..b4fb071ba4 100644 --- a/configs/hendrycks_math/rl.toml +++ b/configs/hendrycks_math/rl.toml @@ -10,7 +10,7 @@ name = "Qwen/Qwen3-4B-Instruct-2507" [orchestrator] batch_size = 512 -rollouts_per_example = 16 +group_size = 16 [orchestrator.train.sampling] max_completion_tokens = 2048 @@ -33,7 +33,7 @@ max_completion_tokens = 
2048 [[orchestrator.eval.env]] id = "math500" num_examples = 30 -rollouts_per_example = 4 +group_size = 4 [trainer] # Default trainer config diff --git a/configs/hendrycks_math/sanity.toml b/configs/hendrycks_math/sanity.toml index 9174350867..11da4f0bbf 100644 --- a/configs/hendrycks_math/sanity.toml +++ b/configs/hendrycks_math/sanity.toml @@ -10,7 +10,7 @@ name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" [orchestrator] batch_size = 512 -rollouts_per_example = 8 +group_size = 8 [[orchestrator.train.env]] id = "math-env" @@ -23,7 +23,7 @@ interval = 50 [[orchestrator.eval.env]] id = "aime2024" name = "aime2024" -rollouts_per_example = 16 +group_size = 16 [trainer.model.ac] diff --git a/configs/math_group/rl.toml b/configs/math_group/rl.toml index 965cb1cd1c..51781dc9f2 100644 --- a/configs/math_group/rl.toml +++ b/configs/math_group/rl.toml @@ -9,7 +9,7 @@ name = "Qwen/Qwen3-4B-Instruct-2507" [orchestrator] batch_size = 256 -rollouts_per_example = 8 +group_size = 8 [[orchestrator.train.env]] id = "math-env" @@ -34,7 +34,7 @@ interval = 50 id = "aime2024" name = "aime2024" num_examples = 30 -rollouts_per_example = 4 +group_size = 4 [trainer.model] seq_len = 4096 diff --git a/configs/math_python/math_python.toml b/configs/math_python/math_python.toml index ae9ec70035..b5dae46f92 100644 --- a/configs/math_python/math_python.toml +++ b/configs/math_python/math_python.toml @@ -10,7 +10,7 @@ name = "Qwen/Qwen3-4B-Instruct-2507" [orchestrator] batch_size = 512 -rollouts_per_example = 16 +group_size = 16 [orchestrator.train.sampling] max_completion_tokens = 512 diff --git a/configs/multi_reverse_text/rl.toml b/configs/multi_reverse_text/rl.toml index f00e68ab9e..d19602c543 100644 --- a/configs/multi_reverse_text/rl.toml +++ b/configs/multi_reverse_text/rl.toml @@ -6,7 +6,7 @@ name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-SFT" [orchestrator] batch_size = 128 -rollouts_per_example = 16 +group_size = 16 # -- train envs -- @@ -36,13 +36,13 @@ max_completion_tokens = 512 id 
= "reverse-text" name = "eval-default" num_examples = 32 -rollouts_per_example = 4 +group_size = 4 [[orchestrator.eval.env]] id = "reverse-text" name = "eval-custom" num_examples = 16 -rollouts_per_example = 2 +group_size = 2 interval = 10 [orchestrator.eval.env.sampling] diff --git a/configs/multimodal/rl_color_codeword.toml b/configs/multimodal/rl_color_codeword.toml index a98f9ae7f8..35cfcab809 100644 --- a/configs/multimodal/rl_color_codeword.toml +++ b/configs/multimodal/rl_color_codeword.toml @@ -10,7 +10,7 @@ language_model_attr = "model.language_model" [orchestrator] batch_size = 256 -rollouts_per_example = 16 +group_size = 16 [orchestrator.train.sampling] max_completion_tokens = 64 diff --git a/configs/multimodal/rl_color_codeword_feat_renderer.toml b/configs/multimodal/rl_color_codeword_feat_renderer.toml index 599ff2755f..d7fcce03c6 100644 --- a/configs/multimodal/rl_color_codeword_feat_renderer.toml +++ b/configs/multimodal/rl_color_codeword_feat_renderer.toml @@ -38,7 +38,7 @@ gpus_per_node = 2 [orchestrator] batch_size = 16 -rollouts_per_example = 8 +group_size = 8 use_renderer = true # Track zero-advantage groups but don't drop them — we're validating the @@ -64,7 +64,7 @@ args = { images_per_turn = 2, max_turns = 2, num_examples = 100, seed = 42 } [orchestrator.renderer] name = "auto" -# 64 concurrent rollouts (batch_size=16 × rollouts_per_example=4) want +# Concurrent rollouts (batch_size=16, group_size=8) want # more than one tokenizer slot to avoid serialization queueing. The # image processor (CPU-bound) dominates for VLMs so returns diminish # past 4; bump to 4 as the default for multimodal runs.
diff --git a/configs/multimodal/rl_color_codeword_test.toml b/configs/multimodal/rl_color_codeword_test.toml index 151bad0987..23a25a94f9 100644 --- a/configs/multimodal/rl_color_codeword_test.toml +++ b/configs/multimodal/rl_color_codeword_test.toml @@ -11,7 +11,7 @@ language_model_attr = "model.language_model" [orchestrator] batch_size = 16 -rollouts_per_example = 2 +group_size = 2 [orchestrator.train.sampling] max_completion_tokens = 32 diff --git a/configs/nemotron_4node/rl.toml b/configs/nemotron_4node/rl.toml index 0fda06a049..b46fbe1c41 100644 --- a/configs/nemotron_4node/rl.toml +++ b/configs/nemotron_4node/rl.toml @@ -46,7 +46,7 @@ max_inflight_activations = 5 [orchestrator] batch_size = 128 -rollouts_per_example = 8 +group_size = 8 [orchestrator.train.sampling] max_completion_tokens = 2048 @@ -69,7 +69,7 @@ max_completion_tokens = 2048 [[orchestrator.eval.env]] id = "math500" num_examples = 30 -rollouts_per_example = 4 +group_size = 4 [inference.parallel] tp = 8 diff --git a/configs/nemotron_debug/rl.toml b/configs/nemotron_debug/rl.toml index 9e1e921b09..342840dfd7 100644 --- a/configs/nemotron_debug/rl.toml +++ b/configs/nemotron_debug/rl.toml @@ -45,7 +45,7 @@ freq = 1 [orchestrator] batch_size = 512 -rollouts_per_example = 16 +group_size = 16 [orchestrator.train.sampling] max_completion_tokens = 4096 @@ -68,7 +68,7 @@ max_completion_tokens = 4096 [[orchestrator.eval.env]] id = "math500" num_examples = 30 -rollouts_per_example = 4 +group_size = 4 [inference.parallel] tp = 4 diff --git a/configs/wiki_search/rl.toml b/configs/wiki_search/rl.toml index d688ba21dc..ebf0037b03 100644 --- a/configs/wiki_search/rl.toml +++ b/configs/wiki_search/rl.toml @@ -27,7 +27,7 @@ target_modules = [ [orchestrator] batch_size = 512 -rollouts_per_example = 16 +group_size = 16 oversampling_factor = 2.0 [orchestrator.train.sampling] diff --git a/docs/bring-your-own-algorithms.md b/docs/bring-your-own-algorithms.md index fba1b1072f..a81549cacd 100644 --- 
a/docs/bring-your-own-algorithms.md +++ b/docs/bring-your-own-algorithms.md @@ -70,7 +70,7 @@ kwargs = { clip_eps = 0.2 } ## 2. Custom Advantage Functions -Advantages are computed **per-group** (one example × N rollouts). You provide a function that computes advantages for a single group; the framework calls it once per group and stitches the results back together. Groups may have fewer than `rollouts_per_example` rollouts when some rollouts in the group errored (partial-group training). +Advantages are computed **per-group** (one example × N rollouts). You provide a function that computes advantages for a single group; the framework calls it once per group and stitches the results back together. Groups may have fewer than `group_size` rollouts when some rollouts in the group errored (partial-group training). ### Interface diff --git a/docs/slurm.md b/docs/slurm.md index 7b171d4894..17a38ae8ec 100644 --- a/docs/slurm.md +++ b/docs/slurm.md @@ -138,7 +138,7 @@ freq = 1 [orchestrator] batch_size = 512 -rollouts_per_example = 16 +group_size = 16 [orchestrator.sampling] max_tokens = 2048 diff --git a/examples/alphabet_sort/rl.toml b/examples/alphabet_sort/rl.toml index 480cc9f04b..be35366d8c 100644 --- a/examples/alphabet_sort/rl.toml +++ b/examples/alphabet_sort/rl.toml @@ -26,7 +26,7 @@ lr = 1e-5 [orchestrator] batch_size = 512 -rollouts_per_example = 8 +group_size = 8 [orchestrator.train.sampling] max_completion_tokens = 768 diff --git a/examples/glm5_pd_disag/rl.toml b/examples/glm5_pd_disag/rl.toml index 91ac41300a..1ec1a5b435 100644 --- a/examples/glm5_pd_disag/rl.toml +++ b/examples/glm5_pd_disag/rl.toml @@ -62,7 +62,7 @@ weight_decay = 0.1 [orchestrator] batch_size = 4096 -rollouts_per_example = 16 +group_size = 16 oversampling_factor = 3 max_off_policy_steps = 16 diff --git a/examples/hendrycks_sanity/rl.toml b/examples/hendrycks_sanity/rl.toml index 10b7b1ce62..681e2682a7 100644 --- a/examples/hendrycks_sanity/rl.toml +++ b/examples/hendrycks_sanity/rl.toml 
@@ -13,7 +13,7 @@ name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" [orchestrator] batch_size = 512 -rollouts_per_example = 8 +group_size = 8 seq_len = 8192 [[orchestrator.train.env]] @@ -27,7 +27,7 @@ interval = 50 [[orchestrator.eval.env]] id = "primeintellect/aime2024" name = "aime2024" -rollouts_per_example = 32 +group_size = 32 [trainer.model] seq_len = 16384 diff --git a/examples/multinode/rl.toml b/examples/multinode/rl.toml index 6d2854077a..f5ee93d16b 100644 --- a/examples/multinode/rl.toml +++ b/examples/multinode/rl.toml @@ -36,7 +36,7 @@ freq = 1 [orchestrator] batch_size = 512 -rollouts_per_example = 16 +group_size = 16 [orchestrator.train.sampling] max_completion_tokens = 2048 diff --git a/examples/reverse_text/rl.toml b/examples/reverse_text/rl.toml index 75a4530cdd..6b3db70a55 100644 --- a/examples/reverse_text/rl.toml +++ b/examples/reverse_text/rl.toml @@ -10,7 +10,7 @@ name = "reverse-text" [orchestrator] batch_size = 128 -rollouts_per_example = 16 +group_size = 16 [orchestrator.train.sampling] max_completion_tokens = 128 diff --git a/examples/wiki_search/rl.toml b/examples/wiki_search/rl.toml index 6abbb3d815..599d70658f 100644 --- a/examples/wiki_search/rl.toml +++ b/examples/wiki_search/rl.toml @@ -31,7 +31,7 @@ target_modules = [ [orchestrator] batch_size = 512 -rollouts_per_example = 16 +group_size = 16 oversampling_factor = 2.0 [orchestrator.model.lora] diff --git a/examples/wordle/rl.toml b/examples/wordle/rl.toml index 07ac44d863..c16419ef81 100644 --- a/examples/wordle/rl.toml +++ b/examples/wordle/rl.toml @@ -16,7 +16,7 @@ name = "PrimeIntellect/Qwen3-1.7B-Wordle-SFT" [orchestrator] batch_size = 1024 -rollouts_per_example = 16 +group_size = 16 [[orchestrator.train.env]] id = "primeintellect/wordle" diff --git a/packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py b/packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py index 1507f96079..8b68efe9cd 100644 --- 
a/packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py +++ b/packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py @@ -250,8 +250,8 @@ class EvalEnvConfig(EnvConfig): num_examples: int = -1 """Eval examples to sample from the dataset. ``-1`` uses all available examples.""" - rollouts_per_example: int = Field(1, ge=1) - """Rollouts generated per example. Used for pass@k estimation (e.g. ``rollouts_per_example=8`` enables pass@1 through pass@8).""" + group_size: int = Field(1, ge=1, validation_alias=AliasChoices("group_size", "rollouts_per_example")) + """Rollouts generated per example. Used for pass@k estimation (e.g. ``group_size=8`` enables pass@1 through pass@8).""" interval: int = Field(100, ge=1) """Per-env eval interval. If unset, inherits from the group-level eval interval.""" @@ -316,7 +316,7 @@ class EvalConfig(BaseConfig): num_examples: int = -1 """Default eval examples per environment. ``-1`` uses all. Can be overridden per env.""" - rollouts_per_example: int = Field(1, ge=1) + group_size: int = Field(1, ge=1, validation_alias=AliasChoices("group_size", "rollouts_per_example")) """Default rollouts per example. Can be overridden per env.""" num_workers: int | Literal["auto"] = "auto" @@ -330,7 +330,7 @@ class EvalConfig(BaseConfig): @model_validator(mode="after") def resolve_env_defaults(self): - """Resolve per-env overrides: inherit group-level sampling, num_workers, max_retries, num_examples, rollouts_per_example, and interval. Then resolve auto num_workers.""" + """Resolve per-env overrides: inherit group-level sampling, num_workers, max_retries, num_examples, group_size, and interval. 
Then resolve auto num_workers.""" group_sampling = self.sampling.model_dump() for env in self.env: if "sampling" not in env.model_fields_set: @@ -340,20 +340,20 @@ def resolve_env_defaults(self): env.sampling = EvalSamplingConfig(**merged) if "num_examples" not in env.model_fields_set: env.num_examples = self.num_examples - if "rollouts_per_example" not in env.model_fields_set: - env.rollouts_per_example = self.rollouts_per_example + if "group_size" not in env.model_fields_set: + env.group_size = self.group_size if "interval" not in env.model_fields_set: env.interval = self.interval if "num_workers" not in env.model_fields_set: env.num_workers = self.num_workers if "max_retries" not in env.model_fields_set: env.max_retries = self.max_retries - # Resolve auto num_workers now that num_examples and rollouts_per_example are set + # Resolve auto num_workers now that num_examples and group_size are set if env.num_workers == "auto": if env.num_examples == -1: env.num_workers = 4 else: - max_concurrent = env.num_examples * env.rollouts_per_example + max_concurrent = env.num_examples * env.group_size env.num_workers = max(1, math.ceil(max_concurrent / 256)) return self @@ -622,7 +622,7 @@ class OrchestratorConfig(BaseConfig): max_inflight_rollouts: int | None = Field(None, ge=1) """Maximum number of rollouts kept in-flight. Required for token-based batching. 
With ``batch_size`` set, defaults to ``batch_size * oversampling_factor`` (or ``batch_size`` when ``oversampling_factor`` is unset).""" - rollouts_per_example: int = Field(1, ge=1) + group_size: int = Field(1, ge=1, validation_alias=AliasChoices("group_size", "rollouts_per_example")) """Output sequences returned per example during training.""" seq_len: int = 2048 @@ -907,11 +907,11 @@ def resolve_batching(self): raise ValueError("max_inflight_rollouts must be set when token_batch_size is set") else: assert self.batch_size is not None - if self.batch_size % self.rollouts_per_example != 0: + if self.batch_size % self.group_size != 0: raise ValueError("Batch size must be divisible by the number of samples per problem") oversampling_factor = self.oversampling_factor if self.oversampling_factor is not None else 1.0 resolved_max_inflight_rollouts = max( - self.rollouts_per_example, + self.group_size, int(self.batch_size * oversampling_factor), ) if self.max_inflight_rollouts is not None and self.oversampling_factor is not None: @@ -921,7 +921,7 @@ def resolve_batching(self): if self.max_inflight_rollouts is None: self.max_inflight_rollouts = resolved_max_inflight_rollouts - if self.max_inflight_rollouts is not None and self.max_inflight_rollouts < self.rollouts_per_example: + if self.max_inflight_rollouts is not None and self.max_inflight_rollouts < self.group_size: raise ValueError("max_inflight_rollouts must be at least the number of rollouts per example") # Resolve train env num_workers from max_inflight_rollouts diff --git a/src/prime_rl/orchestrator/envs.py b/src/prime_rl/orchestrator/envs.py index c7ac150aa6..0fdace4e2c 100644 --- a/src/prime_rl/orchestrator/envs.py +++ b/src/prime_rl/orchestrator/envs.py @@ -142,12 +142,12 @@ async def run_group( client: vf.ClientConfig, example: dict, model_name: str, - rollouts_per_example: int, + group_size: int, cache_salt: str, ) -> list[vf.RolloutOutput]: """Run a group of rollouts for an example. 
Required for group-scoring envs.""" return await self.env.run_group( - [vf.RolloutInput(**example) for _ in range(rollouts_per_example)], + [vf.RolloutInput(**example) for _ in range(group_size)], client=client, model=model_name, sampling_args=self._sampling_args_with_salt(cache_salt), @@ -191,30 +191,30 @@ async def evaluate( cache_salt: str, ) -> list[vf.RolloutOutput]: num_examples = len(self.examples) - rollouts_per_example = self.config.rollouts_per_example - get_logger().info(f"Evaluating {self.name} ({num_examples=}, {rollouts_per_example=})") - total_rollouts = num_examples * rollouts_per_example + group_size = self.config.group_size + get_logger().info(f"Evaluating {self.name} ({num_examples=}, {group_size=})") + total_rollouts = num_examples * group_size pbar = ProgressTracker(total=total_rollouts, desc=f"Evaluating {self.name}") eval_start = time.perf_counter() if self.requires_group_scoring: async def run_with_progress(example: dict) -> list[vf.RolloutOutput] | None: - """Run rollouts_per_example rollouts as a scored group for one example.""" + """Run group_size rollouts as a scored group for one example.""" try: client = await get_client() outputs = await self.run_group( client=client, example=example, model_name=model_name, - rollouts_per_example=rollouts_per_example, + group_size=group_size, cache_salt=cache_salt, ) - pbar.update(rollouts_per_example) + pbar.update(group_size) return outputs except Exception as e: get_logger().warning(f"Group failed: {e}") - pbar.update(rollouts_per_example) + pbar.update(group_size) return None coros = [run_with_progress(example) for example in self.examples] @@ -235,7 +235,7 @@ async def run_with_progress(example: dict) -> list[vf.RolloutOutput] | None: pbar.update(1) return None - coros = [run_with_progress(example) for example in self.examples for _ in range(rollouts_per_example)] + coros = [run_with_progress(example) for example in self.examples for _ in range(group_size)] try: results = await 
asyncio.gather(*coros) @@ -291,9 +291,7 @@ async def run_with_progress(example: dict) -> list[vf.RolloutOutput] | None: pass_at_k = None get_logger().warning("Skipping computing pass@k rates because the task rewards appear to be non-binary") - message = ( - f"Evaluated {self.name} in {eval_time:.2f}s (Avg@{rollouts_per_example}={results_df.reward.mean():.4f}" - ) + message = f"Evaluated {self.name} in {eval_time:.2f}s (Avg@{group_size}={results_df.reward.mean():.4f}" if could_be_binary: assert pass_at_k is not None for pass_rate, pass_rate_score in pd.Series(pass_at_k.mean()).items(): @@ -307,7 +305,7 @@ async def run_with_progress(example: dict) -> list[vf.RolloutOutput] | None: get_logger().success(message) eval_metrics = { - f"avg@{rollouts_per_example}": float(results_df.reward.mean()), + f"avg@{group_size}": float(results_df.reward.mean()), "no_response/mean": float(results_df.no_response.mean()), "no_response/count": int(results_df.no_response.sum()), "completion_len/mean": results_df.completion_len.mean().item(), diff --git a/src/prime_rl/orchestrator/orchestrator.py b/src/prime_rl/orchestrator/orchestrator.py index 5e5932ef58..1871bfca85 100644 --- a/src/prime_rl/orchestrator/orchestrator.py +++ b/src/prime_rl/orchestrator/orchestrator.py @@ -614,7 +614,7 @@ def compute_solve_rates(df): """Compute solve_none, solve_all, effective_batch_size for a set of rollouts.""" reward_per_problem = df.groupby(["env_name", "example_id"]).reward.sum() solve_none = (reward_per_problem == 0).mean() - solve_all = (reward_per_problem == config.rollouts_per_example).mean() + solve_all = (reward_per_problem == config.group_size).mean() return solve_none, solve_all, 1 - solve_none - solve_all # Group by (env_name, example_id) to average across rollouts within each problem diff --git a/src/prime_rl/orchestrator/scheduler.py b/src/prime_rl/orchestrator/scheduler.py index 7b24ad9e49..c608ae0d1e 100644 --- a/src/prime_rl/orchestrator/scheduler.py +++ 
b/src/prime_rl/orchestrator/scheduler.py @@ -80,7 +80,7 @@ def __init__( self.config = config self.batch_size = config.batch_size self.token_batch_size = config.token_batch_size - self.rollouts_per_example = config.rollouts_per_example + self.group_size = config.group_size self.max_inflight_rollouts = max_inflight_rollouts self.max_async_level = max_async_level self.max_off_policy_steps = max_off_policy_steps @@ -219,7 +219,7 @@ async def schedule_rollout(self, group_id: int): client=client_config, example=group.example, model_name=self.model_name, - rollouts_per_example=rollout_count, + group_size=rollout_count, cache_salt=cache_salt, ) ) @@ -266,13 +266,13 @@ async def _schedule_next_request(self) -> bool: await self.schedule_rollout(group_id=group_id) return True - if remaining_capacity < self.rollouts_per_example: + if remaining_capacity < self.group_size: return False example = self.buffer.sample_examples(n=1)[0] group_id = self.next_group_id self.next_group_id += 1 - self.groups[group_id] = GroupState(example=example, rollouts_to_schedule=self.rollouts_per_example) + self.groups[group_id] = GroupState(example=example, rollouts_to_schedule=self.group_size) await self.schedule_rollout(group_id=group_id) return True @@ -475,16 +475,16 @@ async def generate_batch(self, step: int) -> list[vf.RolloutOutput]: # Wait until every dispatched rollout has come back (succeeded # or failed) before finalizing. The group may finalize as a - # partial group (< rollouts_per_example) when some rollouts + # partial group (< group_size) when some rollouts # errored - downstream advantage computation groups by # (env_name, example_id), so variable-size groups are fine. 
- if len(group.completed_rollouts) + group.failed_rollouts < self.rollouts_per_example: + if len(group.completed_rollouts) + group.failed_rollouts < self.group_size: continue if not group.completed_rollouts: self.dropped_groups_by_env[env_name] += 1 self.logger.warning( - f"Dropping group {group_id} ({env_name}) - all {self.rollouts_per_example} rollouts failed" + f"Dropping group {group_id} ({env_name}) - all {self.group_size} rollouts failed" ) self.groups.pop(group_id, None) continue @@ -492,7 +492,7 @@ async def generate_batch(self, step: int) -> list[vf.RolloutOutput]: if group.failed_rollouts > 0: self.logger.warning( f"Partial group {group_id} ({env_name}) - " - f"{len(group.completed_rollouts)}/{self.rollouts_per_example} valid " + f"{len(group.completed_rollouts)}/{self.group_size} valid " f"({group.failed_rollouts} failed)" ) diff --git a/tests/unit/orchestrator/test_advantage.py b/tests/unit/orchestrator/test_advantage.py index 93c2b7ea4e..6acd1057e4 100644 --- a/tests/unit/orchestrator/test_advantage.py +++ b/tests/unit/orchestrator/test_advantage.py @@ -288,7 +288,7 @@ def test_compute_advantages_without_config(): def test_compute_advantages_partial_groups(): - """Partial groups (size < rollouts_per_example) are advantaged against their own mean. + """Partial groups (size < group_size) are advantaged against their own mean. Two groups of different sizes must round-trip cleanly: each group's advantages must sum to zero and not leak into the other. 
diff --git a/tests/unit/test_configs.py b/tests/unit/test_configs.py index 66ce195bc6..77e6c84bf1 100644 --- a/tests/unit/test_configs.py +++ b/tests/unit/test_configs.py @@ -435,7 +435,7 @@ def test_shared_output_dir_propagates_through_cli(tmp_path): "seq_len": 128, "model": {"name": "Qwen/Qwen3-0.6B"}, "trainer": {}, - "orchestrator": {"batch_size": 16, "rollouts_per_example": 1}, + "orchestrator": {"batch_size": 16, "group_size": 1}, "inference": {}, }, ) diff --git a/tests/unit/train/rl/test_packer.py b/tests/unit/train/rl/test_packer.py index 7068e0665a..187eeecf5a 100644 --- a/tests/unit/train/rl/test_packer.py +++ b/tests/unit/train/rl/test_packer.py @@ -29,7 +29,7 @@ def create_run_with_config(output_dir: Path, run_name: str) -> Path: config = { "model": {"name": "test-model"}, "batch_size": 2, - "rollouts_per_example": 1, + "group_size": 1, "env": [{"id": "test-env"}], "sampling": {"temperature": 1.0}, # test-model isn't in MODEL_RENDERER_MAP; bypass the renderer-resolution validator. diff --git a/tests/unit/train/test_runs.py b/tests/unit/train/test_runs.py index b80da9c91e..883ef91bdb 100644 --- a/tests/unit/train/test_runs.py +++ b/tests/unit/train/test_runs.py @@ -40,7 +40,7 @@ def create_run_with_config( config = { "model": {"name": "test-model"}, "batch_size": 32, - "rollouts_per_example": 4, + "group_size": 4, "env": [{"id": "test-env"}], # test-model isn't in MODEL_RENDERER_MAP; bypass the renderer-resolution validator. 
"use_renderer": False, @@ -201,7 +201,7 @@ def test_config_loading(tmp_path: Path) -> None: "model": {"name": "test-model"}, "batch_size": 32, "max_steps": 1000, - "rollouts_per_example": 4, + "group_size": 4, "env": [{"id": "test-env"}], "use_renderer": False, } @@ -246,7 +246,7 @@ def test_config_cleanup_on_deletion(tmp_path: Path) -> None: test_config = { "model": {"name": "test-model"}, "batch_size": 16, - "rollouts_per_example": 4, + "group_size": 4, "env": [{"id": "test-env"}], "use_renderer": False, } @@ -277,7 +277,7 @@ def test_config_invalid(tmp_path: Path) -> None: invalid_config = { "model": {"name": "test-model"}, "batch_size": "not-a-number", # Invalid type - "rollouts_per_example": 4, + "group_size": 4, "env": [{"id": "test-env"}], } run_dir = create_run_with_config(tmp_path, "run_invalid", config=invalid_config)