🤝 validate gradient_accumulation_steps vs steps_per_generation for on-policy GRPO (#3493)

HarryHsing · shirinyamani · web-flow · commit 32df09358ece · 2025-06-25T18:03:22.000+02:00
Co-authored-by: Shirin Yamani <75791599+shirinyamani@users.noreply.github.com>
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
@@ -1204,7 +1204,7 @@ def _generate_and_score_completions(
             # When num_iterations == 1 and gradient_accumulation_steps is a multiple of steps_per_generation,
             # old_per_token_logps == per_token_logps, so we can skip its computation here and use
             # per_token_logps.detach() instead.
-            if self.num_iterations > 1 or self.args.steps_per_generation > self.args.gradient_accumulation_steps:
+            if self.num_iterations > 1 or self.args.gradient_accumulation_steps % self.args.steps_per_generation != 0:
                 old_per_token_logps = self._get_per_token_logps(
                     self.model, prompt_completion_ids, attention_mask, logits_to_keep, batch_size
                 )

Original file line number	Diff line number	Diff line change
`@@ -1204,7 +1204,7 @@ def _generate_and_score_completions(`
`1204`	`1204`	`# When num_iterations == 1 and gradient_accumulation_steps is a multiple of steps_per_generation,`
`1205`	`1205`	`# old_per_token_logps == per_token_logps, so we can skip its computation here and use`
`1206`	`1206`	`# per_token_logps.detach() instead.`
`1207`		`- if self.num_iterations > 1 or self.args.steps_per_generation > self.args.gradient_accumulation_steps:`
	`1207`	`+ if self.num_iterations > 1 or self.args.gradient_accumulation_steps % self.args.steps_per_generation != 0:`
`1208`	`1208`	`old_per_token_logps = self._get_per_token_logps(`
`1209`	`1209`	`self.model, prompt_completion_ids, attention_mask, logits_to_keep, batch_size`
`1210`	`1210`	`)`