diff --git a/skyrl/train/trainer.py b/skyrl/train/trainer.py
index 91e38d3263..2124a74431 100644
--- a/skyrl/train/trainer.py
+++ b/skyrl/train/trainer.py
@@ -248,14 +248,24 @@ async def train(self):
         with Timer("postprocess_generator_output", self.all_timings):
             generator_output = self.postprocess_generator_output(generator_output, uids)
 
-        # 2. print example just for debugging
-        vis = self.tokenizer.decode(generator_output["response_ids"][0])
-        log_example(
-            logger,
-            prompt=generator_input["prompts"][0],
-            response=vis,
-            reward=generator_output["rewards"][0],
-        )
+        # 2. Print all trajectories for debugging.
+        # Prefer trajectory_ids from generator_output (step-wise mode expands them per step),
+        # fall back to generator_input for standard mode.
+        trajectory_ids = generator_output.get("trajectory_ids") or generator_input.get("trajectory_ids", [])
+        input_prompts = generator_input.get("prompts", [])
+        for i, (response_ids, reward) in enumerate(
+            zip(generator_output["response_ids"], generator_output["rewards"])
+        ):
+            vis = self.tokenizer.decode(response_ids)
+            # None is the explicit "no id available" sentinel, so test identity, not truthiness.
+            tid = trajectory_ids[i] if i < len(trajectory_ids) else None
+            tid_str = f"instance={tid.instance_id}, rep={tid.repetition_id}" if tid is not None else f"idx={i}"
+            prompt = input_prompts[i] if i < len(input_prompts) else ""
+            log_example(
+                logger,
+                prompt=prompt,
+                response=f"[Trajectory {tid_str}]\n{vis}",
+                reward=reward,
+            )
 
         # 3. Convert GeneratorOutput to TrainingInputBatch
         with Timer("convert_to_training_input", self.all_timings):