From fdf1c5610bdaf5ea5064ea70bbc5ef7d3488bf46 Mon Sep 17 00:00:00 2001
From: Rares Gaia
Date: Fri, 5 Dec 2025 16:25:25 +0200
Subject: [PATCH] fix: planner cleanup on job complete

Whenever a job is stopped, fails, or completes, the planner still keeps
a record of the job's current pipelines. If the same job (or a new job
with the same id) is later restarted, the planner can wrongly decide to
do a scale_in whenever the queue level allows it.

To fix this, the planner now removes its pipeline data for a job
whenever the job context performs its cleanup.
---
 infscale/controller/job_context.py | 1 +
 infscale/controller/planner.py     | 5 +++++
 2 files changed, 6 insertions(+)

diff --git a/infscale/controller/job_context.py b/infscale/controller/job_context.py
index 7db1292..cfa8a46 100644
--- a/infscale/controller/job_context.py
+++ b/infscale/controller/job_context.py
@@ -1270,6 +1270,7 @@ def cleanup(self) -> None:
         self._new_cfg = None
         self._flow_graph_patched = False
         self._worlds_conflict_count = {}
+        self.ctrl.planner.remove_pipeline_data(self.job_id)
 
     def _release_gpu_resources(self, agent_data: AgentMetaData) -> None:
         resources = self.ctrl.agent_contexts[agent_data.id].resources
diff --git a/infscale/controller/planner.py b/infscale/controller/planner.py
index f9bcc2a..9210cd6 100644
--- a/infscale/controller/planner.py
+++ b/infscale/controller/planner.py
@@ -110,6 +110,11 @@ def __init__(self, path: str, autoscale: bool) -> None:
 
         self.pipeline_data: dict[str, list[PipelineData]] = {}
 
+    def remove_pipeline_data(self, job_id: str) -> None:
+        """Remove pipeline data for job id."""
+        if job_id in self.pipeline_data:
+            del self.pipeline_data[job_id]
+
     def update_pipeline_data(self, wids_to_remove: set[str], job_id: str) -> None:
         """Update pipeline data based on worker ids."""
         if job_id not in self.pipeline_data:
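
A minimal regression-test sketch for the new cleanup path follows; it is
not part of the diff above. It assumes Planner(path, autoscale) can be
instantiated with a throwaway path in a test environment; if the
constructor requires a real planner config, the planner would instead
need to be built through the controller's normal setup. Note also that
the body of the new method is equivalent to the one-liner
self.pipeline_data.pop(job_id, None).

from infscale.controller.planner import Planner


def test_remove_pipeline_data_clears_stale_job_state(tmp_path):
    # Assumption: a Planner can be constructed from a throwaway path.
    planner = Planner(str(tmp_path), autoscale=True)

    # Simulate leftover state from a finished job; an empty list stands in
    # for real PipelineData entries, since only the key's presence matters.
    planner.pipeline_data["job-1"] = []

    planner.remove_pipeline_data("job-1")
    assert "job-1" not in planner.pipeline_data

    # Removing an unknown (or already removed) job id is a no-op.
    planner.remove_pipeline_data("job-1")
    assert "job-1" not in planner.pipeline_data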