From fdf1c5610bdaf5ea5064ea70bbc5ef7d3488bf46 Mon Sep 17 00:00:00 2001
From: Rares Gaia
Date: Fri, 5 Dec 2025 16:25:25 +0200
Subject: [PATCH] fix: planner cleanup on job complete

Whenever a job is stopped, fails, or completes, the planner still keeps
a record of the job's current pipelines. If the same job (or a new job
with the same id) is later restarted, the planner can wrongly decide to
do a scale_in whenever the queue level allows it.

To fix this, the planner now removes its pipeline data for a job
whenever the job context performs its cleanup.
---
 infscale/controller/job_context.py | 1 +
 infscale/controller/planner.py     | 5 +++++
 2 files changed, 6 insertions(+)

diff --git a/infscale/controller/job_context.py b/infscale/controller/job_context.py
index 7db1292..cfa8a46 100644
--- a/infscale/controller/job_context.py
+++ b/infscale/controller/job_context.py
@@ -1270,6 +1270,7 @@ def cleanup(self) -> None:
         self._new_cfg = None
         self._flow_graph_patched = False
         self._worlds_conflict_count = {}
+        self.ctrl.planner.remove_pipeline_data(self.job_id)
 
     def _release_gpu_resources(self, agent_data: AgentMetaData) -> None:
         resources = self.ctrl.agent_contexts[agent_data.id].resources
diff --git a/infscale/controller/planner.py b/infscale/controller/planner.py
index f9bcc2a..9210cd6 100644
--- a/infscale/controller/planner.py
+++ b/infscale/controller/planner.py
@@ -110,6 +110,11 @@ def __init__(self, path: str, autoscale: bool) -> None:
 
         self.pipeline_data: dict[str, list[PipelineData]] = {}
 
+    def remove_pipeline_data(self, job_id: str) -> None:
+        """Remove pipeline data for job id."""
+        if job_id in self.pipeline_data:
+            del self.pipeline_data[job_id]
+
     def update_pipeline_data(self, wids_to_remove: set[str], job_id: str) -> None:
         """Update pipeline data based on worker ids."""
         if job_id not in self.pipeline_data:
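
A minimal regression-test sketch for the new cleanup path follows; it is
not part of the diff above. It assumes Planner(path, autoscale) can be
instantiated with a throwaway path in a test environment; if the
constructor requires a real planner config, the planner would instead
need to be built through the controller's normal setup. Note also that
the body of the new method is equivalent to the one-liner
self.pipeline_data.pop(job_id, None).

from infscale.controller.planner import Planner


def test_remove_pipeline_data_clears_stale_job_state(tmp_path):
    # Assumption: a Planner can be constructed from a throwaway path.
    planner = Planner(str(tmp_path), autoscale=True)

    # Simulate leftover state from a finished job; an empty list stands in
    # for real PipelineData entries, since only the key's presence matters.
    planner.pipeline_data["job-1"] = []

    planner.remove_pipeline_data("job-1")
    assert "job-1" not in planner.pipeline_data

    # Removing an unknown (or already removed) job id is a no-op.
    planner.remove_pipeline_data("job-1")
    assert "job-1" not in planner.pipeline_data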