From 3cd1f6a47f65495316939a001e848296832b67be Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 1 Apr 2026 21:15:35 +0000 Subject: [PATCH 1/2] Fix critical bugs in sgh_restart_ensemble test case Bug #1: config.get('restart_ensemble', {}) crashes because MpasConfigParser.get() expects (section, option) positional args, not a dict fallback. Fixed: config['restart_ensemble'] returns a SectionProxy with proper .get()/.getint()/.getfloat()/.getboolean() methods. Bug #2: _should_restart_run() looked for per-run analysis_results.json files that are never written. AnalysisStep writes analysis_summary.json to its own work dir containing an individual_results dict for all runs. Fixed: add analysis_summary_file config option; configure() loads the file and passes per-run dicts to _should_restart_run() via a new run_results param. RestartScheduler.create_config_file() now includes analysis_summary_file in generated configs. Bug #3: restart_attempt_N/ tracking dirs were never created by InPlaceRestartMember.setup(), so max_consecutive_restarts was effectively disabled and all attempt counters read 0. Fixed: setup() now creates restart_attempt_N/ dirs using a single os.listdir() call to find the highest existing attempt number. Bug #5: restart_scheduler.py docstring Examples section referenced a non-existent module path. Fixed to the correct path. Agent-Logs-Url: https://github.com/alexolinhager/compass/sessions/ca2d29bf-1246-415c-bf2c-9de7521fa55f Co-authored-by: alexolinhager <131483939+alexolinhager@users.noreply.github.com> --- .../ensemble_generator.cfg | 21 +++-- .../sgh_restart_ensemble/restart_member.py | 15 +++ .../sgh_restart_ensemble/restart_scheduler.py | 11 ++- .../sgh_restart_ensemble/test_case.py | 91 +++++++++++-------- 4 files changed, 92 insertions(+), 46 deletions(-) diff --git a/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/ensemble_generator.cfg b/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/ensemble_generator.cfg index b75c6735f3..93f3c12496 100644 --- a/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/ensemble_generator.cfg +++ b/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/ensemble_generator.cfg @@ -6,19 +6,22 @@ # # To create a restart ensemble after running spinup: # -# 1. Analyze the completed spinup ensemble: -# from compass.landice.tests.ensemble_generator.ensemble_templates.sgh_ensemble.analysis import analyze_ensemble -# analyze_ensemble('/work/ensemble/spinup_ensemble', '/work/config.cfg') +# 1. Run the sgh_ensemble_analysis test case to produce analysis_summary.json +# compass setup -t landice/ensemble_generator/sgh_ensemble_analysis \ +# -w /work/analysis -f analysis.cfg +# compass run -w /work/analysis # # 2. Schedule restarts: -# from compass.landice.tests.ensemble_generator.ensemble_templates.sgh_ensemble.restart import schedule_restarts +# from compass.landice.tests.ensemble_generator.sgh_restart_ensemble \ +# .restart_scheduler import schedule_restarts # config_file, runs = schedule_restarts( -# '/work/ensemble/spinup_ensemble/analysis_summary.json', +# '/work/analysis/sgh_ensemble_analysis/analyze_ensemble/analysis_summary.json', # '/work/restart_ensemble' # ) # # 3. Set up and run the restart ensemble: -# compass setup -t restart_ensemble -w /work/restart_ensemble -f +# compass setup -t landice/ensemble_generator/sgh_restart_ensemble \ +# -w /work/restart_ensemble -f # compass run -w /work/restart_ensemble [ensemble_generator] @@ -31,6 +34,12 @@ ensemble_template = sgh_ensemble # the run000, run001, etc. subdirectories to be restarted spinup_work_dir = REPLACE_WITH_SPINUP_WORK_DIR +# REQUIRED: Path to the analysis_summary.json produced by sgh_ensemble_analysis +# This file contains per-run steady-state and validation results used to +# determine which runs need restarting. +# Typical path: /sgh_ensemble_analysis/analyze_ensemble/analysis_summary.json +analysis_summary_file = REPLACE_WITH_ANALYSIS_SUMMARY_JSON + # Maximum consecutive restart attempts per run # After this many restarts, the run will not be restarted again # (prevents infinite loops if a run keeps failing) diff --git a/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/restart_member.py b/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/restart_member.py index 2e401bea1c..dba0df7b91 100644 --- a/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/restart_member.py +++ b/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/restart_member.py @@ -108,6 +108,21 @@ def setup(self): print(f'Setting config_do_restart = .true. in {namelist_path}') _set_restart_in_namelist(namelist_path) + # Create a restart_attempt_N/ directory to track how many restarts + # have been attempted for this run. configure() counts these dirs + # to enforce max_consecutive_restarts. List the directory once to + # find the highest existing attempt number, then create the next one. + existing_nums = [ + int(d[len('restart_attempt_'):]) + for d in os.listdir(run_dir) + if d.startswith('restart_attempt_') and + d[len('restart_attempt_'):].isdigit() + ] + attempt_num = max(existing_nums, default=0) + 1 + attempt_dir = os.path.join(run_dir, f'restart_attempt_{attempt_num}') + os.makedirs(attempt_dir, exist_ok=True) + print(f'Tracking restart attempt {attempt_num} in {attempt_dir}') + # Register MALI executable so compass knows this step needs the model self.add_model_as_input() diff --git a/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/restart_scheduler.py b/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/restart_scheduler.py index 9aaecb6628..05bd8f4220 100644 --- a/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/restart_scheduler.py +++ b/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/restart_scheduler.py @@ -149,6 +149,9 @@ def create_config_file(self, restart_runs, base_config_file=None): # Path to the spinup ensemble to restart from spinup_work_dir = {self.original_ensemble_dir} +# Path to the analysis_summary.json from sgh_ensemble_analysis +analysis_summary_file = {self.summary_file} + # Restart configuration # Maximum consecutive restart attempts per run (prevents infinite loops) max_consecutive_restarts = 3 @@ -242,12 +245,12 @@ def schedule_restarts( Examples -------- - >>> from compass.landice.tests.ensemble_generator. - ensemble_templates.sgh_ensemble.restart - import schedule_restarts + >>> from compass.landice.tests.ensemble_generator.sgh_restart_ensemble \ + ... .restart_scheduler import schedule_restarts >>> >>> config_file, restart_runs = schedule_restarts( - ... '/work/ensemble1/spinup_ensemble/analysis_summary.json', + ... '/work/analysis/sgh_ensemble_analysis/analyze_ensemble/' + ... 'analysis_summary.json', ... '/work/ensemble2', ... min_years=50.0, ... max_attempts=3 diff --git a/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/test_case.py b/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/test_case.py index 16388c1dd0..c50be87e7e 100644 --- a/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/test_case.py +++ b/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/test_case.py @@ -15,6 +15,7 @@ import json import os +import compass.namelist from compass.landice.tests.ensemble_generator.ensemble_manager import ( EnsembleManager, ) @@ -57,7 +58,8 @@ def configure(self): 4. Sets up ensemble_manager to handle job submission """ config = self.config - section = config.get('restart_ensemble', {}) + # Bug fix: use dict-style access to get a SectionProxy, not .get() + section = config['restart_ensemble'] spinup_work_dir = section.get('spinup_work_dir') @@ -74,10 +76,28 @@ def configure(self): # Get restart configuration max_consecutive_restarts = section.getint( - 'max_consecutive_restarts', 3) + 'max_consecutive_restarts', fallback=3) min_simulation_years = section.getfloat( - 'min_simulation_years_before_restart', 50.0) - auto_restart = section.getboolean('auto_restart_incomplete', True) + 'min_simulation_years_before_restart', fallback=50.0) + auto_restart = section.getboolean( + 'auto_restart_incomplete', fallback=True) + + # Load per-run analysis results from analysis_summary.json if provided. + # The sgh_ensemble_analysis test case writes this file; it contains + # an 'individual_results' dict keyed by run number (strings in JSON). + analysis_summary = {} + analysis_summary_file = section.get('analysis_summary_file', + fallback=None) + if analysis_summary_file and \ + analysis_summary_file.lower() != 'none': + if not os.path.exists(analysis_summary_file): + raise ValueError( + "analysis_summary_file not found: " + f"{analysis_summary_file}") + with open(analysis_summary_file, 'r') as f: + summary = json.load(f) + # individual_results keys are ints in Python but strings in JSON + analysis_summary = summary.get('individual_results', {}) # Scan for existing run directories run_dirs = sorted(glob.glob(os.path.join(spinup_work_dir, 'run*'))) @@ -92,13 +112,17 @@ def configure(self): except ValueError: continue + # Look up per-run results from the summary (JSON int keys → str) + run_results = analysis_summary.get(str(run_num)) + # Check if run should be restarted should_restart, reason = self._should_restart_run( run_dir=run_dir, run_num=run_num, min_years=min_simulation_years, max_restarts=max_consecutive_restarts, - auto_restart=auto_restart + auto_restart=auto_restart, + run_results=run_results ) if should_restart: @@ -131,7 +155,8 @@ def _should_restart_run( run_num, min_years, max_restarts, - auto_restart): + auto_restart, + run_results=None): """ Determine if a run should be restarted. @@ -152,6 +177,11 @@ def _should_restart_run( auto_restart : bool Whether to automatically restart incomplete runs + run_results : dict or None + Per-run results dict from ``individual_results[run_num]`` in + ``analysis_summary.json``. When *None* the run cannot be + verified and will be skipped. + Returns ------- tuple @@ -174,7 +204,6 @@ def _should_restart_run( with open(restart_timestamp_file, 'r') as f: current_time = f.read().strip() - import compass.namelist namelist = compass.namelist.ingest(namelist_file) stop_time = \ namelist['time_management']['config_stop_time'].strip( @@ -186,37 +215,27 @@ def _should_restart_run( except Exception as e: return False, f"Error reading completion status: {e}" - # Check analysis results - analysis_file = os.path.join(run_dir, 'analysis_results.json') + # Check analysis results supplied from analysis_summary.json. + # run_results is the per-run dict from individual_results[run_num]. + if run_results is not None: + ss_info = run_results.get('steady_state') or {} - if os.path.exists(analysis_file): - try: - with open(analysis_file, 'r') as f: - results = json.load(f) - - # If at steady state, don't restart - ss_info = results.get('steady_state', {}) - if ss_info.get('is_steady_state', False): - return False, "Already at steady state" - - # Check simulation length - metrics = ss_info.get('metrics', {}) - sim_length = metrics.get('final_year', 0.0) - - if sim_length < min_years: - return False, f"Too short ({ - sim_length:.1f} < { - min_years:.1f} yrs)" - - except (json.JSONDecodeError, IOError): - # If analysis file is malformed, still allow restart - pass + # If at steady state, don't restart + if ss_info.get('is_steady_state', False): + return False, "Already at steady state" + + # Check simulation length + metrics = ss_info.get('metrics') or {} + sim_length = metrics.get('final_year', 0.0) + + if sim_length < min_years: + return False, (f"Too short " + f"({sim_length:.1f} < {min_years:.1f} yrs)") else: - # No analysis file - if we can't verify it reached min years, don't - # restart + # No analysis results — cannot verify progress; skip return False, "No analysis results to verify progress" - # Check number of restart attempts + # Check number of restart attempts (tracked as restart_attempt_N/ dirs) restart_attempts = 0 if os.path.exists(run_dir): restart_dirs = [d for d in os.listdir(run_dir) @@ -224,8 +243,8 @@ def _should_restart_run( restart_attempts = len(restart_dirs) if restart_attempts >= max_restarts: - return False, f"Max restart attempts reached \ - ({restart_attempts}/{max_restarts})" + return False, (f"Max restart attempts reached " + f"({restart_attempts}/{max_restarts})") # If all checks pass and auto_restart is enabled if not auto_restart: From 1308a583f6906101ba5766dadebba8cfcfcce5c1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 1 Apr 2026 21:25:06 +0000 Subject: [PATCH 2/2] Replace fallback= kwargs with add_from_package defaults in test_case.py compass's MpasConfigParser does not support the fallback= keyword argument on getint/getfloat/getboolean/get. The correct compass pattern is to load the shipped .cfg defaults via config.add_from_package() at the start of configure() so that all options are resolvable without fallback= kwargs. Changes: - test_case.py: call self.config.add_from_package() at the top of configure() to register ensemble_generator.cfg as the source of defaults, then remove all fallback= kwargs from getint/getfloat/getboolean/get calls - ensemble_generator.cfg: change analysis_summary_file from the REPLACE_WITH_... sentinel to `None` (the shipped default) so the existing `!= 'none'` guard correctly treats it as "not provided" Agent-Logs-Url: https://github.com/alexolinhager/compass/sessions/219c1314-48f3-4507-852a-abf8e87511e4 Co-authored-by: alexolinhager <131483939+alexolinhager@users.noreply.github.com> --- .../sgh_restart_ensemble/ensemble_generator.cfg | 9 ++++++--- .../sgh_restart_ensemble/test_case.py | 17 ++++++++++------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/ensemble_generator.cfg b/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/ensemble_generator.cfg index 93f3c12496..b2d0be93de 100644 --- a/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/ensemble_generator.cfg +++ b/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/ensemble_generator.cfg @@ -34,11 +34,14 @@ ensemble_template = sgh_ensemble # the run000, run001, etc. subdirectories to be restarted spinup_work_dir = REPLACE_WITH_SPINUP_WORK_DIR -# REQUIRED: Path to the analysis_summary.json produced by sgh_ensemble_analysis +# Path to the analysis_summary.json produced by sgh_ensemble_analysis # This file contains per-run steady-state and validation results used to # determine which runs need restarting. -# Typical path: /sgh_ensemble_analysis/analyze_ensemble/analysis_summary.json -analysis_summary_file = REPLACE_WITH_ANALYSIS_SUMMARY_JSON +# Typical path: /sgh_ensemble_analysis/analyze_ensemble/ +# analysis_summary.json +# Set to None (default) if not available — runs will be skipped with +# "No analysis results to verify progress" +analysis_summary_file = None # Maximum consecutive restart attempts per run # After this many restarts, the run will not be restarted again diff --git a/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/test_case.py b/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/test_case.py index c50be87e7e..efeb02beaf 100644 --- a/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/test_case.py +++ b/compass/landice/tests/ensemble_generator/sgh_restart_ensemble/test_case.py @@ -58,6 +58,12 @@ def configure(self): 4. Sets up ensemble_manager to handle job submission """ config = self.config + # Load shipped defaults before reading any options so that the + # compass config infrastructure (MpasConfigParser) can resolve them + # without needing fallback= kwargs. + self.config.add_from_package( + 'compass.landice.tests.ensemble_generator.sgh_restart_ensemble', + 'ensemble_generator.cfg') # Bug fix: use dict-style access to get a SectionProxy, not .get() section = config['restart_ensemble'] @@ -75,19 +81,16 @@ def configure(self): raise ValueError(f"spinup_work_dir not found: {spinup_work_dir}") # Get restart configuration - max_consecutive_restarts = section.getint( - 'max_consecutive_restarts', fallback=3) + max_consecutive_restarts = section.getint('max_consecutive_restarts') min_simulation_years = section.getfloat( - 'min_simulation_years_before_restart', fallback=50.0) - auto_restart = section.getboolean( - 'auto_restart_incomplete', fallback=True) + 'min_simulation_years_before_restart') + auto_restart = section.getboolean('auto_restart_incomplete') # Load per-run analysis results from analysis_summary.json if provided. # The sgh_ensemble_analysis test case writes this file; it contains # an 'individual_results' dict keyed by run number (strings in JSON). analysis_summary = {} - analysis_summary_file = section.get('analysis_summary_file', - fallback=None) + analysis_summary_file = section.get('analysis_summary_file') if analysis_summary_file and \ analysis_summary_file.lower() != 'none': if not os.path.exists(analysis_summary_file):