Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 133 additions & 2 deletions backend/app/features/ingestion/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
keyed by case.id, to avoid repeated DB queries.
"""

import shlex
from dataclasses import dataclass, field, replace
from datetime import datetime, timezone
from pathlib import Path
Expand All @@ -32,9 +33,9 @@
from app.features.ingestion.parsers.types import ParsedSimulation
from app.features.machine.utils import resolve_machine_by_name
from app.features.simulation.config_delta import SimulationConfigSnapshot
from app.features.simulation.enums import SimulationStatus, SimulationType
from app.features.simulation.enums import ArtifactKind, SimulationStatus, SimulationType
from app.features.simulation.models import Case, Simulation
from app.features.simulation.schemas import SimulationCreate
from app.features.simulation.schemas import ArtifactCreate, SimulationCreate

logger = _setup_custom_logger(__name__)

Expand Down Expand Up @@ -357,6 +358,7 @@ def _build_simulation_create(
simulation = _validate_simulation_create(
replace(prevalidated_draft, case_id=case.id)
)
simulation = _attach_path_artifacts(simulation, parsed_simulation)
logger.info(
"Mapped reference simulation from %s: %s",
parsed_simulation.execution_dir,
Expand All @@ -372,6 +374,7 @@ def _build_simulation_create(
run_config_deltas=delta if delta else None,
)
simulation = _validate_simulation_create(simulation_draft)
simulation = _attach_path_artifacts(simulation, parsed_simulation)

if delta:
logger.info(
Expand All @@ -388,6 +391,134 @@ def _build_simulation_create(
return simulation


def _attach_path_artifacts(
    simulation: SimulationCreate,
    parsed_simulation: ParsedSimulation,
) -> SimulationCreate:
    """Return *simulation* with filesystem path artifacts attached.

    Artifacts are derived from the paths recorded in *parsed_simulation*;
    when none of them are usable, the simulation is returned unchanged.
    """
    artifacts = _build_path_artifacts(parsed_simulation)
    if artifacts:
        return simulation.model_copy(update={"artifacts": artifacts})
    return simulation


def _build_path_artifacts(parsed_simulation: ParsedSimulation) -> list[ArtifactCreate]:
    """Collect validated path artifacts for a parsed simulation.

    Each candidate path (run directory, short-term archive, case run
    script, post-processing script) is validated against the ingest
    host's filesystem; unusable candidates are silently skipped.
    """
    execution_dir = parsed_simulation.execution_dir

    # (artifact kind, validated path-or-None), in the order they should be
    # appended; validation order matches the original implementation.
    candidates: list[tuple[ArtifactKind, str | None]] = [
        (
            ArtifactKind.OUTPUT,
            _validate_existing_path(
                parsed_simulation.output_path,
                source_name="RUNDIR",
                execution_dir=execution_dir,
            ),
        ),
        (
            ArtifactKind.ARCHIVE,
            _validate_existing_path(
                parsed_simulation.archive_path,
                source_name="DOUT_S_ROOT",
                execution_dir=execution_dir,
            ),
        ),
        (
            ArtifactKind.RUN_SCRIPT,
            _validate_existing_path(
                _derive_case_run_script_path(parsed_simulation.case_root),
                source_name="CASEROOT/.case.run",
                execution_dir=execution_dir,
            ),
        ),
        (
            ArtifactKind.POSTPROCESS_SCRIPT,
            _validate_existing_path(
                _extract_postprocessing_script_path(
                    parsed_simulation.postprocessing_script,
                    execution_dir=execution_dir,
                ),
                source_name="POSTRUN_SCRIPT",
                execution_dir=execution_dir,
            ),
        ),
    ]

    path_artifacts: list[ArtifactCreate] = []
    for kind, uri in candidates:
        _append_path_artifact(path_artifacts, kind, uri)
    return path_artifacts


def _append_path_artifact(
    artifacts: list[ArtifactCreate], kind: ArtifactKind, uri: str | None
) -> None:
    """Append an artifact of *kind* pointing at *uri*; no-op when *uri* is None."""
    if uri is not None:
        artifacts.append(ArtifactCreate(kind=kind, uri=uri))


def _derive_case_run_script_path(case_root: str | None) -> str | None:
normalized_case_root = _normalize_path_candidate(case_root)
if normalized_case_root is None:
return None

return str(Path(normalized_case_root) / ".case.run")


def _extract_postprocessing_script_path(
postprocessing_script: str | None,
execution_dir: str,
) -> str | None:
normalized_script = _normalize_path_candidate(postprocessing_script)
if normalized_script is None:
return None

try:
tokens = shlex.split(normalized_script)
except ValueError:
logger.warning(
"Skipping POSTRUN_SCRIPT artifact for '%s': could not parse value '%s'.",
execution_dir,
normalized_script,
)
return None

if not tokens:
return None

return tokens[0]


def _validate_existing_path(
path_value: str | None,
*,
source_name: str,
execution_dir: str,
) -> str | None:
normalized_path = _normalize_path_candidate(path_value)
if normalized_path is None:
return None

candidate_path = Path(normalized_path).expanduser()
if not candidate_path.exists():
Comment on lines +498 to +499
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Resolve relative artifact paths before validation

_validate_existing_path checks Path(normalized_path).expanduser() directly, so any relative value from POSTRUN_SCRIPT/CASEROOT is resolved against the API process working directory instead of the simulation context. In archives where these entries are relative (a common shell-script pattern), valid artifacts in the case/execution tree are treated as missing and silently omitted, so ingestion loses artifact links even though the files exist.

Useful? React with 👍 / 👎.

logger.warning(
"Skipping %s artifact for '%s': path does not exist on ingest host: %s",
source_name,
execution_dir,
normalized_path,
)
return None

return str(candidate_path)
Comment on lines +488 to +508
Copy link

Copilot AI Apr 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_validate_existing_path() only checks Path.exists(). For these artifacts, the expected path type is known (e.g., RUNDIR/DOUT_S_ROOT should be directories; .case.run and POSTRUN_SCRIPT should be files). Without validating is_dir()/is_file() (and ideally requiring absolute paths), ingestion can attach incorrect artifacts when a wrong-type or relative path happens to exist on the ingest host.

Copilot uses AI. Check for mistakes.

Comment on lines +394 to +509
Copy link

Copilot AI Apr 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_validate_existing_path() currently accepts any path that exists on the API host and then persists it as an artifact. Because ingest_archive() is used by /ingestions/from-upload (not admin-restricted), a malicious upload could cause the service to store (and later expose) arbitrary host filesystem paths that happen to exist (e.g. /etc/passwd), effectively turning ingestion into a filesystem-existence oracle. Consider gating path-artifact attachment behind an explicit flag passed from the API layer (enabled only for trusted HPC path ingests), or enforcing an allowlist (e.g., restrict to configured machine storage roots / derived CASEROOT subtree) before checking existence.

Copilot uses AI. Check for mistakes.

def _normalize_path_candidate(path_value: str | None) -> str | None:
if path_value is None:
return None

normalized = path_value.strip()
if not normalized:
return None

return normalized


def _prevalidate_simulation_create(
parsed_simulation: ParsedSimulation,
machine_id: UUID,
Expand Down
8 changes: 8 additions & 0 deletions backend/app/features/ingestion/parsers/case_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def parse_env_case(env_case_path: str | Path) -> dict[str, str | None]:
machine = _extract_value_from_file(env_case_path, "MACH")
user = _extract_value_from_file(env_case_path, "REALUSER")
compset_alias = _extract_value_from_file(env_case_path, "COMPSET")
case_root = _extract_value_from_file(env_case_path, "CASEROOT")

# Extract metadata that requires special handling
campaign, experiment_type = _extract_campaign_and_experiment_type(case_name)
Expand All @@ -50,6 +51,7 @@ def parse_env_case(env_case_path: str | Path) -> dict[str, str | None]:
"campaign": campaign,
"experiment_type": experiment_type,
"compset_alias": compset_alias,
"case_root": case_root,
}
Comment on lines 41 to 55
Copy link

Copilot AI Apr 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

parse_env_case() now returns case_root, but the function docstring’s “Dictionary with case metadata … including:” list doesn’t mention this key. Please update the docstring to reflect the new return shape so callers/tests have an accurate contract.

Copilot uses AI. Check for mistakes.


Expand Down Expand Up @@ -95,6 +97,9 @@ def parse_env_run(env_run_path: str | Path) -> dict[str, str | None]:
stop_option = _extract_value_from_file(env_run_path, "STOP_OPTION")
stop_n = _extract_value_from_file(env_run_path, "STOP_N")
stop_date = _extract_value_from_file(env_run_path, "STOP_DATE")
output_path = _extract_value_from_file(env_run_path, "RUNDIR")
archive_path = _extract_value_from_file(env_run_path, "DOUT_S_ROOT")
postprocessing_script = _extract_value_from_file(env_run_path, "POSTRUN_SCRIPT")

simulation_start_date = (
run_ref_date if initialization_type == "branch" else run_start_date
Expand All @@ -110,6 +115,9 @@ def parse_env_run(env_run_path: str | Path) -> dict[str, str | None]:
"initialization_type": initialization_type,
"simulation_start_date": simulation_start_date,
"simulation_end_date": simulation_end_date,
"output_path": output_path,
"archive_path": archive_path,
"postprocessing_script": postprocessing_script,
}


Expand Down
4 changes: 4 additions & 0 deletions backend/app/features/ingestion/parsers/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,10 @@ def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> ParsedSimul
git_tag=metadata.get("git_tag"),
git_commit_hash=metadata.get("git_commit_hash"),
status=metadata.get("status") or SimulationStatus.UNKNOWN.value,
output_path=metadata.get("output_path"),
archive_path=metadata.get("archive_path"),
case_root=metadata.get("case_root"),
postprocessing_script=metadata.get("postprocessing_script"),
)


Expand Down
4 changes: 4 additions & 0 deletions backend/app/features/ingestion/parsers/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,7 @@ class ParsedSimulation:
git_tag: str | None
git_commit_hash: str | None
status: str | None
output_path: str | None = None
archive_path: str | None = None
case_root: str | None = None
postprocessing_script: str | None = None
48 changes: 48 additions & 0 deletions backend/tests/features/ingestion/parsers/test_case_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,19 @@


class TestParseEnvCase:
def test_extracts_case_root(self, tmp_path):
xml_case = """
<config>
<entry id="CASEROOT" value="/tmp/case_root" />
</config>
"""
tmp_case = tmp_path / "env_case_caseroot.xml"
tmp_case.write_text(xml_case)

result = parse_env_case(tmp_case)

assert result["case_root"] == "/tmp/case_root"

def test_value(self, tmp_path):
xml_case = """
<config>
Expand Down Expand Up @@ -139,6 +152,41 @@ def test_missing_entry_returns_none(self, tmp_path):


class TestParseEnvRun:
def test_extracts_path_artifact_values(self, tmp_path):
xml_run = """
<config>
<entry id="RUN_TYPE" value="startup" />
<entry id="RUN_STARTDATE" value="2020-01-01" />
<entry id="RUNDIR" value="/tmp/run" />
<entry id="DOUT_S_ROOT" value="/tmp/archive" />
<entry id="POSTRUN_SCRIPT">/tmp/post.sh --flag value</entry>
</config>
"""
tmp_run = tmp_path / "env_run_paths.xml"
tmp_run.write_text(xml_run)

result = parse_env_run(tmp_run)

assert result["output_path"] == "/tmp/run"
assert result["archive_path"] == "/tmp/archive"
assert result["postprocessing_script"] == "/tmp/post.sh --flag value"

def test_missing_path_entries_return_none(self, tmp_path):
xml_run = """
<config>
<entry id="RUN_TYPE" value="startup" />
<entry id="RUN_STARTDATE" value="2020-01-01" />
</config>
"""
tmp_run = tmp_path / "env_run_missing_paths.xml"
tmp_run.write_text(xml_run)

result = parse_env_run(tmp_run)

assert result["output_path"] is None
assert result["archive_path"] is None
assert result["postprocessing_script"] is None

def test_branch_uses_run_refdate(self, tmp_path):
xml_run = """
<config>
Expand Down
Loading
Loading