diff --git a/backend/app/scripts/README.md b/backend/app/scripts/README.md index a5151cbc..ce505cf9 100644 --- a/backend/app/scripts/README.md +++ b/backend/app/scripts/README.md @@ -13,7 +13,10 @@ Scripts are organized by domain: ``` scripts/ ├── ingestion/ -│ └── nersc_archive_ingestor.py +│ ├── hpc_archive_ingestor.py +│ ├── nersc_archive_ingestor.py +│ └── sites/ +│ └── chrysalis.sh ├── db/ │ ├── seed.py │ ├── rollback_seed.py @@ -41,6 +44,7 @@ Example: python -m app.scripts.db.seed python -m app.scripts.db.rollback_seed python -m app.scripts.users.create_admin_account +python -m app.scripts.ingestion.hpc_archive_ingestor python -m app.scripts.ingestion.nersc_archive_ingestor --dry-run ``` @@ -102,6 +106,39 @@ If operational complexity increases, these scripts may later be consolidated int --- +## HPC Archive Ingestor + +The scheduler-agnostic HPC archive ingestor is the preferred entrypoint for +site wrappers. It currently delegates to the existing NERSC archive ingestor, +preserving Perlmutter behavior while giving non-NERSC schedulers a stable shared +command. + +Example: + +```bash +uv run python -m app.scripts.ingestion.hpc_archive_ingestor +``` + +Thin site wrappers live under `app/scripts/ingestion/sites/`. They should only +set site-specific environment and call the shared ingestor. Ingestion logic +belongs in Python, not in shell wrappers. + +### Chrysalis Wrapper + +`app/scripts/ingestion/sites/chrysalis.sh` is intended for the existing Sandia +Jenkins workflow. It sets Chrysalis defaults and requires caller-provided +`SIMBOARD_API_BASE_URL` and `SIMBOARD_API_TOKEN`. + +Default Chrysalis archive root: + +- `/lcrc/group/e3sm/PERF_Chrysalis/performance_archive` + +The wrapper defaults to `DRY_RUN=true`; set `DRY_RUN=false` only after validating +archive access, token storage, network egress, and candidate counts. + +Compy, Aurora, and Frontier adapters are intentionally deferred until accounts +or equivalent native-runner access exists for those sites. 
+ ## NERSC Archive Ingestor The NERSC archive ingestor scans a bind-mounted performance archive directory, diff --git a/backend/app/scripts/ingestion/hpc_archive_ingestor.py b/backend/app/scripts/ingestion/hpc_archive_ingestor.py new file mode 100644 index 00000000..cd47c190 --- /dev/null +++ b/backend/app/scripts/ingestion/hpc_archive_ingestor.py @@ -0,0 +1,14 @@ +"""Scheduler-agnostic HPC archive ingestion entrypoint. + +This module is the stable entrypoint for site wrappers such as Jenkins or cron +jobs. The current implementation delegates to the existing NERSC archive +ingestor so Perlmutter behavior remains unchanged while non-NERSC wrappers move +to a shared command name. +""" + +from __future__ import annotations + +from app.scripts.ingestion.nersc_archive_ingestor import main + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/backend/app/scripts/ingestion/sites/chrysalis.sh b/backend/app/scripts/ingestion/sites/chrysalis.sh new file mode 100755 index 00000000..8e6b991b --- /dev/null +++ b/backend/app/scripts/ingestion/sites/chrysalis.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Thin SimBoard entrypoint for the existing Chrysalis Jenkins workflow. +# Keep site-specific setup here; keep ingestion logic in hpc_archive_ingestor.py. + +: "${SIMBOARD_API_BASE_URL:?SIMBOARD_API_BASE_URL is required}" +: "${SIMBOARD_API_TOKEN:?SIMBOARD_API_TOKEN is required}" + +export MACHINE_NAME="${MACHINE_NAME:-chrysalis}" +export PERF_ARCHIVE_ROOT="${PERF_ARCHIVE_ROOT:-/lcrc/group/e3sm/PERF_Chrysalis/performance_archive}" +export STATE_PATH="${STATE_PATH:-${PERF_ARCHIVE_ROOT}/../simboard-ingestion-state.json}" +export DRY_RUN="${DRY_RUN:-true}" + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +backend_root="$(cd "${script_dir}/../../../.." 
+ && pwd)" +python_bin="${PYTHON_BIN:-python}" + +cd "${backend_root}" +exec "${python_bin}" -m app.scripts.ingestion.hpc_archive_ingestor diff --git a/backend/docs/154-ingestions/codex-implementation-plan.md b/backend/docs/154-ingestions/codex-implementation-plan.md new file mode 100644 index 00000000..59322338 --- /dev/null +++ b/backend/docs/154-ingestions/codex-implementation-plan.md @@ -0,0 +1,168 @@ +# Chrysalis-First SimBoard HPC Ingestion Plan + +## Executive Summary + +Extend SimBoard metadata ingestion beyond Perlmutter/NERSC with one reusable +HPC ingestion framework and thin per-site adapters. Do not create separate +one-off implementations per site, and do not copy the NERSC Spin deployment +pattern unchanged. + +Chrysalis is the priority implementation target. It has the clearest current +path because the PACE reference and the existing GitHub script both indicate a +Sandia Jenkins workflow and a known archive root. + +Hold off on Compy, Aurora, and Frontier implementation until accounts or +equivalent native-runner access exist. Without access, implementation cannot +validate archive paths, scheduler behavior, token storage, network egress, or +metadata layout. These sites remain documented as future candidates only. + +Primary source: PACE Collection and Upload Reference, `https://e3sm.atlassian.net/wiki/spaces/EPG/pages/5477335106/PACE+Collection+and+Upload+Reference` (reviewed as the PDF export `EPG-PACE Collection and Upload Reference-280426-174740.pdf`). + +Supporting GitHub sources: + +- `https://github.com/E3SM-Project/E3SM_test_scripts/blob/master/jenkins/chrysalis_pace.sh` +- `https://github.com/E3SM-Project/E3SM_test_scripts/blob/master/jenkins/compy_pace.sh` +- `https://github.com/E3SM-Project/E3SM_test_scripts/blob/master/util/pace_archive.sh` + +## Current State By Site + +| Site | Current state from source materials | SimBoard plan | +| --- | --- | --- | +| Perlmutter | Already implemented through NERSC Spin CronJob against NERSC-mounted archive storage. | Keep existing deployment. Use as reference for scanner, state, retry, dry-run, and logging behavior. 
| +| Chrysalis | PACE PDF and `chrysalis_pace.sh` show Sandia Jenkins and `/lcrc/group/e3sm/PERF_Chrysalis/performance_archive`. | Priority site. Add thin SimBoard shell wrapper for Jenkins and run shared Python ingestor. | +| Compy | PACE PDF and `compy_pace.sh` show Sandia Jenkins and `/compyfs/performance_archive`. | Defer until Compy account or native Jenkins validation exists. | +| Aurora | PACE PDF shows ALCF GitLab scheduled daily job and `/lus/flare/projects/E3SM_Dec/performance_archive`. | Defer until ALCF account/native-runner access exists. Do not force Jenkins. | +| Frontier | PACE PDF shows `/lustre/orion/proj-shared/cli115` and local cron/unknown frequency. | Defer until OLCF account, owner, runner, and script details are confirmed. | + +## Common Architecture Recommendation + +Use this common flow for all supported sites: + +```text +site scheduler -> thin SimBoard .sh wrapper -> shared Python ingestor -> SimBoard API +``` + +Thin site wrappers should live in SimBoard because SimBoard owns the API +contract, runtime environment variables, and ingestion behavior. 
Wrappers should +only: + +- load site-specific modules or a Python environment when needed +- set `MACHINE_NAME` +- set `PERF_ARCHIVE_ROOT` +- set `STATE_PATH` and the `DRY_RUN` default +- require `SIMBOARD_API_BASE_URL` +- require `SIMBOARD_API_TOKEN` +- call `python -m app.scripts.ingestion.hpc_archive_ingestor` + +The shared Python ingestor should own: + +- archive scanning +- metadata validation +- idempotent state tracking +- dry-run behavior +- retry/backoff +- structured logs +- SimBoard API submission + +## Shared Components Vs Site-Specific Components + +Standardize these parts: + +- archive scan and discovery of parseable execution directories +- metadata validation using existing SimBoard parser behavior +- state-file deduplication +- dry-run and capped-ingest controls +- retry/backoff and deterministic non-zero failure exits +- service-account token authentication +- structured startup, scan, candidate, success, failure, and summary logs + +Keep these parts site-specific: + +- scheduler: Jenkins, GitLab, cron, or site-native runner +- module/Python setup +- archive root and state path +- secret storage and token rotation workflow +- network/proxy/egress setup +- local filesystem permissions + +## Execution Model By Site + +| Site | Execution model | +| --- | --- | +| Perlmutter | Existing NERSC Spin CronJob. | +| Chrysalis | Sandia Jenkins wrapper. | +| Compy | Sandia Jenkins later, after account/native-runner validation. | +| Aurora | ALCF GitLab later, after ALCF access validation. | +| Frontier | Local cron or OLCF-native runner later, after owner/runtime confirmation. 
| + +## Reuse And Generalization Guidance + +Generalize the current NERSC ingestor by reusing its durable behavior: + +- archive scan +- execution-dir validation +- idempotent state +- dry-run mode +- retry/backoff +- structured logging + +Do not generalize by hardcoding NERSC assumptions: + +- no Perlmutter default in site wrappers +- no assumption that SimBoard can mount every remote DOE filesystem +- no assumption that every site uses NERSC Spin CronJob +- no single shared token across sites + +For Chrysalis, start with path-based ingestion only if the runtime can present a +path readable by the SimBoard ingestion API. If the SimBoard backend cannot read +the site filesystem, switch that site to upload-mode ingestion after access and +sample data validation. + +## Recommended Rollout Order + +1. Confirm with @rljacob that Chrysalis is the highest-value first site. +2. Preserve current Perlmutter/NERSC behavior. +3. Add generic `hpc_archive_ingestor` entrypoint that delegates to existing + scanner/state/retry/logging logic. +4. Add Chrysalis Jenkins wrapper. +5. Validate Chrysalis with dry-run and capped ingest. +6. Move Chrysalis to scheduled Jenkins only after state, counts, failure status, + token storage, and logs are verified. +7. Re-rank Compy, Aurora, and Frontier after accounts or native-runner access + are available. + +## Risks, Unknowns, And Assumptions + +Risks and unknowns: + +- Compy, Aurora, and Frontier cannot be safely implemented without access. +- Remote DOE filesystems may not be readable from the SimBoard backend. +- Non-NERSC sites may require upload-mode ingestion rather than path-mode + ingestion. +- Site egress to SimBoard may require proxy or firewall changes. +- Token storage and rotation are site-specific operational concerns. +- Existing PACE cleanup removes files larger than 50 MB; confirm this does not + remove metadata required by SimBoard. + +Assumptions: + +- Anvil is out of scope for this plan. 
+- Existing SimBoard `/ingestions/from-path` and `/ingestions/from-upload` APIs + are sufficient for initial rollout planning. +- One service-account token should be provisioned per site. +- Existing PACE scripts remain responsible for PACE collection/upload. SimBoard + wrappers only bridge archived metadata into SimBoard. + +## Concrete Next Steps + +1. Ask @rljacob to confirm Chrysalis as first priority. +2. Confirm Chrysalis Jenkins owner, token storage mechanism, and service account + rotation path. +3. Confirm Chrysalis archive root: + `/lcrc/group/e3sm/PERF_Chrysalis/performance_archive`. +4. Get one recent Chrysalis archived case path. +5. Run Chrysalis dry-run with `DRY_RUN=true`. +6. Run capped ingest with `MAX_CASES_PER_RUN`. +7. Verify SimBoard created/duplicate/error counts. +8. Verify state file prevents repeat ingestion. +9. Verify Jenkins marks ingestion failures as failed jobs. diff --git a/backend/docs/154-ingestions/ingestion-sites-onboarding.md b/backend/docs/154-ingestions/ingestion-sites-onboarding.md new file mode 100644 index 00000000..2da3aaac --- /dev/null +++ b/backend/docs/154-ingestions/ingestion-sites-onboarding.md @@ -0,0 +1,122 @@ +# Site Ingestion Onboarding + +## Purpose + +This work extends SimBoard metadata ingestion beyond Perlmutter/NERSC. The first target is Chrysalis because it already has a Jenkins-based PACE workflow and can use the same ingestion model with only site-specific environment defaults. + +Tracking issue: https://github.com/E3SM-Project/simboard/issues/154 + +Takeover PR: https://github.com/E3SM-Project/simboard/pull/169 + +Current branch: + +```bash +feature/154-ingestion-sites +``` + +## Current State + +The branch introduces a shared HPC ingestion entrypoint and a thin Chrysalis wrapper: + +- `backend/app/scripts/ingestion/hpc_archive_ingestor.py` is the scheduler-agnostic entrypoint for HPC site wrappers. It currently delegates to the existing NERSC archive ingestor so Perlmutter behavior stays unchanged. 
+- `backend/app/scripts/ingestion/sites/chrysalis.sh` is the Chrysalis Jenkins wrapper. It sets Chrysalis defaults, requires SimBoard API configuration from the caller, and runs the shared Python entrypoint. +- `backend/app/scripts/README.md` documents how to run the shared ingestor and how site wrappers should be structured. +- `backend/tests/features/ingestion/test_nersc_archive_ingestor.py` includes coverage for the generic entrypoint. + +PR 169 is an open draft for this Chrysalis work. Take it over from there rather than starting a new branch. The PR currently notes that local validation was blocked because PostgreSQL was unavailable at `127.0.0.1`, so backend tests still need to be rerun in a working local or CI environment. + +The key design rule is that shell wrappers should stay thin. Put reusable ingestion behavior in Python, not in site-specific shell scripts. + +## How Ingestion Works + +The existing ingestor scans a performance archive directory, finds parseable execution directories, tracks state, and calls the SimBoard path-ingestion API for changed cases. + +The shared entrypoint is intended to be stable across schedulers: + +```bash +uv run python -m app.scripts.ingestion.hpc_archive_ingestor +``` + +Site wrappers should set only local defaults such as: + +- `MACHINE_NAME` +- `PERF_ARCHIVE_ROOT` +- `STATE_PATH` +- `DRY_RUN` +- `PYTHON_BIN`, when the site needs a specific Python executable + +The SimBoard API target and token must be provided by the runner environment: + +- `SIMBOARD_API_BASE_URL` +- `SIMBOARD_API_TOKEN` + +See `docs/hpc_api_token_authentication.md` for service account and API token setup. + +## Chrysalis Handoff + +Start with Chrysalis. 
+ +Current wrapper: + +```bash +backend/app/scripts/ingestion/sites/chrysalis.sh +``` + +The wrapper defaults to: + +- `MACHINE_NAME=chrysalis` +- `PERF_ARCHIVE_ROOT=/lcrc/group/e3sm/PERF_Chrysalis/performance_archive` +- `STATE_PATH=${PERF_ARCHIVE_ROOT}/../simboard-ingestion-state.json` +- `DRY_RUN=true` + +Before enabling real ingestion, validate: + +- The archive path exists and is readable from the Jenkins runtime. +- Jenkins can run the backend Python environment. +- Jenkins can inject `SIMBOARD_API_BASE_URL` and `SIMBOARD_API_TOKEN` without logging the token. +- The Jenkins host has network egress to the SimBoard API. +- Dry-run output shows expected candidate counts. +- The state file location is writable and persists across runs. + +Do not set `DRY_RUN=false` until the dry-run behavior has been reviewed. + +## Recommended Task Order + +1. Review the current branch implementation and confirm it matches the thin-wrapper design. +2. Rerun backend tests for PR 169 in an environment with PostgreSQL available. +3. Validate the Chrysalis archive path and Jenkins environment. +4. Create or identify the SimBoard service account and API token for HPC ingestion. +5. Configure Jenkins to provide `SIMBOARD_API_BASE_URL` and `SIMBOARD_API_TOKEN` securely. +6. Run the Chrysalis wrapper with the default dry-run mode. +7. Review candidate counts, skipped cases, errors, and state-file behavior. +8. Enable non-dry-run ingestion only after validation. +9. Apply the same wrapper pattern to additional sites once access is available. + +## Remaining Sites + +Priority and status from issue discussion: + +- Chrysalis: first target; Jenkins workflow. +- Frontier: request or confirm account access. +- Aurora: request or confirm account access. +- Compy: request or confirm account access. +- Anvil: removed from scope. + +Expected runners from the PACE references: + +- Chrysalis and Compy use Jenkins. +- Frontier uses cron. +- Aurora uses ALCF GitLab. 
+ +Confirm these runner assumptions before implementing wrappers for non-Chrysalis sites. + +## References + +- Issue 154: https://github.com/E3SM-Project/simboard/issues/154 +- PR 169: https://github.com/E3SM-Project/simboard/pull/169 +- PACE overview: https://e3sm.atlassian.net/wiki/spaces/EPG/pages/776437853/Performance+Analytics+for+Computational+Experiments+PACE +- PACE collection/upload reference: https://e3sm.atlassian.net/wiki/spaces/EPG/pages/5477335106/PACE+Collection+and+Upload+Reference +- Existing site script wrappers: https://github.com/E3SM-Project/E3SM_test_scripts/tree/master/jenkins +- Existing PACE archive script: https://github.com/E3SM-Project/E3SM_test_scripts/blob/master/util/pace_archive.sh +- SimBoard script docs: `backend/app/scripts/README.md` +- API token docs: `docs/hpc_api_token_authentication.md` diff --git a/backend/docs/154-ingestions/simboard-hpc-ingestion-architecture.png b/backend/docs/154-ingestions/simboard-hpc-ingestion-architecture.png new file mode 100644 index 00000000..4e364fb0 Binary files /dev/null and b/backend/docs/154-ingestions/simboard-hpc-ingestion-architecture.png differ diff --git a/backend/tests/features/ingestion/test_nersc_archive_ingestor.py b/backend/tests/features/ingestion/test_nersc_archive_ingestor.py index 0a9385c0..c32c0902 100644 --- a/backend/tests/features/ingestion/test_nersc_archive_ingestor.py +++ b/backend/tests/features/ingestion/test_nersc_archive_ingestor.py @@ -801,6 +801,20 @@ def test_module_main_guard_exits_via_system_exit_on_configuration_error( assert exc_info.value.code == 1 +def test_generic_hpc_module_main_guard_delegates_to_ingestor(monkeypatch) -> None: + script_path = ( + Path(__file__).resolve().parents[3] + / "app/scripts/ingestion/hpc_archive_ingestor.py" + ) + monkeypatch.setenv("MAX_ATTEMPTS", "0") + monkeypatch.setattr(logging.Logger, "info", lambda *args, **kwargs: None) + + with pytest.raises(SystemExit) as exc_info: + runpy.run_path(str(script_path), run_name="__main__") 
+ + assert exc_info.value.code == 1 + + @pytest.mark.parametrize( ("value", "default", "expected"), [