From 6253c5201b213c6c2172da02df92cf188227841d Mon Sep 17 00:00:00 2001 From: Alasdair Wilson Date: Thu, 28 May 2026 16:03:56 +0100 Subject: [PATCH 1/4] Add static project sync and ingestion pipeline --- .github/workflows/deploy.yml | 29 +-- scripts/install_projects_sync_cron.sh | 17 +- scripts/sync_projects_repo.sh | 2 +- vertex/project_ingestion.py | 353 ++++++++++++++++++++++++++ 4 files changed, 375 insertions(+), 26 deletions(-) create mode 100644 vertex/project_ingestion.py diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 1e37a76..85aaf5e 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -44,9 +44,6 @@ jobs: env: ECR_REPO: ${{ secrets.ECR_REPOSITORY }} INSTANCE_ID: ${{ secrets.EC2_INSTANCE_ID }} - PROJECTS_REPO_BRANCH: ${{ vars.PROJECTS_REPO_BRANCH }} - PROJECTS_GIT_SSH_KEY_PATH: ${{ vars.PROJECTS_GIT_SSH_KEY_PATH }} - PROJECTS_GIT_KNOWN_HOSTS_PATH: ${{ vars.PROJECTS_GIT_KNOWN_HOSTS_PATH }} run: | set -euo pipefail @@ -57,10 +54,10 @@ jobs: PROJECTS_DIR="/opt/vertex-projects" PROJECTS_REPO_URL="git@github.com:ISARICResearch/VERTEX-projects.git" - PROJECTS_REPO_BRANCH="${PROJECTS_REPO_BRANCH:-main}" + PROJECTS_REPO_BRANCH="main" SYNC_SCRIPT_PATH="/usr/local/bin/vertex-sync-projects.sh" - PROJECTS_GIT_SSH_KEY_PATH="${PROJECTS_GIT_SSH_KEY_PATH:-/root/.ssh/vertex_projects_deploy_key}" - PROJECTS_GIT_KNOWN_HOSTS_PATH="${PROJECTS_GIT_KNOWN_HOSTS_PATH:-/root/.ssh/known_hosts}" + PROJECTS_GIT_SSH_KEY_PATH="/root/.ssh/vertex_projects_deploy_key" + PROJECTS_GIT_KNOWN_HOSTS_PATH="/root/.ssh/known_hosts" # 1) write the indented heredoc to a temp file (variables expanded on runner) cat > /tmp/ssm-script.indented < /etc/cron.d/vertex-projects-sync <> /var/log/vertex-projects-sync.log 2>&1 + PROJECTS_REPO_BRANCH=main + VERTEX_PROJECTS_DIR=/opt/vertex-projects + PROJECTS_GIT_SSH_KEY_PATH=/root/.ssh/vertex_projects_deploy_key + PROJECTS_GIT_KNOWN_HOSTS_PATH=/root/.ssh/known_hosts + 
VERTEX_SYNC_CONTAINER_NAME=isaric-vertex + 0 * * * * root ${SYNC_SCRIPT_PATH} && docker exec ${VERTEX_SYNC_CONTAINER_NAME} python -m vertex.project_ingestion --projects-dir ${VERTEX_PROJECTS_DIR} >> /var/log/vertex-projects-sync.log 2>&1 CRON chmod 644 /etc/cron.d/vertex-projects-sync @@ -142,7 +136,6 @@ jobs: docker run -d --restart unless-stopped \ --env-file /etc/environment \ -e VERTEX_PROJECTS_DIR="${PROJECTS_DIR}" \ - -e VERTEX_PRELOAD_PROJECTS="false" \ -v "${PROJECTS_DIR}:${PROJECTS_DIR}:ro" \ --name isaric-vertex \ -p 8050:8050 \ @@ -157,6 +150,8 @@ jobs: exit 1 fi + docker exec isaric-vertex python -m vertex.project_ingestion --projects-dir "${PROJECTS_DIR}" + # cleanup images/volumes docker image prune -f || true docker volume prune -f || true diff --git a/scripts/install_projects_sync_cron.sh b/scripts/install_projects_sync_cron.sh index fd43023..56c8ee9 100755 --- a/scripts/install_projects_sync_cron.sh +++ b/scripts/install_projects_sync_cron.sh @@ -1,19 +1,20 @@ #!/usr/bin/env bash set -euo pipefail -SYNC_SCRIPT_PATH="${SYNC_SCRIPT_PATH:-/usr/local/bin/vertex-sync-projects.sh}" -CRON_FILE_PATH="${CRON_FILE_PATH:-/etc/cron.d/vertex-projects-sync}" -CRON_LOG_PATH="${CRON_LOG_PATH:-/var/log/vertex-projects-sync.log}" +SYNC_SCRIPT_PATH="/usr/local/bin/vertex-sync-projects.sh" +CRON_FILE_PATH="/etc/cron.d/vertex-projects-sync" +CRON_LOG_PATH="/var/log/vertex-projects-sync.log" cat > "${CRON_FILE_PATH}" <> ${CRON_LOG_PATH} 2>&1 +PROJECTS_REPO_BRANCH=main +VERTEX_PROJECTS_DIR=/opt/vertex-projects +PROJECTS_GIT_SSH_KEY_PATH=/root/.ssh/vertex_projects_deploy_key +PROJECTS_GIT_KNOWN_HOSTS_PATH=/root/.ssh/known_hosts +VERTEX_SYNC_CONTAINER_NAME=isaric-vertex +0 * * * * root ${SYNC_SCRIPT_PATH} && docker exec ${VERTEX_SYNC_CONTAINER_NAME} python -m vertex.project_ingestion --projects-dir ${VERTEX_PROJECTS_DIR} >> ${CRON_LOG_PATH} 2>&1 EOF chmod 644 "${CRON_FILE_PATH}" diff --git a/scripts/sync_projects_repo.sh b/scripts/sync_projects_repo.sh index 
91df9d9..733d98b 100755 --- a/scripts/sync_projects_repo.sh +++ b/scripts/sync_projects_repo.sh @@ -2,7 +2,7 @@ set -euo pipefail PROJECTS_DIR="${VERTEX_PROJECTS_DIR:-/opt/vertex-projects}" -PROJECTS_REPO_URL="git@github.com:ISARICResearch/VERTEX-projects.git" +PROJECTS_REPO_URL="${PROJECTS_REPO_URL:-git@github.com:ISARICResearch/VERTEX-projects.git}" PROJECTS_REPO_BRANCH="${PROJECTS_REPO_BRANCH:-main}" PROJECTS_GIT_SSH_KEY_PATH="${PROJECTS_GIT_SSH_KEY_PATH:-/root/.ssh/vertex_projects_deploy_key}" PROJECTS_GIT_KNOWN_HOSTS_PATH="${PROJECTS_GIT_KNOWN_HOSTS_PATH:-/root/.ssh/known_hosts}" diff --git a/vertex/project_ingestion.py b/vertex/project_ingestion.py new file mode 100644 index 0000000..f48d0c6 --- /dev/null +++ b/vertex/project_ingestion.py @@ -0,0 +1,353 @@ +import argparse +import json +import os +import uuid +from datetime import datetime, timezone +from pathlib import Path + +from sqlalchemy import MetaData, Table, create_engine, func, inspect, select +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.orm import Session + +from vertex.logging.logger import setup_logger +from vertex.vertex_secrets import get_database_url + +logger = setup_logger(__name__) + +REQUIRED_PROJECT_COLUMNS = ("id", "vertex_id", "owner_id", "is_public") +REQUIRED_USER_COLUMNS = ("id", "email") +REQUIRED_USER_PROJECT_MAPPING_COLUMNS = ("user_id", "project_id") + + +def _normalise_owner_email(value): + if value in (None, ""): + return None + return str(value).strip().lower() + + +def _normalise_is_public(value): + if isinstance(value, bool): + return value + if value is None: + return True + return str(value).strip().lower() in {"1", "true", "yes", "y"} + + +def discover_static_projects(projects_dir: str | Path) -> list[dict]: + root = Path(projects_dir).expanduser() + records: list[dict] = [] + if not root.exists(): + logger.warning(f"Static projects directory does not exist: {root}") + return records + + for project_path in sorted(root.iterdir()): + if not 
project_path.is_dir() or project_path.name.startswith("."): + continue + + config_file = project_path / "config_file.json" + if not config_file.exists(): + logger.warning(f"Skipping folder without config_file.json: {project_path}") + continue + + try: + config = json.loads(config_file.read_text()) + except (OSError, json.JSONDecodeError) as exc: + logger.warning(f"Skipping project with invalid config_file.json at {project_path}: {exc}") + continue + + project_id = str(config.get("project_id") or "").strip() or None + if not project_id: + logger.warning(f"Skipping static project with missing project_id: {project_path}") + continue + + project_name = str(config.get("project_name") or "").strip() or project_path.name + records.append( + { + "project_id": project_id, + "name": project_name, + "project_owner": _normalise_owner_email(config.get("project_owner") or config.get("owner_email")), + "is_public": _normalise_is_public(config.get("is_public", True)), + "project_dir": str(project_path), + } + ) + + return records + + +def _db_schema_name(engine, schema: str | None) -> str | None: + if engine.dialect.name != "postgresql": + return None + return schema or "public" + + +def _reflect_table(engine, table_name: str, schema_name: str | None) -> Table: + metadata = MetaData() + return Table(table_name, metadata, autoload_with=engine, schema=schema_name) + + +def _require_columns(table: Table, required_columns: tuple[str, ...]) -> None: + missing_columns = [column_name for column_name in required_columns if column_name not in table.c] + if missing_columns: + raise RuntimeError(f"{table.name} table is missing required columns: {', '.join(sorted(missing_columns))}") + + +def _load_tables(engine, schema: str | None) -> dict: + schema_name = _db_schema_name(engine, schema) + inspector = inspect(engine) + + required_tables = ("projects", "users", "user_project_mapping") + for table_name in required_tables: + if not inspector.has_table(table_name, schema=schema_name): + raise 
RuntimeError(f"Required table is missing: {table_name}") + + tables = { + "projects": _reflect_table(engine, "projects", schema_name), + "users": _reflect_table(engine, "users", schema_name), + "user_project_mapping": _reflect_table(engine, "user_project_mapping", schema_name), + } + _require_columns(tables["projects"], REQUIRED_PROJECT_COLUMNS) + _require_columns(tables["users"], REQUIRED_USER_COLUMNS) + _require_columns(tables["user_project_mapping"], REQUIRED_USER_PROJECT_MAPPING_COLUMNS) + return tables + + +def _project_debug_summary(project: dict) -> str: + return json.dumps( + { + "project_id": project.get("project_id"), + "name": project.get("name"), + "project_owner": project.get("project_owner"), + "is_public": project.get("is_public"), + "project_dir": project.get("project_dir"), + }, + sort_keys=True, + ) + + +def _build_project_insert_values(project: dict, projects_table: Table, owner_id) -> dict: + values = { + "vertex_id": project["project_id"], + "owner_id": owner_id, + "is_public": project["is_public"], + } + now_utc = datetime.now(timezone.utc) + + if "name" in projects_table.c: + values["name"] = project["name"] + if "project_dir" in projects_table.c: + values["project_dir"] = project["project_dir"] + if "created_at" in projects_table.c: + values.setdefault("created_at", now_utc) + if "updated_at" in projects_table.c: + values.setdefault("updated_at", now_utc) + if "created" in projects_table.c: + values.setdefault("created", now_utc) + if "updated" in projects_table.c: + values.setdefault("updated", now_utc) + + id_column = projects_table.c["id"] + id_is_uuid_like = False + try: + id_is_uuid_like = id_column.type.python_type is uuid.UUID + except NotImplementedError: + id_is_uuid_like = "uuid" in id_column.type.__class__.__name__.lower() + if ( + id_is_uuid_like + and not id_column.nullable + and id_column.default is None + and id_column.server_default is None + and "id" not in values + ): + values["id"] = uuid.uuid4() + + return values + + +def 
_find_project_by_vertex_id(session: Session, projects_table: Table, project_id: str): + row = session.execute(select(projects_table).where(projects_table.c["vertex_id"] == project_id).limit(1)).mappings().first() + return row + + +def _find_user_by_email(session: Session, users_table: Table, email: str): + return ( + session.execute(select(users_table).where(func.lower(users_table.c["email"]) == email.lower()).limit(1)) + .mappings() + .first() + ) + + +def _project_has_owner_mapping(session: Session, tables: dict, project_row) -> bool: + mapping_table = tables["user_project_mapping"] + project_pk_value = project_row["id"] + existing_link = session.execute( + select(mapping_table.c["project_id"]).where(mapping_table.c["project_id"] == project_pk_value).limit(1) + ).first() + return existing_link is not None + + +def _try_link_owner(session: Session, tables: dict, project_row, owner_email: str | None, stats: dict): + users_table = tables["users"] + mapping_table = tables["user_project_mapping"] + + if not owner_email: + stats["owner_missing_in_config"] += 1 + return + + project_pk_value = project_row["id"] + user_row = _find_user_by_email(session, users_table, owner_email) + + if user_row is None: + stats["owner_pending_users"] += 1 + logger.info( + "Owner account not found yet; project owner link will be created later. 
" + f"owner_email={owner_email} project_vertex_id={project_row.get('vertex_id')}" + ) + return + + user_id = user_row.get("id") + if user_id is None: + stats["owner_linking_unavailable"] += 1 + logger.warning(f"Skipping owner linking because users row has no id for owner_email={owner_email}") + return + + existing_link = session.execute( + select(mapping_table) + .where(mapping_table.c["user_id"] == user_id) + .where(mapping_table.c["project_id"] == project_pk_value) + .limit(1) + ).first() + + if existing_link: + stats["owner_links_existing"] += 1 + return + + session.execute( + mapping_table.insert().values( + { + "user_id": user_id, + "project_id": project_pk_value, + } + ) + ) + stats["owner_links_inserted"] += 1 + + +def ingest_static_projects( + database_url: str, + projects_dir: str | Path, + schema: str = "public", + dry_run: bool = False, +) -> dict: + projects = discover_static_projects(projects_dir) + stats = { + "projects_seen": len(projects), + "projects_inserted": 0, + "projects_existing": 0, + "projects_skipped": 0, + "owner_links_inserted": 0, + "owner_links_existing": 0, + "owner_pending_users": 0, + "owner_missing_in_config": 0, + "owner_linking_unavailable": 0, + "owner_immutable_skipped": 0, + } + + if not projects: + logger.info("No static projects discovered; nothing to ingest") + return stats + + engine = create_engine(database_url) + tables = _load_tables(engine, schema=schema) + projects_table = tables["projects"] + users_table = tables["users"] + + with Session(engine) as session: + for project in projects: + project_already_existed = False + existing_row = _find_project_by_vertex_id(session, projects_table, project["project_id"]) + if existing_row is None: + owner_email = project.get("project_owner") + if not owner_email: + raise RuntimeError( + "Cannot insert project: config_file.json is missing project_owner. 
" + f"Parsed config={_project_debug_summary(project)}" + ) + + owner_row = _find_user_by_email(session, users_table, owner_email) + if owner_row is None: + raise RuntimeError( + f"Cannot insert project: owner user does not exist for {owner_email}. " + f"Parsed config={_project_debug_summary(project)}" + ) + + if not dry_run: + insert_values = _build_project_insert_values(project, projects_table, owner_row["id"]) + session.execute(projects_table.insert().values(insert_values)) + existing_row = _find_project_by_vertex_id(session, projects_table, project["project_id"]) + stats["projects_inserted"] += 1 + else: + stats["projects_existing"] += 1 + project_already_existed = True + + if existing_row is None: + stats["projects_skipped"] += 1 + logger.warning( + "Could not resolve projects row after insert attempt; skipping owner mapping for " + f"project_id={project['project_id']}" + ) + continue + + if not dry_run: + if project_already_existed and _project_has_owner_mapping(session, tables, existing_row): + stats["owner_immutable_skipped"] += 1 + continue + _try_link_owner(session, tables, existing_row, project.get("project_owner"), stats) + + if dry_run: + session.rollback() + else: + session.commit() + + logger.info( + "Static project ingestion summary: " + f"seen={stats['projects_seen']} inserted={stats['projects_inserted']} existing={stats['projects_existing']} " + f"owner_links_inserted={stats['owner_links_inserted']} owner_links_existing={stats['owner_links_existing']} " + f"owner_pending_users={stats['owner_pending_users']} owner_immutable_skipped={stats['owner_immutable_skipped']}" + ) + return stats + + +def parse_args(argv=None): + parser = argparse.ArgumentParser(description="Ingest static VERTEX projects into auth/access database tables") + parser.add_argument( + "--projects-dir", + default=None, + help="Directory containing static project folders (defaults to VERTEX_PROJECTS_DIR or projects/)", + ) + parser.add_argument( + "--database-url", + default=None, + 
help="Database URL override. Defaults to vertex configuration resolution.", + ) + parser.add_argument("--schema", default="public", help="Database schema name for projects/users mapping tables") + parser.add_argument("--dry-run", action="store_true", help="Run without writing to the database") + return parser.parse_args(argv) + + +def main(argv=None): + args = parse_args(argv) + + projects_dir = Path(args.projects_dir or os.getenv("VERTEX_PROJECTS_DIR") or "projects") + + database_url = args.database_url or get_database_url() + + try: + ingest_static_projects(database_url=database_url, projects_dir=projects_dir, schema=args.schema, dry_run=args.dry_run) + return 0 + except (RuntimeError, SQLAlchemyError) as exc: + logger.error(f"Static project ingestion failed: {exc}") + return 1 + + +if __name__ == "__main__": + raise SystemExit(main()) From 3a7c9dedb744533f047d8f1c36f667400715a991 Mon Sep 17 00:00:00 2001 From: Alasdair Wilson Date: Thu, 28 May 2026 16:04:13 +0100 Subject: [PATCH 2/4] Add unit tests for project ingestion behavior --- tests/unit/test_project_ingestion.py | 160 +++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 tests/unit/test_project_ingestion.py diff --git a/tests/unit/test_project_ingestion.py b/tests/unit/test_project_ingestion.py new file mode 100644 index 0000000..1972565 --- /dev/null +++ b/tests/unit/test_project_ingestion.py @@ -0,0 +1,160 @@ +import json +from pathlib import Path + +import pytest +from sqlalchemy import Boolean, Column, DateTime, Integer, MetaData, String, Table, UniqueConstraint, create_engine, select +from sqlalchemy.orm import Session + +from vertex.project_ingestion import ingest_static_projects + + +def _write_project(root: Path, folder: str, project_id: str, name: str, owner: str, is_public: bool = True): + project_dir = root / folder + project_dir.mkdir(parents=True, exist_ok=True) + (project_dir / "config_file.json").write_text( + json.dumps( + { + "project_id": project_id, + 
"project_name": name, + "project_owner": owner, + "is_public": is_public, + }, + indent=2, + ) + + "\n" + ) + + +def _prepare_schema(database_url: str): + engine = create_engine(database_url) + metadata = MetaData() + + users = Table( + "users", + metadata, + Column("id", Integer, primary_key=True, autoincrement=True), + Column("email", String, unique=True, nullable=False), + Column("is_admin", Boolean, nullable=False, default=False), + Column("created", DateTime(timezone=True), nullable=True), + Column("updated", DateTime(timezone=True), nullable=True), + Column("last_login", DateTime(timezone=True), nullable=True), + ) + + projects = Table( + "projects", + metadata, + Column("id", Integer, primary_key=True, autoincrement=True), + Column("vertex_id", String, unique=True, nullable=False), + Column("owner_id", Integer, nullable=False), + Column("is_public", Boolean, nullable=False), + Column("created", DateTime(timezone=True), nullable=True), + Column("updated", DateTime(timezone=True), nullable=True), + ) + + mapping = Table( + "user_project_mapping", + metadata, + Column("id", Integer, primary_key=True, autoincrement=True), + Column("user_id", Integer, nullable=False), + Column("project_id", Integer, nullable=False), + Column("created", DateTime(timezone=True), nullable=True), + Column("updated", DateTime(timezone=True), nullable=True), + UniqueConstraint("user_id", "project_id", name="uq_user_project_mapping"), + ) + + metadata.create_all(engine) + return engine, users, projects, mapping + + +def test_ingest_static_projects_inserts_new_rows_and_maps_existing_owners(tmp_path): + projects_dir = tmp_path / "vertex-projects" + projects_dir.mkdir() + _write_project(projects_dir, "proj-a", "vertex-a", "Vertex A", "owner-a@example.com") + _write_project(projects_dir, "proj-b", "vertex-b", "Vertex B", "owner-b@example.com") + + database_url = f"sqlite+pysqlite:///{tmp_path / 'auth.sqlite'}" + engine, users, projects, mapping = _prepare_schema(database_url) + + with 
Session(engine) as session: + session.execute(users.insert().values(email="owner-a@example.com", is_admin=False)) + session.execute(users.insert().values(email="owner-b@example.com", is_admin=False)) + session.commit() + + stats = ingest_static_projects(database_url=database_url, projects_dir=projects_dir, schema="public") + + assert stats["projects_seen"] == 2 + assert stats["projects_inserted"] == 2 + assert stats["projects_existing"] == 0 + assert stats["owner_links_inserted"] == 2 + assert stats["owner_pending_users"] == 0 + + with Session(engine) as session: + db_projects = session.execute(select(projects.c.vertex_id)).scalars().all() + assert sorted(db_projects) == ["vertex-a", "vertex-b"] + + mapping_rows = session.execute(select(mapping.c.user_id, mapping.c.project_id)).all() + assert len(mapping_rows) == 2 + + +def test_ingest_static_projects_skips_completely_invalid_json_and_exits_early(tmp_path): + projects_dir = tmp_path / "vertex-projects" + projects_dir.mkdir() + broken_project_dir = projects_dir / "proj-bad" + broken_project_dir.mkdir() + (broken_project_dir / "config_file.json").write_text("{ this is not valid json") + + database_url = f"sqlite+pysqlite:///{tmp_path / 'auth.sqlite'}" + engine, users, projects, mapping = _prepare_schema(database_url) + + stats = ingest_static_projects(database_url=database_url, projects_dir=projects_dir, schema="public") + + assert stats["projects_seen"] == 0 + assert stats["projects_inserted"] == 0 + assert stats["projects_existing"] == 0 + + with Session(engine) as session: + assert session.execute(select(users.c.id)).first() is None + assert session.execute(select(projects.c.id)).first() is None + assert session.execute(select(mapping.c.id)).first() is None + + +def test_ingest_static_projects_errors_when_owner_user_is_missing_for_new_project(tmp_path): + projects_dir = tmp_path / "vertex-projects" + projects_dir.mkdir() + _write_project(projects_dir, "proj-a", "vertex-a", "Vertex A", "owner-a@example.com") + + 
database_url = f"sqlite+pysqlite:///{tmp_path / 'auth.sqlite'}" + _prepare_schema(database_url) + + with pytest.raises(RuntimeError, match="owner user does not exist"): + ingest_static_projects(database_url=database_url, projects_dir=projects_dir, schema="public") + + +def test_ingest_static_projects_keeps_existing_owner_mapping_immutable(tmp_path): + projects_dir = tmp_path / "vertex-projects" + projects_dir.mkdir() + _write_project(projects_dir, "proj-a", "vertex-a", "Vertex A", "owner-a@example.com") + + database_url = f"sqlite+pysqlite:///{tmp_path / 'auth.sqlite'}" + engine, users, projects, mapping = _prepare_schema(database_url) + + with Session(engine) as session: + session.execute(users.insert().values(email="owner-a@example.com", is_admin=False)) + session.execute(users.insert().values(email="owner-b@example.com", is_admin=False)) + session.commit() + + first_stats = ingest_static_projects(database_url=database_url, projects_dir=projects_dir, schema="public") + assert first_stats["owner_links_inserted"] == 1 + + _write_project(projects_dir, "proj-a", "vertex-a", "Vertex A", "owner-b@example.com") + second_stats = ingest_static_projects(database_url=database_url, projects_dir=projects_dir, schema="public") + assert second_stats["projects_existing"] == 1 + assert second_stats["owner_links_inserted"] == 0 + assert second_stats["owner_immutable_skipped"] == 1 + + with Session(engine) as session: + project_pk = session.execute(select(projects.c.id).where(projects.c.vertex_id == "vertex-a")).scalar_one() + mapping_rows = session.execute( + select(mapping.c.user_id, mapping.c.project_id).where(mapping.c.project_id == project_pk) + ).all() + assert len(mapping_rows) == 1 From 0e56ef57b9cef0fa8bb10b162eed4a2bc61e2a03 Mon Sep 17 00:00:00 2001 From: Alasdair Wilson Date: Thu, 28 May 2026 16:04:31 +0100 Subject: [PATCH 3/4] Document external projects sync and ingestion --- README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/README.md 
b/README.md index 07d2484..f9a8ca5 100644 --- a/README.md +++ b/README.md @@ -76,6 +76,22 @@ Both paths are configurable with: - `VERTEX_PROJECTS_DIR` +Static projects can be sourced from an external repository (`ISARICResearch/VERTEX-projects`) and synced on deploy plus hourly via cron using: + +- `scripts/sync_projects_repo.sh` +- `scripts/install_projects_sync_cron.sh` + +After each sync, VERTEX can ingest static project access metadata into the auth database (`public.projects` and `public.user_project_mapping`) using: + +- `python -m vertex.project_ingestion --projects-dir "$VERTEX_PROJECTS_DIR"` + +Ingestion behavior is: + +- inserts only projects that do not already exist by `vertex_id`/`project_id` +- does not overwrite existing project rows +- links `project_owner` to project when the owner user exists +- if the owner user does not exist yet: a new project is not inserted and the run exits with an error; an existing project without an owner mapping is logged and linking is retried on the next run + For prebuilt/static projects, `config_file.json` should include: - `project_name` From d0fce03034ba56db49e232d0fb6fe9332b72211d Mon Sep 17 00:00:00 2001 From: Alasdair Wilson Date: Thu, 28 May 2026 16:07:35 +0100 Subject: [PATCH 4/4] add timestamps to project sync/deploy logs --- .github/workflows/deploy.yml | 6 +++++- scripts/install_projects_sync_cron.sh | 2 +- scripts/sync_projects_repo.sh | 6 ++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 85aaf5e..107a7d9 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -95,6 +95,8 @@ jobs: PROJECTS_REPO_BRANCH="${PROJECTS_REPO_BRANCH:-main}" PROJECTS_GIT_SSH_KEY_PATH="${PROJECTS_GIT_SSH_KEY_PATH:-/root/.ssh/vertex_projects_deploy_key}" PROJECTS_GIT_KNOWN_HOSTS_PATH="${PROJECTS_GIT_KNOWN_HOSTS_PATH:-/root/.ssh/known_hosts}" + start_ts="$(date -u --iso-8601=seconds)" + echo "[${start_ts}] Sync started: repo=${PROJECTS_REPO_URL} branch=${PROJECTS_REPO_BRANCH} dir=${PROJECTS_DIR}" export GIT_SSH_COMMAND="ssh -i
${PROJECTS_GIT_SSH_KEY_PATH} -o IdentitiesOnly=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${PROJECTS_GIT_KNOWN_HOSTS_PATH}" mkdir -p "${PROJECTS_DIR}" if [ -d "${PROJECTS_DIR}/.git" ]; then @@ -105,6 +107,8 @@ jobs: rm -rf "${PROJECTS_DIR}" git clone --branch "${PROJECTS_REPO_BRANCH}" "${PROJECTS_REPO_URL}" "${PROJECTS_DIR}" fi + end_ts="$(date -u --iso-8601=seconds)" + echo "[${end_ts}] Sync finished successfully" SYNC chmod +x "${SYNC_SCRIPT_PATH}" @@ -119,7 +123,7 @@ jobs: PROJECTS_GIT_SSH_KEY_PATH=/root/.ssh/vertex_projects_deploy_key PROJECTS_GIT_KNOWN_HOSTS_PATH=/root/.ssh/known_hosts VERTEX_SYNC_CONTAINER_NAME=isaric-vertex - 0 * * * * root ${SYNC_SCRIPT_PATH} && docker exec ${VERTEX_SYNC_CONTAINER_NAME} python -m vertex.project_ingestion --projects-dir ${VERTEX_PROJECTS_DIR} >> /var/log/vertex-projects-sync.log 2>&1 + 0 * * * * root { ts=$(date -u --iso-8601=seconds); echo "[${ts}] vertex-projects cron run started"; ${SYNC_SCRIPT_PATH}; sync_rc=$?; if [ ${sync_rc} -ne 0 ]; then ts=$(date -u --iso-8601=seconds); echo "[${ts}] sync failed rc=${sync_rc}"; exit ${sync_rc}; fi; ts=$(date -u --iso-8601=seconds); echo "[${ts}] ingestion started container=${VERTEX_SYNC_CONTAINER_NAME} projects_dir=${VERTEX_PROJECTS_DIR}"; docker exec ${VERTEX_SYNC_CONTAINER_NAME} python -m vertex.project_ingestion --projects-dir ${VERTEX_PROJECTS_DIR}; ingest_rc=$?; ts=$(date -u --iso-8601=seconds); echo "[${ts}] vertex-projects cron run finished rc=${ingest_rc}"; exit ${ingest_rc}; } >> /var/log/vertex-projects-sync.log 2>&1 CRON chmod 644 /etc/cron.d/vertex-projects-sync diff --git a/scripts/install_projects_sync_cron.sh b/scripts/install_projects_sync_cron.sh index 56c8ee9..22e9c2f 100755 --- a/scripts/install_projects_sync_cron.sh +++ b/scripts/install_projects_sync_cron.sh @@ -14,7 +14,7 @@ VERTEX_PROJECTS_DIR=/opt/vertex-projects PROJECTS_GIT_SSH_KEY_PATH=/root/.ssh/vertex_projects_deploy_key PROJECTS_GIT_KNOWN_HOSTS_PATH=/root/.ssh/known_hosts 
VERTEX_SYNC_CONTAINER_NAME=isaric-vertex -0 * * * * root ${SYNC_SCRIPT_PATH} && docker exec ${VERTEX_SYNC_CONTAINER_NAME} python -m vertex.project_ingestion --projects-dir ${VERTEX_PROJECTS_DIR} >> ${CRON_LOG_PATH} 2>&1 +0 * * * * root { ts=$(date -u --iso-8601=seconds); echo "[${ts}] vertex-projects cron run started"; ${SYNC_SCRIPT_PATH}; sync_rc=$?; if [ ${sync_rc} -ne 0 ]; then ts=$(date -u --iso-8601=seconds); echo "[${ts}] sync failed rc=${sync_rc}"; exit ${sync_rc}; fi; ts=$(date -u --iso-8601=seconds); echo "[${ts}] ingestion started container=${VERTEX_SYNC_CONTAINER_NAME} projects_dir=${VERTEX_PROJECTS_DIR}"; docker exec ${VERTEX_SYNC_CONTAINER_NAME} python -m vertex.project_ingestion --projects-dir ${VERTEX_PROJECTS_DIR}; ingest_rc=$?; ts=$(date -u --iso-8601=seconds); echo "[${ts}] vertex-projects cron run finished rc=${ingest_rc}"; exit ${ingest_rc}; } >> ${CRON_LOG_PATH} 2>&1 EOF chmod 644 "${CRON_FILE_PATH}" diff --git a/scripts/sync_projects_repo.sh b/scripts/sync_projects_repo.sh index 733d98b..5d84d36 100755 --- a/scripts/sync_projects_repo.sh +++ b/scripts/sync_projects_repo.sh @@ -7,6 +7,9 @@ PROJECTS_REPO_BRANCH="${PROJECTS_REPO_BRANCH:-main}" PROJECTS_GIT_SSH_KEY_PATH="${PROJECTS_GIT_SSH_KEY_PATH:-/root/.ssh/vertex_projects_deploy_key}" PROJECTS_GIT_KNOWN_HOSTS_PATH="${PROJECTS_GIT_KNOWN_HOSTS_PATH:-/root/.ssh/known_hosts}" +start_ts="$(date -u --iso-8601=seconds)" +echo "[${start_ts}] Sync started: repo=${PROJECTS_REPO_URL} branch=${PROJECTS_REPO_BRANCH} dir=${PROJECTS_DIR}" + if [[ ! -f "${PROJECTS_GIT_SSH_KEY_PATH}" ]]; then echo "Missing SSH deploy key: ${PROJECTS_GIT_SSH_KEY_PATH}" >&2 exit 1 @@ -26,3 +29,6 @@ else fi echo "Projects repo synced at ${PROJECTS_DIR} (branch: ${PROJECTS_REPO_BRANCH})" + +end_ts="$(date -u --iso-8601=seconds)" +echo "[${end_ts}] Sync finished successfully"