Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 16 additions & 17 deletions .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,6 @@ jobs:
env:
ECR_REPO: ${{ secrets.ECR_REPOSITORY }}
INSTANCE_ID: ${{ secrets.EC2_INSTANCE_ID }}
PROJECTS_REPO_BRANCH: ${{ vars.PROJECTS_REPO_BRANCH }}
PROJECTS_GIT_SSH_KEY_PATH: ${{ vars.PROJECTS_GIT_SSH_KEY_PATH }}
PROJECTS_GIT_KNOWN_HOSTS_PATH: ${{ vars.PROJECTS_GIT_KNOWN_HOSTS_PATH }}
run: |
set -euo pipefail

Expand All @@ -57,10 +54,10 @@ jobs:

PROJECTS_DIR="/opt/vertex-projects"
PROJECTS_REPO_URL="git@github.com:ISARICResearch/VERTEX-projects.git"
PROJECTS_REPO_BRANCH="${PROJECTS_REPO_BRANCH:-main}"
PROJECTS_REPO_BRANCH="main"
SYNC_SCRIPT_PATH="/usr/local/bin/vertex-sync-projects.sh"
PROJECTS_GIT_SSH_KEY_PATH="${PROJECTS_GIT_SSH_KEY_PATH:-/root/.ssh/vertex_projects_deploy_key}"
PROJECTS_GIT_KNOWN_HOSTS_PATH="${PROJECTS_GIT_KNOWN_HOSTS_PATH:-/root/.ssh/known_hosts}"
PROJECTS_GIT_SSH_KEY_PATH="/root/.ssh/vertex_projects_deploy_key"
PROJECTS_GIT_KNOWN_HOSTS_PATH="/root/.ssh/known_hosts"

# 1) write the indented heredoc to a temp file (variables expanded on runner)
cat > /tmp/ssm-script.indented <<INDENTED
Expand Down Expand Up @@ -94,10 +91,12 @@ jobs:
#!/bin/bash
set -euo pipefail
PROJECTS_DIR="${VERTEX_PROJECTS_DIR:-/opt/vertex-projects}"
PROJECTS_REPO_URL="git@github.com:ISARICResearch/VERTEX-projects.git"
PROJECTS_REPO_URL="${PROJECTS_REPO_URL:-git@github.com:ISARICResearch/VERTEX-projects.git}"
PROJECTS_REPO_BRANCH="${PROJECTS_REPO_BRANCH:-main}"
PROJECTS_GIT_SSH_KEY_PATH="${PROJECTS_GIT_SSH_KEY_PATH:-/root/.ssh/vertex_projects_deploy_key}"
PROJECTS_GIT_KNOWN_HOSTS_PATH="${PROJECTS_GIT_KNOWN_HOSTS_PATH:-/root/.ssh/known_hosts}"
start_ts="$(date -u --iso-8601=seconds)"
echo "[${start_ts}] Sync started: repo=${PROJECTS_REPO_URL} branch=${PROJECTS_REPO_BRANCH} dir=${PROJECTS_DIR}"
export GIT_SSH_COMMAND="ssh -i ${PROJECTS_GIT_SSH_KEY_PATH} -o IdentitiesOnly=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${PROJECTS_GIT_KNOWN_HOSTS_PATH}"
mkdir -p "${PROJECTS_DIR}"
if [ -d "${PROJECTS_DIR}/.git" ]; then
Expand All @@ -108,24 +107,23 @@ jobs:
rm -rf "${PROJECTS_DIR}"
git clone --branch "${PROJECTS_REPO_BRANCH}" "${PROJECTS_REPO_URL}" "${PROJECTS_DIR}"
fi
end_ts="$(date -u --iso-8601=seconds)"
echo "[${end_ts}] Sync finished successfully"
Copy link
Copy Markdown
Member

@sr-murthy sr-murthy May 29, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might be useful for debugging to include the time diff end_ts - start_ts in the echo message to get an idea of sync times, so something like:

time_diff=$(echo "$(date --date=$end_ts +%s) - $(date --date=$start_ts +%s)" |  bc -l)
echo "[${end_ts}] Sync finished successfully in $time_diff seconds"

Had to install GNU coreutils on Mac to test the time diff with gdate (not date which errors), but this should work.

SYNC
chmod +x "${SYNC_SCRIPT_PATH}"

export PROJECTS_REPO_BRANCH
export VERTEX_PROJECTS_DIR="${PROJECTS_DIR}"
export PROJECTS_GIT_SSH_KEY_PATH
export PROJECTS_GIT_KNOWN_HOSTS_PATH
"${SYNC_SCRIPT_PATH}"

cat > /etc/cron.d/vertex-projects-sync <<CRON
SHELL=/bin/bash
PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
PROJECTS_REPO_URL=git@github.com:ISARICResearch/VERTEX-projects.git
PROJECTS_REPO_BRANCH=${PROJECTS_REPO_BRANCH:-main}
VERTEX_PROJECTS_DIR=${PROJECTS_DIR}
PROJECTS_GIT_SSH_KEY_PATH=${PROJECTS_GIT_SSH_KEY_PATH:-/root/.ssh/vertex_projects_deploy_key}
PROJECTS_GIT_KNOWN_HOSTS_PATH=${PROJECTS_GIT_KNOWN_HOSTS_PATH:-/root/.ssh/known_hosts}
0 * * * * root ${SYNC_SCRIPT_PATH} >> /var/log/vertex-projects-sync.log 2>&1
PROJECTS_REPO_BRANCH=main
VERTEX_PROJECTS_DIR=/opt/vertex-projects
PROJECTS_GIT_SSH_KEY_PATH=/root/.ssh/vertex_projects_deploy_key
PROJECTS_GIT_KNOWN_HOSTS_PATH=/root/.ssh/known_hosts
VERTEX_SYNC_CONTAINER_NAME=isaric-vertex
0 * * * * root { ts=$(date -u --iso-8601=seconds); echo "[${ts}] vertex-projects cron run started"; ${SYNC_SCRIPT_PATH}; sync_rc=$?; if [ ${sync_rc} -ne 0 ]; then ts=$(date -u --iso-8601=seconds); echo "[${ts}] sync failed rc=${sync_rc}"; exit ${sync_rc}; fi; ts=$(date -u --iso-8601=seconds); echo "[${ts}] ingestion started container=${VERTEX_SYNC_CONTAINER_NAME} projects_dir=${VERTEX_PROJECTS_DIR}"; docker exec ${VERTEX_SYNC_CONTAINER_NAME} python -m vertex.project_ingestion --projects-dir ${VERTEX_PROJECTS_DIR}; ingest_rc=$?; ts=$(date -u --iso-8601=seconds); echo "[${ts}] vertex-projects cron run finished rc=${ingest_rc}"; exit ${ingest_rc}; } >> /var/log/vertex-projects-sync.log 2>&1
CRON
chmod 644 /etc/cron.d/vertex-projects-sync

Expand All @@ -142,7 +140,6 @@ jobs:
docker run -d --restart unless-stopped \
--env-file /etc/environment \
-e VERTEX_PROJECTS_DIR="${PROJECTS_DIR}" \
-e VERTEX_PRELOAD_PROJECTS="false" \
-v "${PROJECTS_DIR}:${PROJECTS_DIR}:ro" \
--name isaric-vertex \
-p 8050:8050 \
Expand All @@ -157,6 +154,8 @@ jobs:
exit 1
fi

docker exec isaric-vertex python -m vertex.project_ingestion --projects-dir "${PROJECTS_DIR}"

# cleanup images/volumes
docker image prune -f || true
docker volume prune -f || true
Expand Down
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,22 @@ Both paths are configurable with:

- `VERTEX_PROJECTS_DIR`

Static projects can be sourced from an external repository (`ISARICResearch/VERTEX-projects`) and synced on deploy plus hourly via cron using:

- `scripts/sync_projects_repo.sh`
- `scripts/install_projects_sync_cron.sh`

After each sync, VERTEX can ingest static project access metadata into the auth database (`public.projects` and `public.user_project_mapping`) using:

- `python -m vertex.project_ingestion --projects-dir "$VERTEX_PROJECTS_DIR"`

Ingestion behavior is:

- inserts only projects that do not already exist by `vertex_id`/`project_id`
Comment thread
sr-murthy marked this conversation as resolved.
- does not overwrite existing project rows
- links `project_owner` to the project when the owner user exists
Comment thread
sr-murthy marked this conversation as resolved.
- if the owner user does not exist yet, logs and retries on the next run (non-blocking)

For prebuilt/static projects, `config_file.json` should include:

- `project_name`
Expand Down
17 changes: 9 additions & 8 deletions scripts/install_projects_sync_cron.sh
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
#!/usr/bin/env bash
set -euo pipefail

# Installs the cron.d entry that, once an hour, runs the projects sync script
# and then runs project ingestion inside the application container, appending
# all output to the cron log file.
SYNC_SCRIPT_PATH="${SYNC_SCRIPT_PATH:-/usr/local/bin/vertex-sync-projects.sh}"
CRON_FILE_PATH="${CRON_FILE_PATH:-/etc/cron.d/vertex-projects-sync}"
CRON_LOG_PATH="${CRON_LOG_PATH:-/var/log/vertex-projects-sync.log}"
SYNC_SCRIPT_PATH="/usr/local/bin/vertex-sync-projects.sh"
CRON_FILE_PATH="/etc/cron.d/vertex-projects-sync"
CRON_LOG_PATH="/var/log/vertex-projects-sync.log"

# The heredoc delimiter is unquoted, so this shell expands every unescaped $
# while writing the file. Only SYNC_SCRIPT_PATH and CRON_LOG_PATH are meant to
# be resolved now; everything the job must evaluate on each run (date, $?, ts,
# sync_rc, ingest_rc and the variables defined in the cron file itself) is
# written as \$ so it reaches the cron file literally. Left unescaped, those
# references abort this script under `set -u` (e.g. "ts: unbound variable").
cat > "${CRON_FILE_PATH}" <<EOF
SHELL=/bin/bash
PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
PROJECTS_REPO_URL=git@github.com:ISARICResearch/VERTEX-projects.git
PROJECTS_REPO_BRANCH=${PROJECTS_REPO_BRANCH:-main}
VERTEX_PROJECTS_DIR=${VERTEX_PROJECTS_DIR:-/opt/vertex-projects}
PROJECTS_GIT_SSH_KEY_PATH=${PROJECTS_GIT_SSH_KEY_PATH:-/root/.ssh/vertex_projects_deploy_key}
PROJECTS_GIT_KNOWN_HOSTS_PATH=${PROJECTS_GIT_KNOWN_HOSTS_PATH:-/root/.ssh/known_hosts}
0 * * * * root ${SYNC_SCRIPT_PATH} >> ${CRON_LOG_PATH} 2>&1
PROJECTS_REPO_BRANCH=main
VERTEX_PROJECTS_DIR=/opt/vertex-projects
PROJECTS_GIT_SSH_KEY_PATH=/root/.ssh/vertex_projects_deploy_key
PROJECTS_GIT_KNOWN_HOSTS_PATH=/root/.ssh/known_hosts
VERTEX_SYNC_CONTAINER_NAME=isaric-vertex
0 * * * * root { ts=\$(date -u --iso-8601=seconds); echo "[\${ts}] vertex-projects cron run started"; ${SYNC_SCRIPT_PATH}; sync_rc=\$?; if [ \${sync_rc} -ne 0 ]; then ts=\$(date -u --iso-8601=seconds); echo "[\${ts}] sync failed rc=\${sync_rc}"; exit \${sync_rc}; fi; ts=\$(date -u --iso-8601=seconds); echo "[\${ts}] ingestion started container=\${VERTEX_SYNC_CONTAINER_NAME} projects_dir=\${VERTEX_PROJECTS_DIR}"; docker exec \${VERTEX_SYNC_CONTAINER_NAME} python -m vertex.project_ingestion --projects-dir \${VERTEX_PROJECTS_DIR}; ingest_rc=\$?; ts=\$(date -u --iso-8601=seconds); echo "[\${ts}] vertex-projects cron run finished rc=\${ingest_rc}"; exit \${ingest_rc}; } >> ${CRON_LOG_PATH} 2>&1
EOF

chmod 644 "${CRON_FILE_PATH}"
Expand Down
8 changes: 7 additions & 1 deletion scripts/sync_projects_repo.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@
set -euo pipefail

PROJECTS_DIR="${VERTEX_PROJECTS_DIR:-/opt/vertex-projects}"
PROJECTS_REPO_URL="git@github.com:ISARICResearch/VERTEX-projects.git"
PROJECTS_REPO_URL="${PROJECTS_REPO_URL:-git@github.com:ISARICResearch/VERTEX-projects.git}"
PROJECTS_REPO_BRANCH="${PROJECTS_REPO_BRANCH:-main}"
PROJECTS_GIT_SSH_KEY_PATH="${PROJECTS_GIT_SSH_KEY_PATH:-/root/.ssh/vertex_projects_deploy_key}"
PROJECTS_GIT_KNOWN_HOSTS_PATH="${PROJECTS_GIT_KNOWN_HOSTS_PATH:-/root/.ssh/known_hosts}"

start_ts="$(date -u --iso-8601=seconds)"
echo "[${start_ts}] Sync started: repo=${PROJECTS_REPO_URL} branch=${PROJECTS_REPO_BRANCH} dir=${PROJECTS_DIR}"

if [[ ! -f "${PROJECTS_GIT_SSH_KEY_PATH}" ]]; then
echo "Missing SSH deploy key: ${PROJECTS_GIT_SSH_KEY_PATH}" >&2
exit 1
Expand All @@ -26,3 +29,6 @@ else
fi

echo "Projects repo synced at ${PROJECTS_DIR} (branch: ${PROJECTS_REPO_BRANCH})"

end_ts="$(date -u --iso-8601=seconds)"
echo "[${end_ts}] Sync finished successfully"
160 changes: 160 additions & 0 deletions tests/unit/test_project_ingestion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
import json
from pathlib import Path

import pytest
from sqlalchemy import Boolean, Column, DateTime, Integer, MetaData, String, Table, UniqueConstraint, create_engine, select
from sqlalchemy.orm import Session

from vertex.project_ingestion import ingest_static_projects


def _write_project(root: Path, folder: str, project_id: str, name: str, owner: str, is_public: bool = True):
project_dir = root / folder
project_dir.mkdir(parents=True, exist_ok=True)
(project_dir / "config_file.json").write_text(
json.dumps(
{
"project_id": project_id,
"project_name": name,
"project_owner": owner,
"is_public": is_public,
},
indent=2,
)
+ "\n"
)


def _prepare_schema(database_url: str):
    """Create the tables the ingestion tests run against.

    Defines ``users``, ``projects`` and ``user_project_mapping`` on a fresh
    ``MetaData``, creates them on an engine for ``database_url`` and returns
    ``(engine, users, projects, mapping)``.
    """

    def _id_column():
        # Every table uses the same auto-incrementing integer primary key.
        return Column("id", Integer, primary_key=True, autoincrement=True)

    def _timestamp_columns(*names):
        # Columns cannot be shared between tables, so build new ones per call.
        return [Column(column_name, DateTime(timezone=True), nullable=True) for column_name in names]

    schema_metadata = MetaData()

    users_table = Table(
        "users",
        schema_metadata,
        _id_column(),
        Column("email", String, unique=True, nullable=False),
        Column("is_admin", Boolean, nullable=False, default=False),
        *_timestamp_columns("created", "updated", "last_login"),
    )

    projects_table = Table(
        "projects",
        schema_metadata,
        _id_column(),
        Column("vertex_id", String, unique=True, nullable=False),
        Column("owner_id", Integer, nullable=False),
        Column("is_public", Boolean, nullable=False),
        *_timestamp_columns("created", "updated"),
    )

    mapping_table = Table(
        "user_project_mapping",
        schema_metadata,
        _id_column(),
        Column("user_id", Integer, nullable=False),
        Column("project_id", Integer, nullable=False),
        *_timestamp_columns("created", "updated"),
        UniqueConstraint("user_id", "project_id", name="uq_user_project_mapping"),
    )

    db_engine = create_engine(database_url)
    schema_metadata.create_all(db_engine)
    return db_engine, users_table, projects_table, mapping_table


def test_ingest_static_projects_inserts_new_rows_and_maps_existing_owners(tmp_path):
    """Two new projects whose owner users already exist are inserted and linked."""
    root = tmp_path / "vertex-projects"
    root.mkdir()
    fixtures = (
        ("proj-a", "vertex-a", "Vertex A", "owner-a@example.com"),
        ("proj-b", "vertex-b", "Vertex B", "owner-b@example.com"),
    )
    for folder, vertex_id, title, owner_email in fixtures:
        _write_project(root, folder, vertex_id, title, owner_email)

    database_url = f"sqlite+pysqlite:///{tmp_path / 'auth.sqlite'}"
    engine, users, projects, mapping = _prepare_schema(database_url)

    # Both owners must exist before ingestion runs.
    with Session(engine) as session:
        for _, _, _, owner_email in fixtures:
            session.execute(users.insert().values(email=owner_email, is_admin=False))
        session.commit()

    stats = ingest_static_projects(database_url=database_url, projects_dir=root, schema="public")

    expected_stats = {
        "projects_seen": 2,
        "projects_inserted": 2,
        "projects_existing": 0,
        "owner_links_inserted": 2,
        "owner_pending_users": 0,
    }
    for key, expected in expected_stats.items():
        assert stats[key] == expected

    with Session(engine) as session:
        stored_vertex_ids = session.execute(select(projects.c.vertex_id)).scalars().all()
        assert sorted(stored_vertex_ids) == ["vertex-a", "vertex-b"]

        owner_links = session.execute(select(mapping.c.user_id, mapping.c.project_id)).all()
        assert len(owner_links) == 2


def test_ingest_static_projects_skips_completely_invalid_json_and_exits_early(tmp_path):
    """A project whose config file is not valid JSON is neither counted nor stored."""
    root = tmp_path / "vertex-projects"
    bad_project = root / "proj-bad"
    bad_project.mkdir(parents=True)
    (bad_project / "config_file.json").write_text("{ this is not valid json")

    database_url = f"sqlite+pysqlite:///{tmp_path / 'auth.sqlite'}"
    engine, users, projects, mapping = _prepare_schema(database_url)

    stats = ingest_static_projects(database_url=database_url, projects_dir=root, schema="public")

    for counter in ("projects_seen", "projects_inserted", "projects_existing"):
        assert stats[counter] == 0

    # None of the tables may have received a row.
    with Session(engine) as session:
        for table in (users, projects, mapping):
            assert session.execute(select(table.c.id)).first() is None


def test_ingest_static_projects_errors_when_owner_user_is_missing_for_new_project(tmp_path):
    """Ingesting a new project whose owner has no user row raises ``RuntimeError``."""
    # NOTE(review): the README describes a missing owner as logged and retried
    # on the next run (non-blocking), while this test expects an error for a
    # new project -- confirm which contract is intended.
    root = tmp_path / "vertex-projects"
    root.mkdir()
    _write_project(root, "proj-a", "vertex-a", "Vertex A", "owner-a@example.com")

    sqlite_file = tmp_path / "auth.sqlite"
    database_url = f"sqlite+pysqlite:///{sqlite_file}"
    # The schema is created, but the users table is left empty.
    _prepare_schema(database_url)

    with pytest.raises(RuntimeError, match="owner user does not exist"):
        ingest_static_projects(database_url=database_url, projects_dir=root, schema="public")


def test_ingest_static_projects_keeps_existing_owner_mapping_immutable(tmp_path):
    """Re-ingesting a project with a different ``project_owner`` keeps the first owner link.

    The project is ingested once with owner-a, its config file is rewritten to
    name owner-b, and ingestion runs again. The second run must report the
    project as existing, insert no new owner link, and leave the single
    mapping row pointing at owner-a.
    """
    projects_dir = tmp_path / "vertex-projects"
    projects_dir.mkdir()
    _write_project(projects_dir, "proj-a", "vertex-a", "Vertex A", "owner-a@example.com")

    database_url = f"sqlite+pysqlite:///{tmp_path / 'auth.sqlite'}"
    engine, users, projects, mapping = _prepare_schema(database_url)

    with Session(engine) as session:
        session.execute(users.insert().values(email="owner-a@example.com", is_admin=False))
        session.execute(users.insert().values(email="owner-b@example.com", is_admin=False))
        session.commit()

    first_stats = ingest_static_projects(database_url=database_url, projects_dir=projects_dir, schema="public")
    assert first_stats["owner_links_inserted"] == 1

    # Same project id, different owner: the existing link must not change.
    _write_project(projects_dir, "proj-a", "vertex-a", "Vertex A", "owner-b@example.com")
    second_stats = ingest_static_projects(database_url=database_url, projects_dir=projects_dir, schema="public")
    assert second_stats["projects_existing"] == 1
    assert second_stats["owner_links_inserted"] == 0
    assert second_stats["owner_immutable_skipped"] == 1

    with Session(engine) as session:
        project_pk = session.execute(select(projects.c.id).where(projects.c.vertex_id == "vertex-a")).scalar_one()
        original_owner_pk = session.execute(
            select(users.c.id).where(users.c.email == "owner-a@example.com")
        ).scalar_one()
        mapped_user_ids = session.execute(
            select(mapping.c.user_id).where(mapping.c.project_id == project_pk)
        ).scalars().all()

    # Counting rows alone would also pass if the link had been swapped to
    # owner-b, so check which user the surviving row points at.
    assert mapped_user_ids == [original_owner_pk]
Loading
Loading