diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..15b6da6 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,68 @@ +name: Tests + +on: + push: + branches: [main] + pull_request: + +jobs: + sqlite-tests: + name: SQLite tests (Python ${{ matrix.python-version }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.12", "3.13"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: pip install -e ".[dev]" + + - name: Run tests + run: pytest -q + + postgres-tests: + name: PostgreSQL tests (Python ${{ matrix.python-version }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.12", "3.13"] + + services: + postgres: + image: postgres:16 + env: + POSTGRES_USER: test + POSTGRES_PASSWORD: test + POSTGRES_DB: test_db + ports: + - 55432:5432 + options: >- + --health-cmd "pg_isready -U test -d test_db" + --health-interval 2s + --health-timeout 5s + --health-retries 10 + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies (including postgres extra) + run: pip install -e ".[dev,postgres]" + + - name: Run tests + run: pytest -v + env: + ENGINE_CDM: postgresql+psycopg://test:test@localhost:55432/test_db diff --git a/.gitignore b/.gitignore index 3b77ef3..92ea8da 100644 --- a/.gitignore +++ b/.gitignore @@ -52,6 +52,9 @@ RELATIONSHIP.csv DOMAIN.csv CONCEPT_ANCESTOR.csv CONCEPT_SYNONYM.csv +# Allow committed test fixtures (minimal CSVs, not real Athena downloads) +!tests/fixtures/athena_source/ +!tests/fixtures/athena_source/*.csv data/ *.db-journal vocabulary_files/ @@ -66,4 +69,7 @@ logging/ _temp/ temp/ *.dump -*.bak \ 
No newline at end of file +*.bak +notebooks/ +.dockerignore +docker/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c29534..c43b5ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -91,4 +91,11 @@ - set minimum versions per dependabot (dev and required deps) ## 0.6.2 -- capped maximum `orm-loader` version to avoid pulling in future breaking changes \ No newline at end of file +- capped maximum `orm-loader` version to avoid pulling in future breaking changes + +## 0.6.3 +- fix CSV quote mode for Athena vocabulary loading: switch from `literal` to `auto` to prevent quoted concept names from overflowing `VARCHAR(255)` database columns +- make `chunksize=100_000` the default for `load-vocab-source` (was `None`/disabled); pass `--chunksize 0` to disable chunking explicitly +- **breaking:** `load-vocab-source` CLI now defaults `--merge-strategy` to `replace` (was `upsert`) to match the Python API default and ensure retired concepts are purged on vocabulary refresh; pass `--merge-strategy upsert` to restore the previous behaviour +- **breaking:** CLI entry point renamed from `omop-maint` to `omop-alchemy`; update any scripts or aliases accordingly (saved `.omop-maint.toml` defaults files are unaffected) +- remove stale notebooks from repository diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml deleted file mode 100644 index bfc1c9b..0000000 --- a/docker/docker-compose.yml +++ /dev/null @@ -1,94 +0,0 @@ -volumes: - postgres-data: - name: postgres-data - pgadmin-data: - name: pgadmin-data - -networks: - cava-network: - name: cava-network - driver: bridge - -services: - pgadmin: - profiles: [ "pgadmin"] - image: dpage/pgadmin4:latest - restart: unless-stopped - networks: - - cava-network - environment: - PGADMIN_DEFAULT_EMAIL: a@b.c - PGADMIN_DEFAULT_PASSWORD: pwd - SCRIPT_NAME: /pgadmin4 - volumes: - - pgadmin-data:/var/lib/pgadmin - ports: - - "5050:80" - python: - build: ./python - restart: unless-stopped - networks: - - 
cava-network - environment: - POSTGRES_USER: ${POSTGRES_USER} - POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} - POSTGRES_DB: ${POSTGRES_DB} - ENGINE_CDM: postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB} - env_file: - - .env - depends_on: - postgres: - condition: service_healthy - volumes: - - ..:/workspace:rw - command: tail -f /dev/null - postgres: - build: ./postgres - networks: - - cava-network - environment: - POSTGRES_USER: ${POSTGRES_USER} - POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} - POSTGRES_DB: ${POSTGRES_DB} - env_file: - - .env - restart: unless-stopped - volumes: - - ./data:/home/data:rw - - postgres-data:/var/lib/postgresql - - ./custom.conf:/etc/postgresql/custom.conf - healthcheck: - test: ["CMD-SHELL", "pg_isready -U postgres"] - interval: 5s - timeout: 5s - retries: 10 - command: - - postgres - - -c - - include_if_exists=/etc/postgresql/custom.conf - cava-jupyter-notebook: - profiles: [ "jupyter"] - build: ./jupyter - restart: unless-stopped - depends_on: - postgres: - condition: service_healthy - networks: - - cava-network - environment: - JUPYTERHUB_SERVICE_PREFIX: /jupyter/ - JUPYTERHUB_BASE_URL: ${HTTP_TYPE}://${HOST} - env_file: - - .env - volumes: - - ./work:/home/jovyan/work:rw - command: - - jupyter-lab - - --ip=* - - --NotebookApp.token= - - --NotebookApp.password= - - --NotebookApp.base_url=/jupyter - ports: - - "8888:8888" - mem_limit: 12g - shm_size: 4g \ No newline at end of file diff --git a/docker/jupyter/Dockerfile b/docker/jupyter/Dockerfile deleted file mode 100644 index 7a6abc6..0000000 --- a/docker/jupyter/Dockerfile +++ /dev/null @@ -1,27 +0,0 @@ -FROM quay.io/jupyter/minimal-notebook:python-3.13 - -USER root - -# Force uv install location -ENV HOME=/root -ENV PATH="/root/.local/bin:${PATH}" - -# Install uv -RUN curl -LsSf https://astral.sh/uv/install.sh | sh - -# Create uv venv -RUN uv venv /opt/venv -ENV VIRTUAL_ENV=/opt/venv -ENV PATH="/opt/venv/bin:${PATH}" - -# Install Python deps -RUN uv 
pip install omop-alchemy psycopg2-binary pip omop-graph -RUN /opt/venv/bin/python -m pip install ipykernel && \ - /opt/venv/bin/python -m ipykernel install \ - --name uv-venv \ - --display-name "Python (uv venv)" -# Switch back to notebook user -USER jovyan -ENV HOME=/home/jovyan -COPY ./.env /home/jovyan/work/.env -WORKDIR /home/jovyan/work \ No newline at end of file diff --git a/docker/postgres/Dockerfile b/docker/postgres/Dockerfile deleted file mode 100644 index 93f0fba..0000000 --- a/docker/postgres/Dockerfile +++ /dev/null @@ -1,8 +0,0 @@ -#FROM timescale/timescaledb-ha:pg18 -FROM postgres:18 - -# Optional: timezone / locale tweaks -ENV TZ=UTC - -# Expose is informational only -EXPOSE 5432 \ No newline at end of file diff --git a/docker/postgres/custom.conf b/docker/postgres/custom.conf deleted file mode 100644 index 9927308..0000000 --- a/docker/postgres/custom.conf +++ /dev/null @@ -1,10 +0,0 @@ -# Performance tuning for bulk loads -max_wal_size = '20GB' -checkpoint_timeout = '30min' -wal_compression = on - -# Memory -shared_buffers = '6GB' -work_mem = '256MB' -maintenance_work_mem = '2GB' -effective_cache_size = '16GB' \ No newline at end of file diff --git a/docker/python/.dockerignore b/docker/python/.dockerignore deleted file mode 100644 index 4a27e3c..0000000 --- a/docker/python/.dockerignore +++ /dev/null @@ -1,6 +0,0 @@ -.venv -__pycache__ -.git -.gitignore -.env -data \ No newline at end of file diff --git a/docker/python/Dockerfile b/docker/python/Dockerfile deleted file mode 100644 index 6a54075..0000000 --- a/docker/python/Dockerfile +++ /dev/null @@ -1,47 +0,0 @@ -# ---- Stage 1: postgres tools ---- -FROM postgres:18 AS pgtools - -# ---- Stage 2: python ---- -FROM python:3.13 - -ENV PYTHONPYCACHEPREFIX=/tmp/pycache \ - PYTHONUNBUFFERED=1 \ - UV_PROJECT_ENVIRONMENT=/home/vscode/.venv \ - UV_CACHE_DIR=/home/vscode/.cache/uv \ - PATH="/usr/local/bin:/home/vscode/.venv/bin:$PATH" \ - LANG=C.UTF-8 \ - LC_ALL=C.UTF-8 - -# system deps -RUN apt-get 
update && apt-get install -y --no-install-recommends \ - git \ - curl \ - bash \ - bash-completion \ - less \ - vim \ - && rm -rf /var/lib/apt/lists/* - -# copy binaries from pgtools stage -COPY --from=pgtools /usr/lib/postgresql /usr/lib/postgresql -COPY --from=pgtools /usr/lib/aarch64-linux-gnu/libpq* /usr/lib/aarch64-linux-gnu/ - -RUN ln -s /usr/lib/postgresql/18/bin/psql /usr/local/bin/psql \ - && ln -s /usr/lib/postgresql/18/bin/pg_dump /usr/local/bin/pg_dump \ - && ln -s /usr/lib/postgresql/18/bin/pg_restore /usr/local/bin/pg_restore - -# ---- uv install ---- -COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv - -# ---- User setup ---- -RUN useradd -m -s /bin/bash vscode - -WORKDIR /workspace - -# ---- Auto-activate venv ---- -RUN printf '\nif [ -f /home/vscode/.venv/bin/activate ] && [ -z "$VIRTUAL_ENV" ]; then\n . /home/vscode/.venv/bin/activate\nfi\n' >> /home/vscode/.bashrc \ - && chown vscode:vscode /home/vscode/.bashrc - -USER vscode - -CMD ["sleep", "infinity"] \ No newline at end of file diff --git a/docs/advanced/fulltext.md b/docs/advanced/fulltext.md index ab1c531..6cadc08 100644 --- a/docs/advanced/fulltext.md +++ b/docs/advanced/fulltext.md @@ -58,8 +58,8 @@ SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector; To enable the optional full-text sidecars in a PostgreSQL environment: ```bash -omop-maint fulltext install -omop-maint fulltext populate +omop-alchemy fulltext install +omop-alchemy fulltext populate ``` If your running Python process should use the stored sidecar columns through ORM @@ -164,28 +164,28 @@ This is the mode you want when: The maintenance CLI manages the full-text sidecars through: ```bash -omop-maint fulltext install -omop-maint fulltext populate -omop-maint fulltext drop +omop-alchemy fulltext install +omop-alchemy fulltext populate +omop-alchemy fulltext drop ``` Typical workflow: ```bash -omop-maint fulltext install -omop-maint fulltext populate +omop-alchemy fulltext install +omop-alchemy fulltext 
populate ``` If you later reload or update vocabulary data, refresh the stored vectors with: ```bash -omop-maint fulltext populate +omop-alchemy fulltext populate ``` If you want to remove the feature completely: ```bash -omop-maint fulltext drop +omop-alchemy fulltext drop ``` --- @@ -280,7 +280,7 @@ drop lifecycle is only meaningful on PostgreSQL. ## Operational Gotchas - treat the sidecar columns as **derived search state**, not source-of-truth data -- if you bulk-load new vocabulary rows, rerun `omop-maint fulltext populate` +- if you bulk-load new vocabulary rows, rerun `omop-alchemy fulltext populate` - if you use `reconcile-schema`, the sidecar columns and indexes are intentional database additions outside the core OMOP schema - GIN indexes can be expensive to build on large vocabularies, so plan that as a real diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md index ff0f3ea..d8b7a47 100644 --- a/docs/getting-started/installation.md +++ b/docs/getting-started/installation.md @@ -180,14 +180,14 @@ At the database level: Typical maintenance workflow: ```bash -omop-maint fulltext install -omop-maint fulltext populate +omop-alchemy fulltext install +omop-alchemy fulltext populate ``` If you later reload vocabulary data, rerun: ```bash -omop-maint fulltext populate +omop-alchemy fulltext populate ``` For the full design and query patterns, see: diff --git a/docs/getting-started/maintenance.md b/docs/getting-started/maintenance.md index 98b4114..4f8f09a 100644 --- a/docs/getting-started/maintenance.md +++ b/docs/getting-started/maintenance.md @@ -11,7 +11,7 @@ database. ## Entrypoint ```bash -omop-maint --help +omop-alchemy --help python -m omop_alchemy.maintenance.cli --help ``` @@ -33,27 +33,27 @@ Common flags used by many commands: !!! info "Defaults file discovery" - Project-local defaults are stored in `.omop-maint.toml`. + Project-local defaults are stored in `.omop-alchemy.toml`. 
- the CLI looks for the nearest ancestor directory containing `pyproject.toml` - and uses `/.omop-maint.toml` - - if no ancestor project marker is found, it falls back to `./.omop-maint.toml` + and uses `/.omop-alchemy.toml` + - if no ancestor project marker is found, it falls back to `./.omop-alchemy.toml` in the current working directory - to force a fixed path, set `OMOP_MAINT_DEFAULTS_FILE` - - running `omop-maint` from outside your intended project tree may use a different + - running `omop-alchemy` from outside your intended project tree may use a different defaults file than expected ```bash -omop-maint config show -omop-maint config set-overrides --dotenv .env --engine-schema cdm --db-schema public --athena-source ./athena_source -omop-maint config clear-overrides -omop-maint config clear-overrides --db-schema +omop-alchemy config show +omop-alchemy config set-overrides --dotenv .env --engine-schema cdm --db-schema public --athena-source ./athena_source +omop-alchemy config clear-overrides +omop-alchemy config clear-overrides --db-schema ``` Resolution order: 1. explicit CLI flag -2. saved `.omop-maint.toml` default +2. saved `.omop-alchemy.toml` default 3. command fallback `engine_schema` selects the configured engine URL (`ENGINE_` or `ENGINE`). @@ -99,49 +99,49 @@ user-facing error. 
### Inspect ```bash -omop-maint info -omop-maint doctor -omop-maint doctor --deep +omop-alchemy info +omop-alchemy doctor +omop-alchemy doctor --deep ``` ### Schema ```bash -omop-maint reconcile-schema -omop-maint create-missing-tables --dry-run -omop-maint create-missing-tables +omop-alchemy reconcile-schema +omop-alchemy create-missing-tables --dry-run +omop-alchemy create-missing-tables ``` ### Vocabulary ```bash -omop-maint load-vocab-source -omop-maint load-vocab-source --athena-source ./athena_source --dry-run +omop-alchemy load-vocab-source +omop-alchemy load-vocab-source --athena-source ./athena_source --dry-run ``` ### Bulk reload helpers ```bash -omop-maint foreign-keys disable -omop-maint indexes disable -omop-maint truncate-tables --scope clinical --restart-identities --yes +omop-alchemy foreign-keys disable +omop-alchemy indexes disable +omop-alchemy truncate-tables --scope clinical --restart-identities --yes ``` After ETL: ```bash -omop-maint reset-sequences -omop-maint indexes enable -omop-maint foreign-keys enable --strict -omop-maint analyze-tables --scope clinical +omop-alchemy reset-sequences +omop-alchemy indexes enable +omop-alchemy foreign-keys enable --strict +omop-alchemy analyze-tables --scope clinical ``` ### Full-text sidecars ```bash -omop-maint fulltext install -omop-maint fulltext populate -omop-maint fulltext drop +omop-alchemy fulltext install +omop-alchemy fulltext populate +omop-alchemy fulltext drop ``` For query-side usage and optional ORM metadata registration, see @@ -150,8 +150,8 @@ For query-side usage and optional ORM metadata registration, see ### Backup and restore ```bash -omop-maint backup-database --engine-schema source --output-path ./cdm.dump -omop-maint restore-database ./cdm.dump --format custom --engine-schema target +omop-alchemy backup-database --engine-schema source --output-path ./cdm.dump +omop-alchemy restore-database ./cdm.dump --format custom --engine-schema target ``` --- @@ -176,8 +176,8 @@ omop-maint 
restore-database ./cdm.dump --format custom --engine-schema target ## Help ```bash -omop-maint --help -omop-maint doctor --help -omop-maint fulltext --help -omop-maint config --help +omop-alchemy --help +omop-alchemy doctor --help +omop-alchemy fulltext --help +omop-alchemy config --help ``` diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md index c2b67e0..03a8036 100644 --- a/docs/getting-started/quickstart.md +++ b/docs/getting-started/quickstart.md @@ -15,13 +15,16 @@ The goal is to provide a fast, reproducible environment for: When started with the appropriate profile, this stack runs: -- **PostgreSQL** (`cava-database`) - - Custom-built image (see `docker/postgres/Dockerfile`) +- **PostgreSQL** (`postgres`) + - Official `postgres:18` image with bulk-load-oriented runtime tuning in compose - Persistent storage via Docker volumes +- **Python workspace** (`python`) + - Local OMOP Alchemy source installed into a reusable container image + - PostgreSQL client tools included for direct `psql` / `pg_dump` access - **pgAdmin** (`pgadmin`) - - Web UI for inspecting and querying PostgreSQL + - Web UI for inspecting and querying PostgreSQL (optional) - **JupyterLab** (`cava-jupyter-notebook`, optional) - - Notebook environment wired to the same database + - Notebook environment built from the local repo and wired to the same database All services communicate on a dedicated Docker bridge network (`cava-network`). @@ -48,23 +51,27 @@ POSTGRES_DB=cava HOST=localhost HTTP_TYPE=http - -PYTHON_BIND_MOUNT=/absolute/path/to/your/code_or_data ``` These credentials are not secure and are intentionally simple for local use. ### Starting the stack -From the `docker` directory +From the `docker/` directory. 
+ +#### Database + Python workspace + +``` +docker compose up -d +``` -#### Database + pgAdmin only +#### Database + Python workspace + pgAdmin ``` -docker compose --profile default up -d +docker compose --profile pgadmin up -d ``` -#### Database + pgAdmin + Jupyter +#### Database + Python workspace + Jupyter ``` docker compose --profile jupyter up -d diff --git a/notebooks/00_select_test_fixtures.ipynb b/notebooks/00_select_test_fixtures.ipynb deleted file mode 100644 index 8385b37..0000000 --- a/notebooks/00_select_test_fixtures.ipynb +++ /dev/null @@ -1,293 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "7113aac3", - "metadata": {}, - "outputs": [], - "source": [ - "from omop_alchemy import get_engine_name, load_environment, TEST_PATH, ROOT_PATH\n", - "from orm_loader.helpers import get_logger\n", - "from dotenv import load_dotenv\n", - "from pathlib import Path\n", - "import os\n", - "import pandas as pd\n", - "# old enumerator classes from monolithic version of omop_alchemy - selection of cancer-relevant codes\n", - "import concept_enums\n", - "\n", - "base_path = TEST_PATH / \"fixtures\" / \"athena_source\"\n", - "load_dotenv()\n", - "source_path = Path(os.getenv('SOURCE_PATH', 'update/path/to/athena/source/as/required'))" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "d7b63035", - "metadata": {}, - "outputs": [], - "source": [ - "concept = pd.read_csv(source_path / 'CONCEPT.csv', delimiter='\\t', low_memory=False)\n", - "concept_class = pd.read_csv(source_path / 'CONCEPT_CLASS.csv', delimiter='\\t')\n", - "relationship = pd.read_csv(source_path / 'RELATIONSHIP.csv', delimiter='\\t')\n", - "domain = pd.read_csv(source_path / 'DOMAIN.csv', delimiter='\\t')\n", - "vocabulary = pd.read_csv(source_path / 'VOCABULARY.csv', delimiter='\\t')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bff8c220", - "metadata": {}, - "outputs": [], - "source": [ - "required_concepts = 
set(concept_class.concept_class_concept_id) | set(relationship.relationship_concept_id) | set(domain.domain_concept_id) | set(vocabulary.vocabulary_concept_id)\n", - "required_concepts_df = concept[concept.concept_id.isin(required_concepts)]\n", - "\n", - "selected = []\n", - "for d in set(domain.domain_id):\n", - " try:\n", - " c = concept[(concept.domain_id == d) & (concept.standard_concept == 'S')]\n", - " selected.append(c.sample(min(50, len(c)), random_state=1))\n", - " except ValueError:\n", - " print(f\"Not enough standard concepts in domain {d}\")\n", - " pass" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d4b273fa", - "metadata": {}, - "outputs": [], - "source": [ - "standard_concept_by_domain_df = pd.concat(selected)\n", - "\n", - "additional_test_concepts = set([x for y in \n", - " [concept_enums.__dict__[cls].member_values() \n", - " for cls in dir(concept_enums) \n", - " if hasattr(concept_enums.__dict__[cls], 'member_values')\n", - " ] \n", - " for x in y])\n", - "\n", - "additional_test_concept_df = concept[concept.concept_id.isin(additional_test_concepts)]\n", - "\n", - "metadata = concept[concept.domain_id == 'Metadata']\n", - "language = concept[concept.domain_id == 'Language']\n", - "locations = concept[(concept.concept_class_id=='Location') & (concept.standard_concept.notna())].sample(frac=0.1, replace=False)\n", - "\n", - "additional_cancer_ones = []\n", - "\n", - "for vocab, frac in {'Cancer Modifier': 1.0, 'HemOnc': 0.1, 'ICDO3': 0.05}.items():\n", - " additional_cancer_ones.append(concept[(concept.vocabulary_id == vocab) & concept.standard_concept.notna()].sample(frac=frac, replace=False))\n", - "\n", - "cancer_specific_df = pd.concat(additional_cancer_ones)\n", - "\n", - "selected_concept_df = pd.concat(\n", - " [\n", - " standard_concept_by_domain_df,\n", - " required_concepts_df,\n", - " additional_test_concept_df,\n", - " cancer_specific_df,\n", - " locations,\n", - " metadata,\n", - " language\n", - " ]\n", - 
").drop_duplicates()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d40f0ebd", - "metadata": {}, - "outputs": [], - "source": [ - "selected_relationships = []\n", - "\n", - "for concept_rel in pd.read_csv(source_path / 'CONCEPT_RELATIONSHIP.csv', delimiter='\\t', low_memory=False, chunksize=100000):\n", - " filtered = concept_rel[\n", - " (concept_rel.concept_id_1.isin(selected_concept_df.concept_id)) &\n", - " (concept_rel.concept_id_2.isin(selected_concept_df.concept_id))\n", - " ]\n", - " if not filtered.empty:\n", - " selected_relationships.append(filtered)\n", - "\n", - "selected_ancestry = []\n", - "\n", - "for concept_anc in pd.read_csv(source_path / 'CONCEPT_ANCESTOR.csv', delimiter='\\t', low_memory=False, chunksize=100000):\n", - " filtered = concept_anc[\n", - " (concept_anc.ancestor_concept_id.isin(selected_concept_df.concept_id)) &\n", - " (concept_anc.descendant_concept_id.isin(selected_concept_df.concept_id))\n", - " ]\n", - " if not filtered.empty:\n", - " selected_ancestry.append(filtered)\n", - "\n", - "selected_synonyms = []\n", - "\n", - "for concept_syn in pd.read_csv(source_path / 'CONCEPT_SYNONYM.csv', delimiter='\\t', low_memory=False, chunksize=100000):\n", - " filtered = concept_syn[\n", - " (concept_syn.concept_id.isin(selected_concept_df.concept_id))\n", - " ]\n", - " if not filtered.empty:\n", - " selected_synonyms.append(filtered)\n", - "\n", - "\n", - "selected_relationship_df = pd.concat(selected_relationships)\n", - "selected_ancestry_df = pd.concat(selected_ancestry)\n", - "selected_synonyms_df = pd.concat(selected_synonyms)\n", - "\n", - "\n", - "selected_relationship_df.to_csv(base_path / 'CONCEPT_RELATIONSHIP.csv', sep='\\t', index=False)\n", - "selected_synonyms_df.to_csv(base_path / 'CONCEPT_SYNONYM.csv', sep='\\t', index=False)\n", - "selected_ancestry_df.to_csv(base_path / 'CONCEPT_ANCESTOR.csv', sep='\\t', index=False)\n", - "selected_concept_df.to_csv(base_path / 'CONCEPT.csv', sep='\\t', 
index=False)\n", - "domain.to_csv(base_path / 'DOMAIN.csv', sep='\\t', index=False)\n", - "vocabulary.to_csv(base_path / 'VOCABULARY.csv', sep='\\t', index=False)\n", - "relationship.to_csv(base_path / 'RELATIONSHIP.csv', sep='\\t', index=False)\n", - "concept_class.to_csv(base_path / 'CONCEPT_CLASS.csv', sep='\\t', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9c4c1353", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "796f5be8", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9c5b8b3", - "metadata": {}, - "outputs": [], - "source": [ - "for f in [domain, vocabulary, relationship, concept_class, selected_relationship_df, selected_ancestry_df, selected_synonyms_df]:\n", - " for col in f.columns:\n", - " if 'concept_id' in col:\n", - " if len(f[~f[col].isin(selected_concept_df.concept_id)]) > 0:\n", - " raise ValueError(f\"Found concept_id in {col} not in selected concepts\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b465bc6c", - "metadata": {}, - "outputs": [], - "source": [ - "assert len(selected_relationship_df[~selected_relationship_df.relationship_id.isin(relationship.relationship_id.unique())]) == 0, \"Found relationship_id not in selected relationships\"\n", - "assert len(concept[~concept.concept_class_id.isin(concept_class.concept_class_id.unique())]) == 0, \"Found concept_class_id not in selected concepts\"\n", - "assert len(concept[~concept.domain_id.isin(domain.domain_id.unique())]) == 0, \"Found domain_id not in selected domains\"\n", - "assert len(concept[~concept.vocabulary_id.isin(vocabulary.vocabulary_id.unique())]) == 0, \"Found vocabulary_id not in selected vocabularies\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f65cc24f", - "metadata": {}, - "outputs": [], - "source": [ - "for f in [selected_concept_df, 
domain, vocabulary, relationship, concept_class, selected_relationship_df, selected_ancestry_df]:\n", - " assert(len(f[f.duplicated()]) == 0), f\"Found duplicated rows in {f}\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "97014890", - "metadata": {}, - "outputs": [], - "source": [ - "# this is the import issue...TODO: add pk null normalisation on load\n", - "vocabulary.loc[vocabulary.vocabulary_id.isna(), 'vocabulary_id'] = 'Unknown_Vocabulary'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "322e679f", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4ff54924", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8326f2a3", - "metadata": {}, - "outputs": [], - "source": [ - "metadata[metadata.concept_id==1147138]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dc803944", - "metadata": {}, - "outputs": [], - "source": [ - "len(selected_concept_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "acb592e2", - "metadata": {}, - "outputs": [], - "source": [ - "os.environ.get('SOURCE_PATH')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c6b7cfd3", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "omop-alchemy (3.13.3)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/01_validate_model.ipynb b/notebooks/01_validate_model.ipynb deleted file mode 100644 index b18e149..0000000 --- a/notebooks/01_validate_model.ipynb +++ /dev/null @@ 
-1,255 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "3175451e", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2026-01-22 15:26:50,588 | INFO | sql_loader.omop_alchemy.config | Environment variables loaded from .env file\n", - "2026-01-22 15:26:50,589 | INFO | sql_loader.omop_alchemy.config | Default database engine configured\n" - ] - } - ], - "source": [ - "from orm_loader.registry import ModelRegistry, ValidationRunner, always_on_validators\n", - "from orm_loader.helpers import configure_logging, bootstrap\n", - "from omop_alchemy.cdm.specification import TABLE_LEVEL_CSV, FIELD_LEVEL_CSV\n", - "from omop_alchemy import get_engine_name, load_environment, TEST_PATH, ROOT_PATH\n", - "import sqlalchemy as sa\n", - "from sqlalchemy.orm import sessionmaker\n", - "\n", - "configure_logging()\n", - "load_environment()\n", - "\n", - "engine_string = get_engine_name()\n", - "engine = sa.create_engine(engine_string, future=True, echo=False)\n", - "registry = ModelRegistry(model_name='CDM', model_version=\"5.4\")\n", - "\n", - "registry.load_table_specs(\n", - " table_csv=TABLE_LEVEL_CSV,\n", - " field_csv=FIELD_LEVEL_CSV,\n", - ")\n", - "\n", - "registry.discover_models(\"omop_alchemy.cdm.model\")\n", - "bootstrap(engine, create=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "9875dc2f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['concept_synonym',\n", - " 'observation_period',\n", - " 'observation',\n", - " 'payer_plan_period',\n", - " 'dose_era']" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "list(registry.known_tables())[:5]" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "4e144e8a", - "metadata": {}, - "outputs": [], - "source": [ - "validators = always_on_validators()\n", - "runner = ValidationRunner(\n", - " validators=validators,\n", - " 
fail_fast=False,\n", - ")\n", - "report = runner.run(registry)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "75a09c70", - "metadata": {}, - "outputs": [], - "source": [ - "# report = registry.validate(engine=engine, check_domain_semantics=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "9cfa9046", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MODEL v5.4: 0 error(s), 28 warning(s), 8 info\n" - ] - } - ], - "source": [ - "print(report.summary())" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "a8fea713", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "📦 cdm_source\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: cdm_source_name) Hint: ORM primary key not marked as primary key in specification\n", - "\n", - "📦 cohort\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: cohort_definition_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: subject_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ COMPOSITE_PRIMARY_KEY Hint: Composite primary key detected\n", - "\n", - "📦 cohort_definition\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: cohort_definition_id) Hint: ORM primary key not marked as primary key in specification\n", - "\n", - "📦 concept_ancestor\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: ancestor_concept_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: descendant_concept_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ COMPOSITE_PRIMARY_KEY Hint: Composite primary key detected\n", - "\n", - "📦 concept_relationship\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: concept_id_1) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ 
PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: concept_id_2) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: relationship_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ COMPOSITE_PRIMARY_KEY Hint: Composite primary key detected\n", - "\n", - "📦 concept_synonym\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: concept_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: concept_synonym_name) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: language_concept_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ COMPOSITE_PRIMARY_KEY Hint: Composite primary key detected\n", - "\n", - "📦 death\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: person_id) Hint: ORM primary key not marked as primary key in specification\n", - "\n", - "📦 drug_strength\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: drug_concept_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: ingredient_concept_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ COMPOSITE_PRIMARY_KEY Hint: Composite primary key detected\n", - "\n", - "📦 episode\n", - " ⚠️ FOREIGN_KEY_NOT_IN_SPEC (field: episode_parent_id) Hint: ORM defines FK but specification does not\n", - "\n", - "📦 episode_event\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: episode_event_field_concept_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: episode_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: event_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ COMPOSITE_PRIMARY_KEY Hint: Composite primary key 
detected\n", - "\n", - "📦 fact_relationship\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: domain_concept_id_1) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: domain_concept_id_2) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: fact_id_1) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: fact_id_2) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: relationship_concept_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ COMPOSITE_PRIMARY_KEY Hint: Composite primary key detected\n", - "\n", - "📦 relationship\n", - " ⚠️ FOREIGN_KEY_NOT_IN_SPEC (field: reverse_relationship_id) Hint: ORM defines FK but specification does not\n", - "\n", - "📦 source_to_concept_map\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: source_code) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: source_concept_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: source_vocabulary_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ COMPOSITE_PRIMARY_KEY Hint: Composite primary key detected\n" - ] - } - ], - "source": [ - "if not report.is_valid():\n", - " print(report.render_text_report())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6086ccff", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c827c762", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3551f2f9", - "metadata": {}, - "outputs": [], - "source": [ - "for table, spec in registry._table_specs.items():\n", - 
" print(f\"{table}: {spec.is_required}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9585d76b", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2be13a79", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "omop-alchemy (3.13.3)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/02_test_load.ipynb b/notebooks/02_test_load.ipynb deleted file mode 100644 index dadd78f..0000000 --- a/notebooks/02_test_load.ipynb +++ /dev/null @@ -1,476 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "67fe4629", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2026-01-23 17:36:30,283 | INFO | sql_loader.omop_alchemy.config | Environment variables loaded from .env file\n", - "2026-01-23 17:36:30,283 | INFO | sql_loader.omop_alchemy.config | Default database engine configured\n" - ] - } - ], - "source": [ - "import sqlalchemy as sa\n", - "import pandas as pd\n", - "\n", - "from orm_loader.helpers import configure_logging, bootstrap, explain_sqlite_fk_error, bulk_load_context, configure_logging\n", - "from sqlalchemy.orm import sessionmaker\n", - "from sqlalchemy.exc import IntegrityError\n", - "\n", - "from random import randint, choice\n", - "import numpy as np\n", - "from orm_loader.loaders.loader_interface import ParquetLoader, PandasLoader\n", - "\n", - "from sqlalchemy.orm import Session\n", - "from omop_alchemy.cdm.model.health_system import Location, Care_Site, Provider, Visit_Detail, Visit_Occurrence\n", - "from 
omop_alchemy.cdm.model.clinical import Person, Condition_Occurrence, Procedure_Occurrence, Death, Specimen, Drug_Exposure, Measurement, Observation\n", - "from omop_alchemy.cdm.model.structural import Episode, Episode_Event\n", - "from omop_alchemy.cdm.model.derived import Observation_Period\n", - "from datetime import date, timedelta\n", - "from omop_alchemy import get_engine_name, load_environment, TEST_PATH, ROOT_PATH\n", - "\n", - "from omop_alchemy.cdm.model.vocabulary import (\n", - " Domain,\n", - " Vocabulary,\n", - " Concept_Class,\n", - " Relationship,\n", - " Concept,\n", - " Concept_Ancestor,\n", - " Concept_Relationship,\n", - " Concept_Synonym,\n", - " Concept_Synonym,\n", - ")\n", - "\n", - "ATHENA_INITIAL_LOAD = [\n", - " Domain,\n", - " Vocabulary,\n", - " Concept_Class,\n", - " Relationship,\n", - " Concept\n", - "]\n", - "\n", - "ATHENA_SUBSEQUENT_LOAD = [\n", - " Concept_Ancestor,\n", - " Concept_Relationship,\n", - " Concept_Synonym\n", - "]\n", - "\n", - "configure_logging()\n", - "load_environment()\n", - "\n", - "engine_string = get_engine_name()\n", - "engine = sa.create_engine(engine_string, future=True, echo=False)\n", - "bootstrap(engine, create=True)\n", - "Session = sessionmaker(bind=engine, future=True)\n", - "session = Session()\n", - "p = PandasLoader()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "433ced72", - "metadata": {}, - "outputs": [], - "source": [ - "base_path = TEST_PATH / \"fixtures\" / \"athena_source\"\n", - "\n", - "# uncomment this line if you want to load the full athena source from env var\n", - "# instead of the minimal test fixture set for rapid access\n", - "\n", - "# base_path = Path(os.environ['SOURCE_PATH'])" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "82601899", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2026-01-23 17:36:33,728 | INFO | sql_loader.orm_loader.helpers.bulk | Disabled foreign key checks for 
bulk load\n", - "Staging table _staging_vocabulary does not exist; recreating\n", - "Staging table _staging_concept_class does not exist; recreating\n", - "Staging table _staging_relationship does not exist; recreating\n", - "Staging table _staging_concept does not exist; recreating\n", - "Found 1 rows with unexpected nulls in concept.vocabulary_id\n", - "2026-01-23 17:36:34,375 | INFO | sql_loader.orm_loader.helpers.bulk | Re-enabled foreign key checks after bulk load\n" - ] - } - ], - "source": [ - "# Initial load of core vocabulary tables - use bulk load to ensure mutual FK constraints are handled (trusted sources only)\n", - "\n", - "with bulk_load_context(session):\n", - " for model in ATHENA_INITIAL_LOAD:\n", - " _ = model.load_csv(\n", - " session,\n", - " base_path / f\"{model.__tablename__.upper()}.csv\",\n", - " dedupe=True,\n", - " merge_strategy=\"upsert\",\n", - " loader=p,\n", - " )\n", - " session.commit()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "dcf65010", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2026-01-23 17:36:37,680 | INFO | sql_loader.orm_loader.helpers.bulk | Disabled foreign key checks for bulk load\n", - "Staging table _staging_concept_ancestor does not exist; recreating\n", - "Staging table _staging_concept_relationship does not exist; recreating\n", - "Staging table _staging_concept_synonym does not exist; recreating\n", - "2026-01-23 17:36:39,350 | INFO | sql_loader.orm_loader.helpers.bulk | Re-enabled foreign key checks after bulk load\n" - ] - } - ], - "source": [ - "# can still turn off FK checks for speed but mutual dependency is not an issue for this one \n", - "# has been updated to use merge strategy to handle duplicates\n", - "\n", - "with bulk_load_context(session):\n", - " for model in ATHENA_SUBSEQUENT_LOAD:\n", - " _ = model.load_csv(\n", - " session,\n", - " base_path / f\"{model.__tablename__.upper()}.csv\",\n", - " dedupe=True,\n", - " 
chunksize=5000,\n", - " merge_strategy=\"upsert\",\n", - " )\n", - " session.commit()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eac7991f", - "metadata": {}, - "outputs": [], - "source": [ - "concept_by_domain = pd.DataFrame(\n", - " session.query(\n", - " *Concept.__table__.columns\n", - " )\n", - " .filter(\n", - " sa.or_(\n", - " Concept.domain_id.in_(['Gender', 'Ethnicity', 'Race', 'Visit', 'Geography', 'Provider', 'Type Concept']),\n", - " sa.and_(\n", - " Concept.domain_id == 'Condition',\n", - " Concept.vocabulary_id == 'ICDO3'\n", - " )\n", - " )\n", - " )\n", - ")\n", - "\n", - "avail_gender = list(concept_by_domain[concept_by_domain.domain_id=='Gender'].concept_id)\n", - "avail_ethnicity = list(concept_by_domain[concept_by_domain.domain_id=='Ethnicity'].concept_id)\n", - "avail_race = list(concept_by_domain[concept_by_domain.domain_id=='Race'].concept_id)\n", - "avail_place_of_service = list(concept_by_domain[concept_by_domain.domain_id=='Visit'].concept_id)\n", - "avail_country = list(concept_by_domain[concept_by_domain.concept_class_id=='Location'].concept_id)\n", - "avail_provider = list(concept_by_domain[concept_by_domain.domain_id=='Provider'].concept_id)\n", - "avail_types = list(concept_by_domain[concept_by_domain.domain_id=='Type Concept'].concept_id)\n", - "\n", - "cancers = list(concept_by_domain[(concept_by_domain.domain_id=='Condition')&(concept_by_domain.vocabulary_id=='ICDO3') & (concept_by_domain.concept_code.str.contains('/3'))].concept_id)\n", - "\n", - "staging_parents = pd.DataFrame(\n", - " session.query(\n", - " *Concept.__table__.columns\n", - " )\n", - " .join(Concept_Ancestor, Concept.concept_id==Concept_Ancestor.descendant_concept_id)\n", - " .filter(Concept_Ancestor.ancestor_concept_id==734320)\n", - " .filter(Concept_Ancestor.max_levels_of_separation==1)\n", - ")\n", - "\n", - "staging_sets = {}\n", - "\n", - "for axis in ['T', 'N', 'M', 'Stage']:\n", - " parents = 
list(staging_parents[staging_parents.concept_name.str.contains(axis)].concept_id)\n", - " s = pd.DataFrame(\n", - " session.query(\n", - " *Concept.__table__.columns\n", - " )\n", - " .join(Concept_Ancestor, Concept.concept_id==Concept_Ancestor.descendant_concept_id)\n", - " .filter(Concept_Ancestor.ancestor_concept_id.in_(parents))\n", - " .filter(Concept.concept_code.ilike('%8th%'))\n", - " .filter(~Concept.concept_code.ilike('%yp%'))\n", - " )\n", - " staging_sets[axis] = s" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "41e86f41", - "metadata": {}, - "outputs": [], - "source": [ - "# confirming string hack to identify staging axes does work as expected\n", - "# staging_sets['Stage'].concept_code.map(lambda x: x.split('-')[-1]).value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "dc70fc6b", - "metadata": {}, - "outputs": [], - "source": [ - "# these are super-naive and brute-force ways to populate very basic test data - good enough for now - better content coming\n", - "\n", - "def populate_reference_data(session):\n", - " \n", - " loc_ids = Location.allocator(session)\n", - " cs_ids = Care_Site.allocator(session)\n", - " pro_ids = Provider.allocator(session)\n", - " \n", - " location_data = [{'location_id': loc_ids.next(), 'country_concept_id': choice(avail_country), 'city': f'City {idx}'} for idx in range(10)]\n", - " locations = [Location(**row) for row in location_data]\n", - " care_site_data = [{'care_site_id': cs_ids.next(), 'care_site_name': f'Care Site {idx}', 'location_id': choice(locations).location_id, 'place_of_service_concept_id': choice(avail_place_of_service)} for idx in range(30)]\n", - " care_sites = [Care_Site(**row) for row in care_site_data]\n", - " provider_data = [{'provider_id': pro_ids.next(), 'specialty_concept_id': choice(avail_provider), 'gender_concept_id': choice(avail_gender), 'care_site_id': choice(care_sites).care_site_id} for _ in range(50)]\n", - " providers = 
[Provider(**row) for row in provider_data]\n", - "\n", - " session.add_all(locations)\n", - " session.add_all(care_sites)\n", - " session.add_all(providers)\n", - " session.commit()\n", - "\n", - " return locations, care_sites, providers\n", - "\n", - "def populate_people_and_visits(session, care_sites):\n", - " \n", - " person_ids = Person.allocator(session)\n", - " visit_ids = Visit_Occurrence.allocator(session)\n", - " \n", - " person_data = [{'person_id': person_ids.next(), 'year_of_birth': randint(1950, 2020), 'month_of_birth': randint(1, 12), 'gender_concept_id':choice(avail_gender), 'race_concept_id':choice(avail_race), 'ethnicity_concept_id':choice(avail_ethnicity)} for idx in range(1000)]\n", - " people = [Person(**row) for row in person_data]\n", - "\n", - " visits = []\n", - " for person in people:\n", - " cs = choice(care_sites)\n", - " visit_num = randint(1, 3)\n", - " for v in range(visit_num):\n", - " days_delay = randint(0, 365)\n", - " visit_date = date(2020, 1, 1) + timedelta(days_delay)\n", - " visit = Visit_Occurrence(\n", - " visit_occurrence_id=visit_ids.next(),\n", - " person_id=person.person_id,\n", - " care_site_id=cs.care_site_id,\n", - " visit_concept_id=choice(avail_place_of_service),\n", - " visit_start_date=visit_date,\n", - " visit_end_date=visit_date,\n", - " )\n", - " visits.append(visit)\n", - " session.add_all(people)\n", - " session.add_all(visits)\n", - " session.commit()\n", - " return people, visits\n", - "\n", - "def populate_observation_periods(session):\n", - " op_ids = Observation_Period.allocator(session)\n", - " deaths = []\n", - " rows = (\n", - " session.query(\n", - " Visit_Occurrence.person_id,\n", - " sa.func.min(Visit_Occurrence.visit_start_date).label(\"start\"),\n", - " sa.func.max(Visit_Occurrence.visit_end_date).label(\"end\"),\n", - " Death.death_date,\n", - " Observation_Period.observation_period_id\n", - " )\n", - " .join(Death, Death.person_id==Visit_Occurrence.person_id, isouter=True)\n", - " 
.join(Observation_Period, Observation_Period.person_id==Visit_Occurrence.person_id, isouter=True)\n", - " .filter(Observation_Period.observation_period_id==None)\n", - " .group_by(Visit_Occurrence.person_id)\n", - " .all()\n", - " )\n", - " obs = []\n", - " for idx, r in enumerate(rows):\n", - " deceased = np.random.choice([True, False], p=[0.05, 0.95])\n", - " if deceased:\n", - " death_date = r.end + timedelta(days=randint(1, 365))\n", - " deaths.append(\n", - " Death(\n", - " person_id=r.person_id,\n", - " death_date=death_date,\n", - " death_type_concept_id=choice(avail_types),\n", - " )\n", - " )\n", - " obs_end = death_date\n", - " else:\n", - " obs_end = r.end\n", - " obs.append(\n", - " Observation_Period(\n", - " observation_period_id=op_ids.next(),\n", - " person_id=r.person_id,\n", - " observation_period_start_date=r.start,\n", - " observation_period_end_date=obs_end,\n", - " period_type_concept_id=choice(avail_types),\n", - " )\n", - " )\n", - " session.add_all(deaths)\n", - " session.add_all(obs)\n", - " session.commit()\n", - " return obs\n", - "\n", - "def populate_conditions_and_modifiers(session):\n", - " cond_ids = Condition_Occurrence.allocator(session)\n", - " meas_ids = Measurement.allocator(session)\n", - " ep_ids = Episode.allocator(session)\n", - " rows = (\n", - " session.query(\n", - " Observation_Period, Death, Condition_Occurrence\n", - " )\n", - " .join(Death, Observation_Period.person_id==Death.person_id, isouter=True)\n", - " .join(Condition_Occurrence, Observation_Period.person_id==Condition_Occurrence.person_id, isouter=True)\n", - " .all()\n", - " )\n", - " conditions = []\n", - " measurements = []\n", - " episodes = []\n", - " episode_events = []\n", - " for obs, death, condition in rows:\n", - " if condition:\n", - " continue\n", - " t = choice(list(staging_sets['T'].concept_id))\n", - " n = choice(list(staging_sets['N'].concept_id))\n", - " m = choice(list(staging_sets['M'].concept_id))\n", - " # don't worry abt overall stage 
for now as it should be calculated\n", - " condition_concept = choice(cancers)\n", - " condition = Condition_Occurrence(\n", - " condition_occurrence_id=cond_ids.next(),\n", - " condition_concept_id = condition_concept,\n", - " condition_start_date = obs.observation_period_start_date,\n", - " condition_type_concept_id = choice(avail_types),\n", - " person_id = obs.person_id,\n", - " condition_status_concept_id = 32902\n", - " )\n", - " conditions.append(condition)\n", - " episode = Episode(\n", - " episode_id=ep_ids.next(),\n", - " person_id=obs.person_id,\n", - " episode_concept_id=32533, # Episode of care\n", - " episode_object_concept_id=condition.condition_concept_id,\n", - " episode_start_date=condition.condition_start_date,\n", - " episode_end_date=(\n", - " death.death_date if death else obs.observation_period_end_date\n", - " ),\n", - " episode_type_concept_id=choice(avail_types), # EHR / registry / derived\n", - " )\n", - " episodes.append(episode)\n", - "\n", - " for stage in [t, n, m]:\n", - " measurement = Measurement(\n", - " person_id = obs.person_id,\n", - " measurement_id = meas_ids.next(),\n", - " measurement_concept_id = stage,\n", - " measurement_event_id = condition.condition_occurrence_id,\n", - " meas_event_field_concept_id = 1147127, # condition_occurrence.condition_occurrence_id\n", - " measurement_date = condition.condition_start_date,\n", - " measurement_type_concept_id = choice(avail_types),\n", - " value_as_number = 1\n", - " )\n", - " measurements.append(measurement)\n", - " episode_events.append(\n", - " Episode_Event(\n", - " episode_id=episode.episode_id,\n", - " event_id=measurement.measurement_id,\n", - " episode_event_field_concept_id=1147138, # measurement.measurement_id\n", - " )\n", - " )\n", - " episode_events.append(\n", - " Episode_Event(\n", - " episode_id=episode.episode_id,\n", - " event_id=condition.condition_occurrence_id,\n", - " episode_event_field_concept_id=1147127, # condition_occurrence.condition_occurrence_id\n", 
- " )\n", - " )\n", - " session.add_all(conditions)\n", - " session.add_all(measurements)\n", - " session.add_all(episodes)\n", - " session.add_all(episode_events)\n", - " session.commit()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b7ccb46a", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "97d76a3f", - "metadata": {}, - "outputs": [], - "source": [ - "with Session() as sess:\n", - " populate_reference_data(sess)\n", - " sess.commit()\n", - " care_sites = sess.query(Care_Site).all()\n", - "\n", - "with Session() as sess:\n", - " populate_people_and_visits(sess, care_sites)\n", - " populate_observation_periods(sess)\n", - "\n", - "with Session() as sess:\n", - " populate_conditions_and_modifiers(sess)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e57318e0", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a241ac28", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "omop-alchemy (3.13.3)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/03_basic_model_query_demo.ipynb b/notebooks/03_basic_model_query_demo.ipynb deleted file mode 100644 index caec0f8..0000000 --- a/notebooks/03_basic_model_query_demo.ipynb +++ /dev/null @@ -1,1205 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "766a9e4a", - "metadata": {}, - "source": [ - "This notebook is a simple demo to introduce some of the fundamental design patterns from the OMOP_Alchemy library " - ] - }, - { - "cell_type": "code", - 
"execution_count": 1, - "id": "634ae11f", - "metadata": {}, - "outputs": [], - "source": [ - "import sqlalchemy as sa\n", - "from sqlalchemy.orm import sessionmaker\n", - "from omop_alchemy.cdm.model.vocabulary import Concept, ConceptView, Domain, Vocabulary, Concept_Class\n", - "from orm_loader.helpers import configure_logging, bootstrap, bulk_load_context\n", - "from omop_alchemy import get_engine_name, load_environment, TEST_PATH, ROOT_PATH\n", - "from omop_alchemy.cdm.model.clinical import Condition_Occurrence, Condition_OccurrenceView\n", - "from omop_alchemy.cdm.model.structural import EpisodeView, Episode_EventView" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "5c3184bb", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2026-01-22 15:27:38,567 | INFO | sql_loader.omop_alchemy.config | Environment variables loaded from .env file\n", - "2026-01-22 15:27:38,568 | INFO | sql_loader.omop_alchemy.config | Default database engine configured\n" - ] - } - ], - "source": [ - "# this demo assumes that you have created a .env file in the ROOT_PATH with your database connection string - see .example_dotenv for details\n", - "\n", - "configure_logging()\n", - "load_environment()\n", - "engine_string = get_engine_name()\n", - "\n", - "engine = sa.create_engine(engine_string, future=True, echo=False)\n", - "bootstrap(engine, create=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "fe73295d", - "metadata": {}, - "outputs": [], - "source": [ - "Session = sessionmaker(bind=engine, future=True)\n", - "session = Session()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "8943cd87", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "c = session.query(Concept).first()\n", - "c" - ] - }, - { - "cell_type": "code", - "execution_count": 
5, - "id": "7e2c50e9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'concept_id': 1,\n", - " 'concept_name': 'Domain',\n", - " 'domain_id': 'Metadata',\n", - " 'vocabulary_id': 'Domain',\n", - " 'concept_class_id': 'Domain',\n", - " 'concept_code': 'OMOP generated',\n", - " 'valid_start_date': datetime.date(1970, 1, 1),\n", - " 'valid_end_date': datetime.date(2099, 12, 31)}" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "c.to_dict()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "e0939c75", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'{\"concept_class_id\": \"Domain\", \"concept_code\": \"OMOP generated\", \"concept_id\": 1, \"concept_name\": \"Domain\", \"domain_id\": \"Metadata\", \"valid_end_date\": \"2099-12-31\", \"valid_start_date\": \"1970-01-01\", \"vocabulary_id\": \"Domain\"}'" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "c.to_json()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "dcc041a4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[(22274, 'Neoplasm of uncertain behavior of larynx', 'S'),\n", - " (22281, 'Sickle cell-hemoglobin SS disease', 'S'),\n", - " (22288, 'Hereditary elliptocytosis', 'S'),\n", - " (22340, 'Esophageal varices without bleeding', 'S'),\n", - " (22350, 'Edema of larynx', 'S')]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "standard_conditions = (\n", - " session.query(Concept)\n", - " .filter(\n", - " Concept.domain_id == \"Condition\",\n", - " Concept.standard_concept == \"S\",\n", - " )\n", - " .limit(5)\n", - " .all()\n", - ")\n", - "\n", - "[(c.concept_id, c.concept_name, c.standard_concept) for c in standard_conditions]\n" - ] - }, - { - "cell_type": "markdown", - "id": "b524d61d", - "metadata": {}, - "source": [ - 
"`Concept` is the basic class that you should be using for most ETL steps, but for introspection of relationships (including the triggering of lazy loads), `ConceptView` offers much richer expressions.\n", - "\n", - "This is separated to ensure speed of base class is maintained, while optimising the potential benefits of fully-described object relationships" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "4ae51dea", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cv = session.query(ConceptView).first()\n", - "cv" - ] - }, - { - "cell_type": "markdown", - "id": "3df3e3fb", - "metadata": {}, - "source": [ - "`domain_id` is the actual string content of the column that was returned from the query already performed, where `cv.domain` returns a related Domain object" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "3211247e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('Metadata',\n", - " str,\n", - " ,\n", - " omop_alchemy.cdm.model.vocabulary.domain.Domain,\n", - " ,\n", - " omop_alchemy.cdm.model.vocabulary.vocabulary.Vocabulary)" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cv.domain_id, type(cv.domain_id), cv.domain, type(cv.domain), cv.vocabulary, type(cv.vocabulary)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "b51388fe", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Hospital admission'" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# because concept ancestor and concept relationship are very large tables, ConceptView relationships have \n", - "# been set to lazy='select', these relationships will not load until accessed\n", - "\n", - "concepts = (\n", - " session.query(ConceptView)\n", 
- " .filter(ConceptView.vocabulary_id == 'SNOMED')\n", - " .filter(ConceptView.standard_concept == 'S')\n", - " .limit(30)\n", - ")\n", - "\n", - "concepts[0].concept_name" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "5a36bca3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "8715 Hospital admission 5 219 361 361\n", - "9173 Inactive 5 1 7 7\n" - ] - } - ], - "source": [ - "# get details about concept dynamically - ancestors, descendants, relationships\n", - "\n", - "# because of the deferred loading strategy, these relationships will now be querying \n", - "# those tables once for every print statement in the below loop - very efficient for\n", - "# single concepts, not for sets of concepts\n", - "\n", - "for concept in concepts[:2]:\n", - " print(\n", - " concept.concept_id,\n", - " concept.concept_name,\n", - " len(concept.ancestors),\n", - " len(concept.descendants),\n", - " len(concept.incoming_relationships),\n", - " len(concept.outgoing_relationships),\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "5a6d3413", - "metadata": {}, - "outputs": [], - "source": [ - "# when known in advance that these relationships will be needed, use joined loading to\n", - "# load them in the original query and only hit the big table once\n", - "\n", - "from sqlalchemy.orm import selectinload\n", - "\n", - "def concept_hierarchy_bundle():\n", - " return (\n", - " selectinload(ConceptView.ancestors),\n", - " selectinload(ConceptView.descendants),\n", - " )\n", - "\n", - "def concept_relationship_bundle():\n", - " return (\n", - " selectinload(ConceptView.incoming_relationships),\n", - " selectinload(ConceptView.outgoing_relationships),\n", - " )\n", - "\n", - "concepts = (\n", - " session.query(ConceptView)\n", - " .filter(ConceptView.vocabulary_id == 'SNOMED')\n", - " .filter(ConceptView.standard_concept == 'S')\n", - " .options(\n", - " *concept_hierarchy_bundle(),\n", - 
" *concept_relationship_bundle()\n", - " )\n", - " .limit(30)\n", - " .all()\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "55633a75", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "8715 Hospital admission 5 219 361 361\n", - "9173 Inactive 5 1 7 7\n", - "9174 Obsolete 5 1 7 7\n", - "9176 Patient status determination, deceased 4 7 12 12\n", - "9177 Other 5 1 9 9\n", - "9181 Active 5 1 7 7\n", - "9189 Negative 4 1 184 184\n", - "9190 Not detected 4 3 213 213\n", - "9191 Positive 7 6 231 231\n", - "9192 Trace 6 1 20 20\n", - "22274 Neoplasm of uncertain behavior of larynx 36 45 49 49\n", - "22281 Sickle cell-hemoglobin SS disease 35 12 74 74\n", - "22288 Hereditary elliptocytosis 44 10 49 49\n", - "22340 Esophageal varices without bleeding 29 1 30 30\n", - "22350 Edema of larynx 16 9 39 39\n", - "22426 Congenital macrostomia 30 5 35 35\n", - "22492 Foreign body in pharynx 26 13 60 60\n", - "22557 Malignant tumor of submandibular gland 49 182 18 18\n", - "22665 Chronic peptic ulcer with hemorrhage AND with perforation but without obstruction 33 1 17 17\n", - "22666 Vomiting after gastrointestinal tract surgery 18 3 21 21\n", - "22722 Accessory salivary gland 33 2 17 17\n", - "22820 Tuberculosis of esophagus 36 1 26 26\n", - "22839 Overlapping malignant neoplasm of larynx 38 1 23 23\n", - "22856 Polyglandular dysfunction 6 21 65 65\n", - "22871 Neoplasm of uncertain behavior of pineal gland 44 11 36 36\n", - "22945 Horizontal overbite 22 1 20 20\n", - "22955 Perforation of esophagus 22 3 28 28\n", - "23034 Neonatal hypoglycemia 14 7 35 35\n", - "23137 Chlamydial pharyngitis 44 1 28 28\n", - "23164 Disorder of anterior pituitary 13 149 57 57\n" - ] - } - ], - "source": [ - "for concept in concepts:\n", - " print(\n", - " concept.concept_id,\n", - " concept.concept_name,\n", - " len(concept.ancestors),\n", - " len(concept.descendants),\n", - " len(concept.incoming_relationships),\n", - 
" len(concept.outgoing_relationships),\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "a53f0b85", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(36402497, 'Round cell liposarcoma of unknown primary site')" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "row = (\n", - " session.query(Condition_Occurrence, Concept)\n", - " .join(Concept, Condition_Occurrence.condition_concept_id == Concept.concept_id)\n", - " .first()\n", - ")\n", - "\n", - "row[0].condition_concept_id, row[1].concept_name" - ] - }, - { - "cell_type": "markdown", - "id": "2954093f", - "metadata": {}, - "source": [ - "we don't want to be needing to define joins every time, but equally we don't want to force the loading of relationships that are not required for simple queries.\n", - "this is why they are separated out into View classes, but they can be very useful for exploration, as well as for serialisation to downstream apis" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "19cad800", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(36402497, 'Round cell liposarcoma of unknown primary site')" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "row = (\n", - " session.query(Condition_OccurrenceView)\n", - " .first()\n", - ")\n", - "\n", - "row.condition_concept_id, row.condition_concept.concept_name" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "9370cbc3", - "metadata": {}, - "outputs": [], - "source": [ - "from omop_alchemy.cdm.model.clinical import Person, PersonView\n", - "from omop_alchemy.cdm.model.health_system import Location, Provider, Care_Site" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "3b1f85f4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 17, - "metadata": {}, 
- "output_type": "execute_result" - } - ], - "source": [ - "p = session.query(Person).first()\n", - "p" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "c44f77ac", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'_sa_instance_state': ,\n", - " 'ethnicity_concept_id': 38003564,\n", - " 'gender_source_value': None,\n", - " 'year_of_birth': 1976,\n", - " 'gender_source_concept_id': None,\n", - " 'race_source_value': None,\n", - " 'person_id': 1,\n", - " 'race_source_concept_id': None,\n", - " 'ethnicity_source_value': None,\n", - " 'month_of_birth': 12,\n", - " 'ethnicity_source_concept_id': None,\n", - " 'visit_occurrence_id': None,\n", - " 'day_of_birth': None,\n", - " 'location_id': None,\n", - " 'visit_detail_id': None,\n", - " 'birth_datetime': None,\n", - " 'provider_id': None,\n", - " 'gender_concept_id': 45518388,\n", - " 'care_site_id': None,\n", - " 'race_concept_id': 45456238,\n", - " 'person_source_value': None}" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# simple person class that just has the raw column data - flat, predictable, and cheap to load - no joins and no lazy relationships\n", - "p.__dict__" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "e9910b9c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# subtle in this example, but personview has actually loaded the gender concept relationship to print the label instead of the raw concept_id\n", - "pv = session.query(PersonView).first()\n", - "pv" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "b0fd6101", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('Gender unknown', 'Ethnic category - 2001 census', 'Not Hispanic or Latino')" - ] - }, - "execution_count": 20, - "metadata": {}, - 
"output_type": "execute_result" - } - ], - "source": [ - "pv.gender.concept_name, pv.race.concept_name, pv.ethnicity.concept_name" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "9d8e2932", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'gender_concept_id': ,\n", - " 'race_concept_id': ,\n", - " 'ethnicity_concept_id': }" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "PersonView.__expected_domains__" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "4f33223a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "p = session.query(PersonView).first()\n", - "p" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "9c059b4b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "p.domain_violations" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "8580aa91", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "wrong_concept = (\n", - " session.query(Concept)\n", - " .filter(Concept.domain_id == \"Condition\")\n", - " .first()\n", - ")\n", - "wrong_concept" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "930f8d2e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[DomainRule(table='person', field='gender_concept_id', allowed_domains={'Gender'}, allowed_classes=None),\n", - " DomainRule(table='person', field='race_concept_id', allowed_domains={'Race'}, allowed_classes=None),\n", - " DomainRule(table='person', field='ethnicity_concept_id', allowed_domains={'Ethnicity'}, 
allowed_classes=None)]" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "PersonView.collect_domain_rules()" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "192eb5ba", - "metadata": {}, - "outputs": [], - "source": [ - "p.gender_concept_id = wrong_concept.concept_id" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "2ee06bb4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "p.is_domain_valid" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "feb164dd", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[\"gender_concept_id not in domain(s): ['Gender']\"]" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# we can do application-side validation of domain rules \n", - "# tbc if this can be made more efficient at scale to truly support ETL \n", - "# so that we can move it to the base class?\n", - "p.domain_violations" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "a5a313da", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "50" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# age as a hybrid property\n", - "from datetime import date\n", - "pv.age" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "85046519", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "44" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pv.age_at(date(2020, 1, 1))" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "efbe1fc7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[,\n", - " ,\n", - " ,\n", - " ,\n", 
- " ]" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# because we are using a hybrid property, we can filter on it in queries - same logic but two execution modes\n", - "(\n", - " session.query(PersonView)\n", - " .filter(PersonView.age_at(date(2020, 1, 1)) >= 65)\n", - " .limit(5)\n", - " .all()\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "b7de12c1", - "metadata": {}, - "outputs": [], - "source": [ - "# if using the base Person class, we would need to do the age calculation in the query itself\n", - "from sqlalchemy import func\n", - "on = date(2020, 1, 1)\n", - "q = (\n", - " session.query(Person)\n", - " .filter((sa.func.extract(\"year\", sa.literal(on)) - Person.year_of_birth) >= 65)\n", - " .limit(5)\n", - " .all()\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "bc2374f3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[, , , , ]" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# this is a trivial example in this case but in the instance of joined elements it can make a big difference in expressiveness / formalism of complex definitions\n", - "q" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "54c9ec02", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[,\n", - " ,\n", - " ,\n", - " ,\n", - " ]" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "session.query(PersonView).filter(PersonView.under_observation_on(date(2020, 6, 1))).all()[:5]" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "a0b86693", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ]" - ] - }, - "execution_count": 35, - "metadata": {}, - 
"output_type": "execute_result" - } - ], - "source": [ - "cohort = (\n", - " session.query(PersonView)\n", - " .filter(\n", - " PersonView.age_at(date(2020, 1, 1)) >= 18,\n", - " PersonView.is_deceased == True,\n", - " )\n", - " .limit(10)\n", - " .all()\n", - ")\n", - "\n", - "cohort" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "4f77674c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'person_id': 1,\n", - " 'year_of_birth': 1976,\n", - " 'month_of_birth': 12,\n", - " 'gender_concept_id': 8689,\n", - " 'race_concept_id': 45456238,\n", - " 'ethnicity_concept_id': 38003564}" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cohort[0].to_dict()" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "69fff20b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cohort[0].death" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "00c0f530", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pv.observation_periods" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "61cbed1a", - "metadata": {}, - "outputs": [], - "source": [ - "q = (\n", - " session.query(PersonView)\n", - " .filter(PersonView.first_observation_date >= date(2020, 10, 1))\n", - " .filter(PersonView.last_observation_date <= date(2021, 10, 31))\n", - ").all()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "07d6911c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "96" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(q)" - ] - }, - { - "cell_type": "code", - 
"execution_count": 41, - "id": "50ada151", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ep = session.query(EpisodeView).first()\n", - "ep" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "46f0b554", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('Disease Episode', 'Round cell liposarcoma of unknown primary site')" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ep.episode_concept.concept_name, ep.episode_object_concept.concept_name" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "34dfe21a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[,\n", - " ,\n", - " ,\n", - " ]" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ep.events" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "ad088151", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[,\n", - " ,\n", - " ,\n", - " ]" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "events = (\n", - " session.query(Episode_EventView)\n", - " .filter(Episode_EventView.episode_id == ep.episode_id)\n", - " .all()\n", - ")\n", - "\n", - "# polymorphic relationship to clinical fact tables can be context aware and resolved dynamically\n", - "events" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "87193c76", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'condition_occurrence'" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "events[0].event_table" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "851aa001", - "metadata": {}, - "outputs": [ - { - "name": 
"stdout", - "output_type": "stream", - "text": [ - "SELECT episode_event.episode_id, episode_event.event_id, episode_event.episode_event_field_concept_id \n", - "FROM episode_event \n", - "WHERE episode_event.episode_id = 1\n" - ] - } - ], - "source": [ - "q = session.query(Episode_EventView).filter(Episode_EventView.episode_id == ep.episode_id)\n", - "\n", - "print(q.statement.compile(compile_kwargs={\"literal_binds\": True}))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "201386d6", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e828901e", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "omop-alchemy (3.13.3)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/04_timeline.ipynb b/notebooks/04_timeline.ipynb deleted file mode 100644 index 59e747f..0000000 --- a/notebooks/04_timeline.ipynb +++ /dev/null @@ -1,142 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "8deb60a9", - "metadata": {}, - "outputs": [], - "source": [ - "import sqlalchemy as sa\n", - "from sqlalchemy.orm import sessionmaker\n", - "from omop_alchemy.cdm.model.vocabulary import Concept, ConceptView, Domain, Vocabulary, Concept_Class\n", - "from orm_loader.helpers import configure_logging, bootstrap, bulk_load_context\n", - "from omop_alchemy import get_engine_name, load_environment, TEST_PATH, ROOT_PATH\n", - "from omop_alchemy.cdm.model.extended import Person_Timeline" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "deea8749", - "metadata": {}, - "outputs": [ - { - 
"name": "stderr", - "output_type": "stream", - "text": [ - "2026-01-22 15:30:52,347 | INFO | sql_loader.omop_alchemy.config | Environment variables loaded from .env file\n", - "2026-01-22 15:30:52,348 | INFO | sql_loader.omop_alchemy.config | Default database engine configured\n" - ] - } - ], - "source": [ - "\n", - "configure_logging()\n", - "load_environment()\n", - "engine_string = get_engine_name()\n", - "\n", - "engine = sa.create_engine(engine_string, future=True, echo=False)\n", - "bootstrap(engine, create=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "b3e61002", - "metadata": {}, - "outputs": [], - "source": [ - "Session = sessionmaker(bind=engine, future=True)\n", - "session = Session()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "b2e732c1", - "metadata": {}, - "outputs": [], - "source": [ - "people = session.query(Person_Timeline).limit(5).all()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "7446ea16", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[,\n", - " ,\n", - " ,\n", - " ]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "people[0].timeline" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "99c17c10", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['{\"person_id\": 2, \"concept_id\": 1635163, \"event_start\": \"2020-01-03T00:00:00\", \"event_end\": null, \"value\": {\"type\": \"numeric\", \"value\": 1.0}, \"metadata\": {\"unit_concept_id\": null}}',\n", - " '{\"person_id\": 2, \"concept_id\": 1633674, \"event_start\": \"2020-01-03T00:00:00\", \"event_end\": null, \"value\": {\"type\": \"numeric\", \"value\": 1.0}, \"metadata\": {\"unit_concept_id\": null}}',\n", - " '{\"person_id\": 2, \"concept_id\": 1634891, \"event_start\": \"2020-01-03T00:00:00\", \"event_end\": null, \"value\": {\"type\": \"numeric\", \"value\": 1.0}, \"metadata\": 
{\"unit_concept_id\": null}}',\n", - " '{\"condition_concept_id\": 36535612, \"condition_occurrence_id\": 2, \"condition_start_date\": \"2020-01-03\", \"condition_status_concept_id\": 32902, \"condition_type_concept_id\": 3564487, \"person_id\": 2}']" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "people[1].to_json()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eb3b9d11", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "omop-alchemy (3.13.3)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/05_concept_resolver.ipynb b/notebooks/05_concept_resolver.ipynb deleted file mode 100644 index 80da2c0..0000000 --- a/notebooks/05_concept_resolver.ipynb +++ /dev/null @@ -1,308 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "5ebb19b4", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2026-01-26 21:26:57,912 | INFO | sql_loader.omop_alchemy.config | Environment variables loaded from .env file\n", - "2026-01-26 21:26:57,912 | INFO | sql_loader.omop_alchemy.config | Default database engine configured\n" - ] - } - ], - "source": [ - "from orm_loader.helpers import configure_logging, bootstrap\n", - "from omop_alchemy import get_engine_name, load_environment\n", - "import sqlalchemy as sa\n", - "\n", - "configure_logging()\n", - "load_environment()\n", - "\n", - "engine_string = get_engine_name('cdm')\n", - "engine = sa.create_engine(engine_string, future=True, echo=False)\n", - "\n", - "bootstrap(engine, create=True)" - ] - 
}, - { - "cell_type": "code", - "execution_count": 2, - "id": "35e8b1b7", - "metadata": {}, - "outputs": [], - "source": [ - "from omop_alchemy.cdm.model.vocabulary import Concept, Concept_Relationship\n", - "from omop_alchemy.cdm.model.clinical import Condition_Occurrence\n", - "from sqlalchemy.orm import sessionmaker" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "5921d6ac", - "metadata": {}, - "outputs": [], - "source": [ - "Session = sessionmaker(bind=engine, future=True)\n", - "session = Session()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "c5154ea0", - "metadata": {}, - "outputs": [], - "source": [ - "from omop_alchemy.cdm.model.extended.concept_resolver import OMOPConceptResolver, ConceptValidationMixin\n", - "from orm_loader.helpers import Base\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "515d57fe", - "metadata": {}, - "outputs": [], - "source": [ - "related_concept = sa.alias(Concept, name='related_concept')\n", - "\n", - "q = (\n", - " sa.select(\n", - " Concept.concept_id,\n", - " Concept.standard_concept,\n", - " Concept_Relationship.relationship_id,\n", - " related_concept.c.concept_id.label('related_concept_id'),\n", - " related_concept.c.standard_concept.label('related_standard_concept'),\n", - " ).join(\n", - " Concept_Relationship, Concept.concept_id == Concept_Relationship.concept_id_1\n", - " ).join(\n", - " related_concept, Concept_Relationship.concept_id_2 == related_concept.c.concept_id\n", - " ).where(\n", - " Concept_Relationship.relationship_id == 'Subsumes'\n", - " )\n", - ").subquery()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "1372d0dc", - "metadata": {}, - "outputs": [], - "source": [ - "class TestMapper(OMOPConceptResolver, ConceptValidationMixin, Base):\n", - " __table__ = q\n", - "\n", - " concept_id = q.c.concept_id\n", - " standard_concept = q.c.standard_concept\n", - " relationship_id = q.c.relationship_id\n", - " 
related_concept_id = q.c.related_concept_id\n", - " related_standard_concept = q.c.related_standard_concept" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "dfbdb85f", - "metadata": {}, - "outputs": [], - "source": [ - "table = TestMapper.get_queryable_table(session)\n", - "cols = TestMapper.concept_id_columns()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "d62c7f03", - "metadata": {}, - "outputs": [], - "source": [ - "violations = TestMapper.referenced_concept_violations(session)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "52f3fdde", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "b0f313cb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Invalid Related Concept IDs
037109760
137109761
237109762
342598409
43170326
......
318137109755
318237109756
318337109757
318437109758
318537109759
\n", - "

3186 rows × 1 columns

\n", - "
" - ], - "text/plain": [ - " Invalid Related Concept IDs\n", - "0 37109760\n", - "1 37109761\n", - "2 37109762\n", - "3 42598409\n", - "4 3170326\n", - "... ...\n", - "3181 37109755\n", - "3182 37109756\n", - "3183 37109757\n", - "3184 37109758\n", - "3185 37109759\n", - "\n", - "[3186 rows x 1 columns]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "\n", - "pd.DataFrame(violations['related_concept_id'], columns=['Invalid Related Concept IDs'])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "0da24094", - "metadata": {}, - "outputs": [], - "source": [ - "class CoT(Condition_Occurrence, OMOPConceptResolver, ConceptValidationMixin):\n", - " pass" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "a599d50b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'condition_type_concept_id': {32544, 32545, 42539609, 45754907}}" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "CoT.referenced_concept_violations(session)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fce09d89", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "omop-alchemy (3.13.3)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/ORMforResearchReadyData_APAC2023.pdf b/notebooks/ORMforResearchReadyData_APAC2023.pdf deleted file mode 100644 index 54d5557..0000000 Binary files a/notebooks/ORMforResearchReadyData_APAC2023.pdf and /dev/null differ diff --git a/notebooks/concept_enums.py 
b/notebooks/concept_enums.py deleted file mode 100644 index da0f4f0..0000000 --- a/notebooks/concept_enums.py +++ /dev/null @@ -1,207 +0,0 @@ -import enum - -class ConceptEnum(enum.Enum): - - @classmethod - def member_values(cls): - return [s.value for s in cls] - - @classmethod - def is_member(cls, val): - return not val or val in [s.value for s in cls] - - @classmethod - def labels(cls): - return [s.name for s in cls] - - @classmethod - def get_name(cls, val): - try: - return cls(val).name - except: - return '' - -class ModifierFields(ConceptEnum): - condition_occurrence_id = 1147127 - drug_exposure_id = 1147707 - procedure_occurrence_id = 1147082 - episode_id = 756290 - -class ModifierTables(ConceptEnum): - drug_exposure = 1147339 - episode = 35225440 - observation = 1147304 - -class TreatmentEpisode(ConceptEnum): - treatment_regimen = 32531 # Assignment to or derivation of chemo treatment regimen - treatment_cycle = 32532 # Assignment to or derivation of chemo treatment cycle - cancer_surgery = 32939 # Surgical treatment episode - radiotherapy = 32940 # Radiotherapy treatment episode - -class Modality(ConceptEnum): - chemotherapy = 35803401 - radiotherapy = 35803411 - -class DiseaseEpisodeConcepts(ConceptEnum): - episode_of_care = 32533 # Overarching disease episode - - confined = 32528 # Confined disease extent - invasive = 32677 # Invasive disease extent - metastatic = 32944 # Invasive disease extent - - stable_disease = 32948 # Stable disease dynamic - disease_progression = 32949 # Progression disease dynamic - partial_response = 32947 # Partial response disease dynamic - complete_response = 32947 # Complete response disease dynamic - -class EpisodeTypes(ConceptEnum): - ehr_defined = 32544 # Episode defined in EHR - ehr_derived = 32545 # Episode derived algorithmically from EHR - ehr_prescription = 32838 # EHR prescription - ehr_planned_dispensing = 32837 # EHR planned dispensation - ehr_encounter_record = 32827 # EHR encounter - ehr_admin_record = 32818 # 
EHR administration record - ehr_outpatient_note = 32834 # EHR outpatient note - rt_care_plan = 42539609 # RT care plan - -class DocumentType(ConceptEnum): - oncology_note = 706266 - -class DocumentEncoding(ConceptEnum): - UTF8 = 32678 - -class Language(ConceptEnum): - english = 4180186 - -class ConditionModifiers(ConceptEnum): - # for measurement_concept_id grouping - init_diag = 734306 # Cancer Modifier - Initial Diagnosis - tnm = 734320 # Cancer Modifier - Parent AJCC/UICC concept - mets = 36769180 # Cancer Modifier - Parent metastasis hierarchy parent - -class TreatmentModifiers(ConceptEnum): - rt_parameter = 4036397 # Radiotherapy parameter parent - rt_projection = 4124464 # Radiotherapy projection parent - rt_site = 4240671 # Radiotherapy anatomical site parent - -class TreatmentIntent(ConceptEnum): - neoadjuvant = 4161587 - adjuvant = 4191637 - curative = 4162591 - palliative = 4179711 - -class CancerProcedureTypes(ConceptEnum): - surgical_procedure = 4301351 - historical_procedure = 1340204 - rt_procedure = 1242725 # Radiotherapy procedure parent - rn_procedure = 4161415 # Radionuclide parent - rt_externalbeam = 4141448 # ebrt parent - rt_course = 37163499 # overall RT course as a procedure - used to hold intent modifier, as well as to compare intended vs. delivered treatment events - -class ProceduresByLocation(ConceptEnum): - procedure_on_lung = 4040549 - operation_on_lung = 4301352 - -class TStageConcepts(ConceptEnum): - # used to group tnm mappings into their relevant subtypes - # preferably create a concept that is the parent of all these T concepts, but for now... - t0 = 1634213 - t1 = 1635564 - t2 = 1635562 - t3 = 1634376 - t4 = 1634654 - ta = 1635114 - tx = 1635682 - tis = 1634530 - -class NStageConcepts(ConceptEnum): - # as above for n... - n0 = 1633440 - n1 = 1634434 - n2 = 1634119 - n3 = 1635320 - n4 = 1635445 - nx = 1633885 - -class MStageConcepts(ConceptEnum): - # and m... 
- m0 = 1635624 - m1 = 1635142 - mx = 1633547 - -class GroupStageConcepts(ConceptEnum): - # there's a pattern here - stage0 = 1633754 - stageI = 1633306 - stageII = 1634209 - stageIII = 1633650 - stageIV = 1633308 - -class ConditionConcepts(ConceptEnum): - ehr_problem_list = 32840 - resolved_condition = 32906 - confirmed_diagnosis = 32893 - - -class StageEdition(ConceptEnum): - _6th = 1634647 - _7th = 1633496 - _8th = 1634449 - -class ModifierConcepts(ConceptEnum): - grade = 35918328 - laterality = 35918306 - derived_value = 45754907 - tumor_size = 4139794 - primary_tumor = 36768229 - - -class DrugExposureConcepts(ConceptEnum): - drug_dose = 4162374 - ehr_drug_admin = 32818 - placebo = 1379408 - -class DemographyConcepts(ConceptEnum): - cob = 4155450 - language_spoken = 4052785 - postcode = 4083591 - - -class GenomicValue(ConceptEnum): - positive = 9191 - negative = 9189 - equivocal = 4172976 - -class CancerConsultTypes(ConceptEnum): - medonc = 4147722 - clinonc = 4139715 # there is no suitable radonc code? only radiotherapist? 
- oncology_referral = 4084352 - pall_care_referral = 4127745 - -class ProviderSpecialty(ConceptEnum): - radonc = 35621987 - medonc = 4151173 - pall_care = 4202942 - dietetitian = 4220638 - occupational_therapist = 4213188 - speech_therapist = 4010130 - haematologist = 4221826 - geneticist = 4009808 - gynaecologist = 17036 - radiation_therapist = 4143746 - medical_doctor = 4010577 - - -class WeightConcepts(ConceptEnum): - weight = 4099154 - height = 607590 - bsa = 4201235 - weight_change = 4086522 - -class WeightUnits(ConceptEnum): - lb = 8739 - pct = 4041099 - kg = 9529 - cm = 8582 - inch = 9327 - m2 = 8617 \ No newline at end of file diff --git a/omop_alchemy/config.py b/omop_alchemy/config.py index eafadb6..1cbd66f 100644 --- a/omop_alchemy/config.py +++ b/omop_alchemy/config.py @@ -10,10 +10,12 @@ logger = get_logger(__name__) +# from orm-loader 0.4.0 onwards, implicit psycopg2 dependency has been removed in favor of explicit driver modules. +# This mapping is used to provide clearer error messages when a required driver is missing. 
POSTGRES_DRIVER_MODULES: Mapping[str, str] = { - "postgresql": "psycopg2", - "postgresql+psycopg2": "psycopg2", + "postgresql": "psycopg", # bare URL aliased to psycopg "postgresql+psycopg": "psycopg", + "postgresql+psycopg2": "psycopg2", # retained so missing-driver message is clear } def load_environment(dotenv: str = '') -> None: diff --git a/omop_alchemy/maintenance/backup.py b/omop_alchemy/maintenance/backup.py index 6f32eee..a277e78 100644 --- a/omop_alchemy/maintenance/backup.py +++ b/omop_alchemy/maintenance/backup.py @@ -98,7 +98,7 @@ def _psql_path() -> str: def _default_output_path(format: BackupFormat) -> Path: timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") - return Path.cwd() / f"omop-maint-backup-{timestamp}{FORMAT_SUFFIXES[format]}" + return Path.cwd() / f"omop-alchemy-backup-{timestamp}{FORMAT_SUFFIXES[format]}" def _libpq_connection_uri(url: sa.engine.URL) -> str: diff --git a/omop_alchemy/maintenance/cli.py b/omop_alchemy/maintenance/cli.py index d7f640f..de2c23f 100644 --- a/omop_alchemy/maintenance/cli.py +++ b/omop_alchemy/maintenance/cli.py @@ -35,7 +35,7 @@ from .help import install_help_customizations from .info import collect_maintenance_info from .indexes import IndexAction, manage_indexes -from .load_vocab import VocabularyLoadProgress, load_vocab_source +from .load_vocab import MergeStrategy, VocabularyLoadProgress, load_vocab_source from .reconcile import reconcile_schema from .reset_sequences import reset_model_sequences from .tables import TableScope @@ -140,7 +140,7 @@ def _configure_cli_logging() -> None: ) if mode == "file": - log_path = defaults_path().parent / "logging" / "omop-maint.log" + log_path = defaults_path().parent / "logging" / "omop-alchemy.log" log_path.parent.mkdir(parents=True, exist_ok=True) handler: logging.Handler = logging.FileHandler(log_path, encoding="utf-8") else: @@ -821,13 +821,13 @@ def load_vocab_source_command( dotenv: str | None = typer.Option(None, help="Optional dotenv file to load."), 
engine_schema: str | None = typer.Option(None, help="Engine schema selector."), db_schema: str | None = typer.Option(None, help="Database schema override. PostgreSQL only; uses search_path for ORM CSV loading."), - merge_strategy: str = typer.Option( - "upsert", - help="CSV merge strategy passed to the ORM loader. Defaults to non-destructive `upsert`; use `replace` to overwrite matching primary keys.", + merge_strategy: MergeStrategy = typer.Option( + "replace", + help="CSV merge strategy. One of `replace` (default, keeps DB in sync), `upsert` (incremental, non-destructive), or `insert_if_empty` (fast path for a fresh empty target).", ), chunksize: int | None = typer.Option( - None, - help="Chunk size for fallback ORM CSV loading to reduce memory usage on large Athena files.", + 100_000, + help="Chunk size for fallback ORM CSV loading. Defaults to 100 000 rows; pass 0 to disable chunking.", ), dry_run: bool = typer.Option(False, "--dry-run"), ) -> None: @@ -851,7 +851,7 @@ def load_vocab_source_command( console.print( render_error( "No Athena vocabulary source path is configured. " - "Set it with `omop-maint config set-overrides --athena-source ` " + "Set it with `omop-alchemy config set-overrides --athena-source ` " "or pass `--athena-source`." 
) ) @@ -895,25 +895,15 @@ def _update_progress(event: VocabularyLoadProgress) -> None: ) ) - if chunksize is None: - report = load_vocab_source( - engine, - source_path=connection_defaults.athena_source, - db_schema=connection_defaults.db_schema, - dry_run=dry_run, - merge_strategy=merge_strategy, - progress_callback=_update_progress, - ) - else: - report = load_vocab_source( - engine, - source_path=connection_defaults.athena_source, - db_schema=connection_defaults.db_schema, - dry_run=dry_run, - merge_strategy=merge_strategy, - chunksize=chunksize, - progress_callback=_update_progress, - ) + report = load_vocab_source( + engine, + source_path=connection_defaults.athena_source, + db_schema=connection_defaults.db_schema, + dry_run=dry_run, + merge_strategy=merge_strategy, + chunksize=None if chunksize == 0 else chunksize, + progress_callback=_update_progress, + ) progress.update( task_id, completed=100.0, diff --git a/omop_alchemy/maintenance/doctor.py b/omop_alchemy/maintenance/doctor.py index bc1a881..91b1cbd 100644 --- a/omop_alchemy/maintenance/doctor.py +++ b/omop_alchemy/maintenance/doctor.py @@ -63,7 +63,7 @@ def _build_recommendations( DoctorRecommendation( status="warning", summary=f"{info.missing_table_count} ORM-managed table(s) are missing from the target database.", - action="Run `omop-maint create-missing-tables` before attempting bulk operations.", + action="Run `omop-alchemy create-missing-tables` before attempting bulk operations.", ) ) @@ -72,7 +72,7 @@ def _build_recommendations( DoctorRecommendation( status="warning", summary=f"Schema reconciliation found {len(reconciliation.issues)} difference(s) against ORM metadata.", - action="Review `omop-maint reconcile-schema` output before continuing with ETL or maintenance work.", + action="Review `omop-alchemy reconcile-schema` output before continuing with ETL or maintenance work.", ) ) @@ -84,7 +84,7 @@ def _build_recommendations( DoctorRecommendation( status="warning", summary="Some PostgreSQL RI 
triggers are currently disabled.", - action="If loading is complete, run `omop-maint foreign-keys validate` and then `omop-maint foreign-keys enable --strict`.", + action="If loading is complete, run `omop-alchemy foreign-keys validate` and then `omop-alchemy foreign-keys enable --strict`.", ) ) @@ -96,7 +96,7 @@ def _build_recommendations( DoctorRecommendation( status="failed", summary="Foreign key validation found violating rows.", - action="Fix the reported rows, then rerun `omop-maint foreign-keys enable --strict`.", + action="Fix the reported rows, then rerun `omop-alchemy foreign-keys enable --strict`.", ) ) @@ -105,7 +105,7 @@ def _build_recommendations( DoctorRecommendation( status="warning", summary="`pg_dump` is not on PATH, so backup-database is unavailable from this machine.", - action="Install PostgreSQL client tools on the machine running `omop-maint`.", + action="Install PostgreSQL client tools on the machine running `omop-alchemy`.", ) ) @@ -118,7 +118,7 @@ def _build_recommendations( DoctorRecommendation( status="warning", summary="Neither `pg_restore` nor `psql` is on PATH, so restore-database is unavailable from this machine.", - action="Install PostgreSQL client tools on the machine running `omop-maint`.", + action="Install PostgreSQL client tools on the machine running `omop-alchemy`.", ) ) @@ -207,7 +207,7 @@ def collect_doctor_report( DoctorCheck( name="schema drift", status="skipped", - detail="Run `omop-maint doctor --deep` to reconcile ORM metadata against the target database.", + detail="Run `omop-alchemy doctor --deep` to reconcile ORM metadata against the target database.", ) ) @@ -261,7 +261,7 @@ def collect_doctor_report( DoctorCheck( name="foreign key validation", status="skipped", - detail="Run `omop-maint doctor --deep` to validate selected foreign key relationships.", + detail="Run `omop-alchemy doctor --deep` to validate selected foreign key relationships.", ) ) else: diff --git a/omop_alchemy/maintenance/info.py 
b/omop_alchemy/maintenance/info.py index 4ca7003..aabd11c 100644 --- a/omop_alchemy/maintenance/info.py +++ b/omop_alchemy/maintenance/info.py @@ -315,7 +315,7 @@ def collect_maintenance_info( managed_tables = select_maintenance_tables( exclude_categories=(() if vocabulary_included else (TableCategory.VOCABULARY,)) ) - cli_path = shutil.which("omop-maint") + cli_path = shutil.which("omop-alchemy") dotenv_exists = None if dotenv is None else os.path.exists(dotenv) engine_name: str | None = None diff --git a/omop_alchemy/maintenance/load_vocab.py b/omop_alchemy/maintenance/load_vocab.py index a0ea2af..029a86d 100644 --- a/omop_alchemy/maintenance/load_vocab.py +++ b/omop_alchemy/maintenance/load_vocab.py @@ -3,7 +3,7 @@ from collections.abc import Callable from dataclasses import dataclass from pathlib import Path -from typing import TypeAlias, cast +from typing import Literal, TypeAlias, cast import sqlalchemy as sa import sqlalchemy.orm as so @@ -26,6 +26,8 @@ from .reset_sequences import reset_model_sequences from .tables import TableCategory, schema_adjusted_metadata, select_maintenance_tables +MergeStrategy: TypeAlias = Literal["replace", "upsert", "insert_if_empty"] + VocabularyModel: TypeAlias = type[CSVTableProtocol] VocabularyLoadProgressCallback: TypeAlias = Callable[["VocabularyLoadProgress"], None] @@ -48,7 +50,7 @@ class VocabularyLoadReport: source_path: str backend: str db_schema: str | None - merge_strategy: str + merge_strategy: MergeStrategy created_table_count: int sequence_reset_count: int results: tuple[VocabularyLoadResult, ...] @@ -149,8 +151,8 @@ def _load_vocab_model_csv( *, model: VocabularyModel, csv_path: Path, - merge_strategy: str, - quote_mode: str = "csv", + merge_strategy: MergeStrategy, + quote_mode: str = "auto", chunksize: int | None = None, ) -> int: load_kwargs: dict[str, object] = { @@ -262,7 +264,8 @@ def _configure_loader_connection( "SQLite uses the default database namespace." 
) - connection.exec_driver_sql(f"SET search_path TO {db_schema}") + quoted_schema = '"' + db_schema.replace('"', '""') + '"' + connection.exec_driver_sql(f"SET search_path TO {quoted_schema}") def load_vocab_source( engine: sa.Engine, @@ -270,8 +273,8 @@ def load_vocab_source( source_path: str | Path, db_schema: str | None = None, dry_run: bool = False, - merge_strategy: str = "replace", - chunksize: int | None = None, + merge_strategy: MergeStrategy = "replace", + chunksize: int | None = 100_000, progress_callback: VocabularyLoadProgressCallback | None = None, ) -> VocabularyLoadReport: _ensure_supported_backend(engine) @@ -361,6 +364,7 @@ def load_vocab_source( for table_index, item in enumerate(load_items, start=1): model = item.model csv_path = item.csv_path + required = item.required current_model_name = model.__tablename__ current_csv_path = str(csv_path) if dry_run: @@ -396,7 +400,7 @@ def load_vocab_source( row_count=None, csv_path=str(csv_path), required=required, - detail="Athena CSV would be loaded via staged ORM CSV loader using tab-delimited input and literal quote mode", + detail="Athena CSV would be loaded via staged ORM CSV loader using tab-delimited input and auto-detected quote mode", ) ) continue @@ -405,7 +409,7 @@ def load_vocab_source( "model": model, "csv_path": csv_path, "merge_strategy": merge_strategy, - "quote_mode": "literal", + "quote_mode": "auto", } if chunksize is not None: loader_kwargs["chunksize"] = chunksize @@ -465,7 +469,7 @@ def load_vocab_source( row_count=row_count, csv_path=str(csv_path), required=required, - detail="Athena CSV loaded via staged ORM CSV loader using tab-delimited input and literal quote mode", + detail="Athena CSV loaded via staged ORM CSV loader using tab-delimited input and auto-detected quote mode", ) ) if not dry_run: diff --git a/omop_alchemy/maintenance/tables.py b/omop_alchemy/maintenance/tables.py index 407bbf0..999efd0 100644 --- a/omop_alchemy/maintenance/tables.py +++ 
b/omop_alchemy/maintenance/tables.py @@ -67,7 +67,8 @@ def has_single_integer_primary_key(self) -> bool: def qualified_table_name(table_name: str, db_schema: str | None) -> str: if db_schema: - return f"{db_schema}.{table_name}" + quoted_schema = '"' + db_schema.replace('"', '""') + '"' + return f"{quoted_schema}.{table_name}" return table_name diff --git a/omop_alchemy/maintenance/ui.py b/omop_alchemy/maintenance/ui.py index 6e4a7a1..3bcd4e9 100644 --- a/omop_alchemy/maintenance/ui.py +++ b/omop_alchemy/maintenance/ui.py @@ -825,7 +825,7 @@ def render_foreign_key_validation_summary( ( "All selected foreign key relationships passed validation." if not failed_tables - else "Fix the violating rows, then rerun `omop-maint foreign-keys enable --strict`." + else "Fix the violating rows, then rerun `omop-alchemy foreign-keys enable --strict`." ), ) return Panel.fit( diff --git a/pyproject.toml b/pyproject.toml index f8f6890..965ad27 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "omop-alchemy" -version = "0.6.2" +version = "0.6.3" description = "SQLAlchemy-based models, validation, and utilities for the OHDSI OMOP Common Data Model" readme = "README.md" requires-python = ">=3.12" @@ -36,13 +36,12 @@ dependencies = [ "python-dotenv>=1.2.2", "typer>=0.12", "rich>=13.0", - "orm-loader>=0.3.27,<0.4.0", + "orm-loader>=0.4.1", ] [project.optional-dependencies] postgres = [ "psycopg[binary]>=3.2", - "psycopg2-binary>=2.9", ] dev = [ @@ -69,13 +68,15 @@ Repository = "https://github.com/AustralianCancerDataNetwork/OMOP_Alchemy" Issues = "https://github.com/AustralianCancerDataNetwork/OMOP_Alchemy/issues" [project.scripts] -omop-maint = "omop_alchemy.maintenance.cli:main" +omop-alchemy = "omop_alchemy.maintenance.cli:main" [build-system] requires = ["setuptools>=68", "wheel"] build-backend = "setuptools.build_meta" +[tool.pytest.ini_options] + [tool.setuptools] include-package-data = true diff --git a/tests/README.md b/tests/README.md index 
f2e6ec4..0f70491 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,8 +1,41 @@ -# OMOP_Alchemy Tests +# Running the test suite -## Running Tests +## Quick start ```bash -py.test omop_alchemy # run all tests -py.test omop_alchemy test_config_and_setup.py # run specific test battery -``` \ No newline at end of file +# Unit and SQLite tests — no database required +uv run --extra dev pytest -m "not postgres" + +# PostgreSQL integration tests — requires the Docker container below +docker compose -f tests/docker-compose.yaml up -d +uv run --extra dev --extra postgres pytest -m postgres -v +``` + +## PostgreSQL integration tests + +The `postgres`-marked tests connect to a local PostgreSQL 16 container on +port **55432**. + +```bash +# Start +docker compose -f tests/docker-compose.yaml up -d + +# Run (this will run all tests) +uv run --extra dev --extra postgres pytest -m "postgres or not postgres" -v + +# Stop +docker compose -f tests/docker-compose.yaml down +``` + +## Test markers + +| Marker | Meaning | +|--------|---------| +| *(none)* | Runs on SQLite, no external dependencies | +| `postgres` | Requires the Docker container on port 55432 | + +## Fixture data + +`tests/fixtures/athena_source/` contains a minimal set of Athena vocabulary +CSVs (7 concepts) used to seed the SQLite test database. These are committed +to the repo and are sufficient for all non-postgres tests. 
diff --git a/tests/conftest.py b/tests/conftest.py index 3443879..6c3422c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,12 +1,16 @@ +import copy +import time from datetime import date from pathlib import Path - +import os import pytest import sqlalchemy as sa from orm_loader.helpers import bootstrap import sqlalchemy.orm as so from sqlalchemy.orm import Session, sessionmaker +from typing import Any, Dict, Tuple + from omop_alchemy.maintenance.load_vocab import _load_vocab_model_csv from omop_alchemy.cdm.model.clinical import Condition_Occurrence, Person from omop_alchemy.cdm.model.derived import Observation_Period @@ -32,27 +36,149 @@ Concept_Relationship, ] +# --------------------------------------------------------------------------- +# In-memory Athena fixture data +# --------------------------------------------------------------------------- +# Keyed by ORM __tablename__. Each value is a dict mapping column name → +# tuple of row values (one entry per row, in the same order). +# Empty tuples = table has no rows (header-only CSV). 
+ +_ATHENA_FIXTURE_DATA: Dict[str, Dict[str, Tuple[Any, ...]]] = { + "concept_ancestor": { + "ancestor_concept_id": (), + "descendant_concept_id": (), + "min_levels_of_separation": (), + "max_levels_of_separation": (), + }, + "concept_class": { + "concept_class_id": ("Clinical Finding", "Episode", "Ethnicity", "Field", "Gender", "Race", "Type Concept"), + "concept_class_name": ("Clinical Finding", "Episode", "Ethnicity", "Field", "Gender", "Race", "Type Concept"), + "concept_class_concept_id": (0, 0, 0, 0, 0, 0, 0), + }, + "concept_relationship": { + "concept_id_1": (), + "concept_id_2": (), + "relationship_id": (), + "valid_start_date": (), + "valid_end_date": (), + "invalid_reason": (), + }, + "concept_synonym": { + "concept_id": (), + "concept_synonym_name": (), + "language_concept_id": (), + }, + "concept": { + "concept_id": (8507, 8527, 38003564, 32817, 201826, 32546, 1147127), + "concept_name": ( + "MALE", + "White", + "Not Hispanic or Latino", + "EHR", + "Type 2 diabetes mellitus", + "Disease Episode", + "condition_occurrence.condition_occurrence_id", + ), + "domain_id": ("Gender", "Race", "Ethnicity", "Type Concept", "Condition", "Episode", "Metadata"), + "vocabulary_id": ("Gender", "Race", "Ethnicity", "Type Concept", "SNOMED", "Episode", "CDM"), + "concept_class_id": ( + "Gender", + "Race", + "Ethnicity", + "Type Concept", + "Clinical Finding", + "Episode", + "Field", + ), + "standard_concept": ("S", "S", "S", "S", "S", "S", "S"), + "concept_code": ( + "M", + "White", + "Not Hispanic or Latino", + "EHR", + "44054006", + "Disease Episode", + "condition_occurrence.condition_occurrence_id", + ), + "valid_start_date": ( + "19700101", + "19700101", + "19700101", + "19700101", + "19700101", + "19700101", + "19700101", + ), + "valid_end_date": ( + "20991231", + "20991231", + "20991231", + "20991231", + "20991231", + "20991231", + "20991231", + ), + "invalid_reason": (None, None, None, None, None, None, None), + }, + "domain": { + "domain_id": ("Condition", 
"Episode", "Ethnicity", "Gender", "Metadata", "Race", "Type Concept"), + "domain_name": ("Condition", "Episode", "Ethnicity", "Gender", "Metadata", "Race", "Type Concept"), + "domain_concept_id": (0, 0, 0, 0, 0, 0, 0), + }, + "relationship": { + "relationship_id": ("Is a", "Subsumes"), + "relationship_name": ("Is a", "Subsumes"), + "is_hierarchical": (1, 1), + "defines_ancestry": (1, 0), + "reverse_relationship_id": ("Subsumes", "Is a"), + "relationship_concept_id": (0, 0), + }, + "vocabulary": { + "vocabulary_id": ("CDM", "Episode", "Ethnicity", "Gender", "Race", "SNOMED", "Type Concept"), + "vocabulary_name": ( + "Common Data Model", + "OMOP Episode", + "OMOP Ethnicity", + "OMOP Gender", + "OMOP Race", + "SNOMED-CT", + "OMOP Type Concept", + ), + "vocabulary_reference": ("OHDSI", "OHDSI", "OHDSI", "OHDSI", "OHDSI", "IHTSDO", "OHDSI"), + "vocabulary_version": ("v5.4", "v1.0", "v1.0", "v1.0", "v1.0", "SNOMED CT 2023", "v1.0"), + "vocabulary_concept_id": (0, 0, 0, 0, 0, 0, 0), + }, +} + -def _athena_source_path() -> Path: - """Return the repo-local Athena fixture directory.""" - return Path(__file__).parent / "fixtures" / "athena_source" +def _write_fixture_csv(directory: Path, table_name: str, data: Dict[str, Tuple[Any, ...]]) -> Path: + """Write an in-memory fixture dict to a tab-separated CSV file.""" + path = directory / f"{table_name.upper()}.csv" + cols = list(data.keys()) + rows = list(zip(*data.values())) if cols and any(data.values()) else [] + with open(path, "w", newline="", encoding="utf-8") as f: + f.write("\t".join(cols) + "\n") + for row in rows: + f.write("\t".join("" if v is None else str(v) for v in row) + "\n") + return path -def _load_fixture_vocabulary(engine: sa.Engine) -> None: - """Load required Athena vocabulary fixtures into the test database.""" - base_path = _athena_source_path() +def _load_fixture_vocabulary(engine: sa.Engine, tmp_dir: Path) -> None: + """Write in-memory Athena fixtures to tmp_dir and load them into the test database.""" 
with engine.connect() as connection: SessionLocal = so.sessionmaker(bind=connection, future=True) session = SessionLocal() try: for model in ATHENA_LOAD_ORDER: - csv_path = base_path / f"{model.__tablename__.upper()}.csv" + csv_path = _write_fixture_csv( + tmp_dir, model.__tablename__, _ATHENA_FIXTURE_DATA[model.__tablename__] + ) _load_vocab_model_csv( session, model=model, csv_path=csv_path, merge_strategy="upsert", - quote_mode="literal", + quote_mode="auto", ) session.commit() connection.commit() @@ -188,7 +314,7 @@ def engine(tmp_path_factory: pytest.TempPathFactory): ) bootstrap(engine, create=True) - _load_fixture_vocabulary(engine) + _load_fixture_vocabulary(engine, db_dir) SessionLocal = sessionmaker(bind=engine, future=True, expire_on_commit=False) with SessionLocal() as seed_session: @@ -200,6 +326,62 @@ def engine(tmp_path_factory: pytest.TempPathFactory): engine.dispose() +@pytest.fixture(scope="session") +def pg_engine(): + """ + Session-scoped engine connecting to a local PostgreSQL container. + + Start the container with: + docker compose -f tests/docker-compose.yaml up -d + + The fixture retries for up to 20 seconds to allow the container to become ready. + """ + _PG_URL = os.getenv("ENGINE_CDM") + if not _PG_URL: + pytest.skip("No PostgreSQL engine configured. Set ENGINE_CDM environment variable.") + engine = sa.create_engine(_PG_URL, future=True) + for attempt in range(20): + try: + with engine.connect() as conn: + conn.execute(sa.text("SELECT 1")) + break + except Exception: + if attempt == 19: + engine.dispose() + pytest.fail( + "PostgreSQL container not available after 20 attempts. " + "Run: docker compose -f tests/docker-compose.yaml up -d" + ) + time.sleep(1) + try: + yield engine + finally: + engine.dispose() + + +@pytest.fixture +def pg_session(pg_engine): + """ + Function-scoped PostgreSQL session with a clean schema for each test. + + Drops and recreates the public schema before each test to ensure full isolation. 
+ """ + with pg_engine.connect() as conn: + conn.execute(sa.text("DROP SCHEMA public CASCADE")) + conn.execute(sa.text("CREATE SCHEMA public")) + conn.commit() + + bootstrap(pg_engine, create=True) + + SessionLocal = sessionmaker(bind=pg_engine, future=True, expire_on_commit=False) + session = SessionLocal() + try: + yield session + finally: + session.rollback() + session.close() + + @pytest.fixture(scope="function") def session(engine) -> Session: # type: ignore """ @@ -220,3 +402,74 @@ def session(engine) -> Session: # type: ignore session.rollback() finally: session.close() + + +# --------------------------------------------------------------------------- +# In-memory Athena vocabulary fixtures +# --------------------------------------------------------------------------- +# Each fixture returns a mutable copy of the module-level constant so tests +# can append rows without cross-contaminating other tests. + + +@pytest.fixture(scope="function") +def athena_fixtures() -> Dict[str, Dict[str, Tuple[Any, ...]]]: + """All Athena vocabulary tables as a single dict keyed by ORM table name.""" + return {k: dict(v) for k, v in _ATHENA_FIXTURE_DATA.items()} + + +@pytest.fixture(scope="function") +def athena_source_dir(tmp_path: Path) -> Path: + """Write in-memory Athena fixtures to a temp directory and return the path.""" + source = tmp_path / "athena_source" + source.mkdir() + for table_name, data in _ATHENA_FIXTURE_DATA.items(): + _write_fixture_csv(source, table_name, data) + return source + + +@pytest.fixture(scope="function") +def concept_ancestor() -> Dict[str, Tuple[Any, ...]]: + """Mutable copy of the concept_ancestor fixture data.""" + return copy.deepcopy(_ATHENA_FIXTURE_DATA["concept_ancestor"]) + + +@pytest.fixture(scope="function") +def concept_class() -> Dict[str, Tuple[Any, ...]]: + """Mutable copy of the concept_class fixture data.""" + return copy.deepcopy(_ATHENA_FIXTURE_DATA["concept_class"]) + + +@pytest.fixture(scope="function") +def 
concept_relationship() -> Dict[str, Tuple[Any, ...]]: + """Mutable copy of the concept_relationship fixture data.""" + return copy.deepcopy(_ATHENA_FIXTURE_DATA["concept_relationship"]) + + +@pytest.fixture(scope="function") +def concept_synonym() -> Dict[str, Tuple[Any, ...]]: + """Mutable copy of the concept_synonym fixture data.""" + return copy.deepcopy(_ATHENA_FIXTURE_DATA["concept_synonym"]) + + +@pytest.fixture(scope="function") +def concept() -> Dict[str, Tuple[Any, ...]]: + """Mutable copy of the concept fixture data.""" + return copy.deepcopy(_ATHENA_FIXTURE_DATA["concept"]) + + +@pytest.fixture(scope="function") +def domain() -> Dict[str, Tuple[Any, ...]]: + """Mutable copy of the domain fixture data.""" + return copy.deepcopy(_ATHENA_FIXTURE_DATA["domain"]) + + +@pytest.fixture(scope="function") +def relationship() -> Dict[str, Tuple[Any, ...]]: + """Mutable copy of the relationship fixture data.""" + return copy.deepcopy(_ATHENA_FIXTURE_DATA["relationship"]) + + +@pytest.fixture(scope="function") +def vocabulary() -> Dict[str, Tuple[Any, ...]]: + """Mutable copy of the vocabulary fixture data.""" + return copy.deepcopy(_ATHENA_FIXTURE_DATA["vocabulary"]) diff --git a/tests/fixtures/create_test_fixtures.py b/tests/fixtures/create_test_fixtures.py deleted file mode 100644 index f98c140..0000000 --- a/tests/fixtures/create_test_fixtures.py +++ /dev/null @@ -1,580 +0,0 @@ -from __future__ import annotations -""" -This script rebuilds the SQLite test fixture from Athena vocabulary CSVs, then exports dummy clinical tables as CSV files. - -It assumes you have a terse sample set of appropriate concepts in the Athena source, but will attempt to fall back to any available concepts if the ideal ones are not present. The generated clinical data is deterministic based on the provided random seed, but otherwise arbitrary and not meant to reflect any real patient population. 
-""" -import argparse -from dataclasses import dataclass -from datetime import date, timedelta -from pathlib import Path -from random import Random - -import pandas as pd -import sqlalchemy as sa -from sqlalchemy.orm import Session, sessionmaker - -from omop_alchemy.cdm.model.clinical import Condition_Occurrence, Death, Measurement, Person -from omop_alchemy.cdm.model.derived import Observation_Period -from omop_alchemy.cdm.model.health_system import Care_Site, Location, Provider, Visit_Occurrence -from omop_alchemy.cdm.model.structural import Episode, Episode_Event -from omop_alchemy.cdm.model.vocabulary import Concept, Concept_Ancestor -from omop_alchemy.maintenance.create_tables import create_missing_tables -from omop_alchemy.maintenance.load_vocab import load_vocab_source - - -ROOT = Path(__file__).resolve().parents[2] -DEFAULT_ATHENA_SOURCE = ROOT / "tests" / "fixtures" / "athena_source" -DEFAULT_DB_PATH = ROOT / "tests" / "fixtures" / "test.db" -DEFAULT_CLINICAL_CSV_DIR = ROOT / "tests" / "fixtures" / "test_clinical_csvs" - -CLINICAL_EXPORT_MODELS = ( - Location, - Care_Site, - Provider, - Person, - Visit_Occurrence, - Observation_Period, - Death, - Condition_Occurrence, - Measurement, - Episode, - Episode_Event, -) - - -@dataclass(frozen=True) -class FixtureConcepts: - genders: tuple[int, ...] - ethnicities: tuple[int, ...] - races: tuple[int, ...] - visit_concepts: tuple[int, ...] - location_concepts: tuple[int, ...] - provider_specialties: tuple[int, ...] - type_concepts: tuple[int, ...] - condition_concepts: tuple[int, ...] - stage_concepts: tuple[int, ...] - episode_concept_id: int - condition_event_field_concept_id: int - measurement_event_field_concept_id: int - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description=( - "Rebuild the SQLite test fixture from Athena vocabulary CSVs, " - "then export dummy clinical tables as CSV files." 
- ) - ) - parser.add_argument( - "--athena-source", - type=Path, - default=DEFAULT_ATHENA_SOURCE, - help="Path to the curated Athena fixture directory.", - ) - parser.add_argument( - "--db-path", - type=Path, - default=DEFAULT_DB_PATH, - help="Target SQLite database path.", - ) - parser.add_argument( - "--clinical-csv-dir", - type=Path, - default=DEFAULT_CLINICAL_CSV_DIR, - help="Directory to receive exported dummy clinical CSV files.", - ) - parser.add_argument( - "--seed", - type=int, - default=54, - help="Deterministic random seed for dummy clinical data generation.", - ) - parser.add_argument( - "--person-count", - type=int, - default=24, - help="Number of dummy people to generate.", - ) - parser.add_argument( - "--force-rebuild", - action="store_true", - help="Ignore any existing fixture database and rebuild from scratch.", - ) - return parser.parse_args() - - -def _reset_outputs(db_path: Path, clinical_csv_dir: Path) -> None: - db_path.parent.mkdir(parents=True, exist_ok=True) - clinical_csv_dir.mkdir(parents=True, exist_ok=True) - - if db_path.exists(): - db_path.unlink() - - for csv_path in clinical_csv_dir.glob("*.csv"): - csv_path.unlink() - - -def _fixture_db_has_people(db_path: Path) -> bool: - if not db_path.exists(): - return False - - engine = sa.create_engine(f"sqlite:///{db_path}", future=True, echo=False) - try: - inspector = sa.inspect(engine) - if not inspector.has_table("person"): - return False - - with engine.connect() as connection: - person_count = connection.scalar(sa.text("SELECT COUNT(*) FROM person")) - return bool(person_count and int(person_count) > 0) - except Exception: - return False - finally: - engine.dispose() - - -def _missing_clinical_csv_exports(output_dir: Path) -> tuple[str, ...]: - missing: list[str] = [] - for model in CLINICAL_EXPORT_MODELS: - output_path = output_dir / f"{model.__table__.name.upper()}.csv" - if not output_path.exists(): - missing.append(output_path.name) - return tuple(missing) - - -def _concept_ids( - 
session: Session, - *, - domain_id: str | None = None, - concept_name: str | None = None, - vocabulary_id: str | None = None, - concept_class_id: str | None = None, - standard_only: bool = False, - limit: int | None = None, -) -> tuple[int, ...]: - stmt = sa.select(Concept.concept_id) - if domain_id is not None: - stmt = stmt.where(Concept.domain_id == domain_id) - if concept_name is not None: - stmt = stmt.where(Concept.concept_name == concept_name) - if vocabulary_id is not None: - stmt = stmt.where(Concept.vocabulary_id == vocabulary_id) - if concept_class_id is not None: - stmt = stmt.where(Concept.concept_class_id == concept_class_id) - if standard_only: - stmt = stmt.where(Concept.standard_concept == "S") - stmt = stmt.order_by(Concept.concept_id) - if limit is not None: - stmt = stmt.limit(limit) - return tuple(int(value) for value in session.scalars(stmt)) - - -def _single_concept_id(session: Session, **filters: object) -> int: - values = _concept_ids(session, limit=1, **filters) - if not values: - raise RuntimeError(f"Missing concept fixture for filters: {filters}") - return values[0] - - -def _stage_concept_ids(session: Session) -> tuple[int, ...]: - stage_root_id = 734320 - parent_stmt = ( - sa.select(Concept.concept_id) - .join( - Concept_Ancestor, - Concept.concept_id == Concept_Ancestor.descendant_concept_id, - ) - .where(Concept_Ancestor.ancestor_concept_id == stage_root_id) - .where(Concept_Ancestor.max_levels_of_separation == 1) - .where( - sa.or_( - Concept.concept_name.contains("T"), - Concept.concept_name.contains("N"), - Concept.concept_name.contains("M"), - Concept.concept_name.contains("Stage"), - ) - ) - .order_by(Concept.concept_id) - ) - parent_ids = tuple(int(value) for value in session.scalars(parent_stmt)) - - if parent_ids: - descendant_stmt = ( - sa.select(Concept.concept_id) - .join( - Concept_Ancestor, - Concept.concept_id == Concept_Ancestor.descendant_concept_id, - ) - .where(Concept_Ancestor.ancestor_concept_id.in_(parent_ids)) - 
.where(Concept.concept_code.ilike("%8th%")) - .where(~Concept.concept_code.ilike("%yp%")) - .order_by(Concept.concept_id) - ) - stage_ids = tuple(int(value) for value in session.scalars(descendant_stmt)) - if stage_ids: - return stage_ids - - fallback = _concept_ids(session, domain_id="Measurement", standard_only=True, limit=12) - if fallback: - return fallback - - return _concept_ids(session, domain_id="Condition", standard_only=True, limit=12) - - -def _collect_fixture_concepts(session: Session) -> FixtureConcepts: - genders = _concept_ids(session, domain_id="Gender", standard_only=True, limit=4) - ethnicities = _concept_ids(session, domain_id="Ethnicity", standard_only=True, limit=4) - races = _concept_ids(session, domain_id="Race", standard_only=True, limit=6) - visit_concepts = _concept_ids(session, domain_id="Visit", standard_only=True, limit=8) - location_concepts = _concept_ids( - session, - concept_class_id="Location", - standard_only=True, - limit=8, - ) - provider_specialties = _concept_ids(session, domain_id="Provider", standard_only=True, limit=8) - type_concepts = _concept_ids(session, domain_id="Type Concept", standard_only=True, limit=12) - condition_concepts = _concept_ids( - session, - domain_id="Condition", - vocabulary_id="ICDO3", - standard_only=True, - limit=24, - ) or _concept_ids(session, domain_id="Condition", standard_only=True, limit=24) - stage_concepts = _stage_concept_ids(session) - - if not all((genders, ethnicities, races, visit_concepts, type_concepts, condition_concepts)): - raise RuntimeError("Fixture vocabulary does not contain the minimum concepts required for dummy clinical data.") - - return FixtureConcepts( - genders=genders, - ethnicities=ethnicities, - races=races, - visit_concepts=visit_concepts, - location_concepts=location_concepts, - provider_specialties=provider_specialties, - type_concepts=type_concepts, - condition_concepts=condition_concepts, - stage_concepts=stage_concepts, - episode_concept_id=_single_concept_id( 
- session, - domain_id="Episode", - concept_name="Disease Episode", - ), - condition_event_field_concept_id=_single_concept_id( - session, - domain_id="Metadata", - concept_name="condition_occurrence.condition_occurrence_id", - ), - measurement_event_field_concept_id=_single_concept_id( - session, - domain_id="Metadata", - concept_name="measurement.measurement_id", - ), - ) - - -def _pick(values: tuple[int, ...], rng: Random, index: int) -> int: - if not values: - raise RuntimeError("Expected at least one fixture concept value.") - return values[(index + rng.randint(0, len(values) - 1)) % len(values)] - - -def _seed_dummy_clinical_data( - session: Session, - *, - concepts: FixtureConcepts, - rng: Random, - person_count: int, -) -> None: - locations: list[Location] = [] - care_sites: list[Care_Site] = [] - providers: list[Provider] = [] - - for index in range(1, 7): - country_concept_id = ( - _pick(concepts.location_concepts, rng, index) - if concepts.location_concepts - else None - ) - locations.append( - Location( - location_id=index, - city=f"Fixture City {index}", - state="NS", - zip=f"20{index:02d}", - country_concept_id=country_concept_id, - location_source_value=f"fixture-location-{index}", - ) - ) - - for index in range(1, 9): - location = locations[(index - 1) % len(locations)] - care_sites.append( - Care_Site( - care_site_id=index, - care_site_name=f"Fixture Care Site {index}", - location_id=location.location_id, - place_of_service_concept_id=_pick(concepts.visit_concepts, rng, index), - care_site_source_value=f"fixture-care-site-{index}", - ) - ) - - for index in range(1, 13): - care_site = care_sites[(index - 1) % len(care_sites)] - specialty = ( - _pick(concepts.provider_specialties, rng, index) - if concepts.provider_specialties - else None - ) - providers.append( - Provider( - provider_id=index, - provider_name=f"Fixture Provider {index}", - care_site_id=care_site.care_site_id, - specialty_concept_id=specialty, - 
gender_concept_id=_pick(concepts.genders, rng, index), - provider_source_value=f"fixture-provider-{index}", - ) - ) - - session.add_all(locations) - session.add_all(care_sites) - session.add_all(providers) - session.flush() - - people: list[Person] = [] - visits: list[Visit_Occurrence] = [] - observation_periods: list[Observation_Period] = [] - deaths: list[Death] = [] - conditions: list[Condition_Occurrence] = [] - measurements: list[Measurement] = [] - episodes: list[Episode] = [] - episode_events: list[Episode_Event] = [] - - visit_id = 1 - observation_period_id = 1 - condition_id = 1 - measurement_id = 1 - episode_id = 1 - base_date = date(2020, 1, 1) - - for person_id in range(1, person_count + 1): - location = locations[(person_id - 1) % len(locations)] - care_site = care_sites[(person_id - 1) % len(care_sites)] - provider = providers[(person_id - 1) % len(providers)] - - person = Person( - person_id=person_id, - year_of_birth=1950 + (person_id % 55), - month_of_birth=(person_id % 12) + 1, - day_of_birth=(person_id % 28) + 1, - gender_concept_id=_pick(concepts.genders, rng, person_id), - race_concept_id=_pick(concepts.races, rng, person_id), - ethnicity_concept_id=_pick(concepts.ethnicities, rng, person_id), - location_id=location.location_id, - provider_id=provider.provider_id, - care_site_id=care_site.care_site_id, - person_source_value=f"fixture-person-{person_id}", - ) - people.append(person) - - visit_count = 1 + (person_id % 3) - person_visits: list[Visit_Occurrence] = [] - for visit_index in range(visit_count): - visit_date = base_date + timedelta(days=(person_id * 9) + (visit_index * 14)) - person_visits.append( - Visit_Occurrence( - visit_occurrence_id=visit_id, - person_id=person_id, - visit_concept_id=_pick(concepts.visit_concepts, rng, visit_id), - visit_start_date=visit_date, - visit_end_date=visit_date + timedelta(days=1), - visit_type_concept_id=_pick(concepts.type_concepts, rng, visit_id), - provider_id=provider.provider_id, - 
care_site_id=care_site.care_site_id, - visit_source_value=f"fixture-visit-{visit_id}", - ) - ) - visit_id += 1 - - visits.extend(person_visits) - - first_visit_date = person_visits[0].visit_start_date - last_visit_date = person_visits[-1].visit_end_date - death_date = None - if person_id % 8 == 0: - death_date = last_visit_date + timedelta(days=30 + person_id) - deaths.append( - Death( - person_id=person_id, - death_date=death_date, - death_type_concept_id=_pick(concepts.type_concepts, rng, person_id), - ) - ) - - observation_periods.append( - Observation_Period( - observation_period_id=observation_period_id, - person_id=person_id, - observation_period_start_date=first_visit_date, - observation_period_end_date=death_date or last_visit_date, - period_type_concept_id=_pick(concepts.type_concepts, rng, observation_period_id), - ) - ) - observation_period_id += 1 - - primary_visit = person_visits[0] - condition = Condition_Occurrence( - condition_occurrence_id=condition_id, - person_id=person_id, - condition_concept_id=_pick(concepts.condition_concepts, rng, condition_id), - condition_start_date=primary_visit.visit_start_date, - condition_end_date=primary_visit.visit_end_date + timedelta(days=28), - condition_type_concept_id=_pick(concepts.type_concepts, rng, condition_id), - visit_occurrence_id=primary_visit.visit_occurrence_id, - provider_id=provider.provider_id, - condition_source_value=f"fixture-condition-{condition_id}", - ) - conditions.append(condition) - - episode = Episode( - episode_id=episode_id, - person_id=person_id, - episode_parent_id=(episode_id - 1) if person_id % 6 == 0 else None, - episode_concept_id=concepts.episode_concept_id, - episode_object_concept_id=condition.condition_concept_id, - episode_start_date=condition.condition_start_date, - episode_end_date=death_date or (condition.condition_end_date or condition.condition_start_date), - episode_type_concept_id=_pick(concepts.type_concepts, rng, episode_id), - 
episode_source_value=f"fixture-episode-{episode_id}", - ) - episodes.append(episode) - - for offset in range(3): - stage_concept_id = _pick(concepts.stage_concepts, rng, measurement_id + offset) - measurement = Measurement( - measurement_id=measurement_id, - person_id=person_id, - measurement_concept_id=stage_concept_id, - measurement_date=condition.condition_start_date + timedelta(days=offset * 7), - measurement_type_concept_id=_pick(concepts.type_concepts, rng, measurement_id), - measurement_event_id=condition.condition_occurrence_id, - meas_event_field_concept_id=concepts.condition_event_field_concept_id, - visit_occurrence_id=primary_visit.visit_occurrence_id, - provider_id=provider.provider_id, - value_as_number=float(offset + 1), - measurement_source_value=f"fixture-measurement-{measurement_id}", - ) - measurements.append(measurement) - episode_events.append( - Episode_Event( - episode_id=episode.episode_id, - event_id=measurement.measurement_id, - episode_event_field_concept_id=concepts.measurement_event_field_concept_id, - ) - ) - measurement_id += 1 - - episode_events.append( - Episode_Event( - episode_id=episode.episode_id, - event_id=condition.condition_occurrence_id, - episode_event_field_concept_id=concepts.condition_event_field_concept_id, - ) - ) - - condition_id += 1 - episode_id += 1 - - session.add_all(people) - session.add_all(visits) - session.add_all(observation_periods) - session.add_all(deaths) - session.add_all(conditions) - session.add_all(episodes) - session.add_all(measurements) - session.add_all(episode_events) - session.commit() - - -def _export_table_csvs(engine: sa.Engine, output_dir: Path) -> None: - with engine.connect() as connection: - for model in CLINICAL_EXPORT_MODELS: - table = model.__table__ - stmt = sa.select(table) - primary_keys = list(table.primary_key.columns) - if primary_keys: - stmt = stmt.order_by(*primary_keys) - frame = pd.read_sql_query(stmt, connection) - output_path = output_dir / f"{table.name.upper()}.csv" - 
frame.to_csv(output_path, index=False) - - -def main() -> None: - args = parse_args() - athena_source = args.athena_source.expanduser().resolve() - db_path = args.db_path.expanduser().resolve() - clinical_csv_dir = args.clinical_csv_dir.expanduser().resolve() - rng = Random(args.seed) - - clinical_csv_dir.mkdir(parents=True, exist_ok=True) - - if not args.force_rebuild and _fixture_db_has_people(db_path): - print(f"Using existing SQLite fixture at {db_path}") - missing_exports = _missing_clinical_csv_exports(clinical_csv_dir) - if not missing_exports: - print(f"Clinical CSV fixtures already present in {clinical_csv_dir}") - return - - print( - "Existing SQLite fixture is valid but some clinical CSV exports are missing: " - + ", ".join(missing_exports) - ) - engine = sa.create_engine(f"sqlite:///{db_path}", future=True, echo=False) - try: - for csv_path in clinical_csv_dir.glob("*.csv"): - csv_path.unlink() - _export_table_csvs(engine, clinical_csv_dir) - print(f"Exported clinical CSV fixtures to {clinical_csv_dir}") - return - finally: - engine.dispose() - - _reset_outputs(db_path, clinical_csv_dir) - - engine = sa.create_engine(f"sqlite:///{db_path}", future=True, echo=False) - try: - print(f"Loading Athena vocabulary from {athena_source}") - vocab_report = load_vocab_source( - engine, - source_path=athena_source, - merge_strategy="upsert", - ) - loaded_count = sum(1 for result in vocab_report.results if result.status == "loaded") - print(f"Loaded {loaded_count} vocabulary table(s)") - - creation_results = create_missing_tables(engine, vocabulary_included=False) - created_count = sum(1 for result in creation_results if result.status == "created") - print(f"Created {created_count} non-vocabulary table(s)") - - SessionLocal = sessionmaker(bind=engine, future=True, expire_on_commit=False) - with SessionLocal() as session: - concepts = _collect_fixture_concepts(session) - _seed_dummy_clinical_data( - session, - concepts=concepts, - rng=rng, - 
person_count=args.person_count, - ) - - _export_table_csvs(engine, clinical_csv_dir) - print(f"Wrote SQLite fixture to {db_path}") - print(f"Exported clinical CSV fixtures to {clinical_csv_dir}") - finally: - engine.dispose() - - -if __name__ == "__main__": - main() diff --git a/tests/test_config_driver.py b/tests/test_config_driver.py new file mode 100644 index 0000000..7d3522b --- /dev/null +++ b/tests/test_config_driver.py @@ -0,0 +1,108 @@ +""" +Tests for omop_alchemy.config driver-selection logic. + +These tests do not require a database; they exercise the driver-mapping +constants, _missing_driver_message(), and create_engine_with_dependencies() +using mock exceptions to simulate missing packages. +""" +import pytest + +from omop_alchemy.config import ( + POSTGRES_DRIVER_MODULES, + _missing_driver_message, + create_engine_with_dependencies, +) + + +def _make_module_not_found(module_name: str) -> ModuleNotFoundError: + exc = ModuleNotFoundError(f"No module named '{module_name}'") + exc.name = module_name + return exc + + +# --------------------------------------------------------------------------- +# Driver-mapping constants +# --------------------------------------------------------------------------- + +def test_bare_postgresql_url_aliases_to_psycopg(): + """Bare postgresql:// now resolves to psycopg, not psycopg2.""" + assert POSTGRES_DRIVER_MODULES["postgresql"] == "psycopg" + + +def test_psycopg_driver_mapping(): + assert POSTGRES_DRIVER_MODULES["postgresql+psycopg"] == "psycopg" + + +def test_psycopg2_driver_mapping_retained_for_error_quality(): + """psycopg2 entry is kept so users get a clear error message.""" + assert POSTGRES_DRIVER_MODULES["postgresql+psycopg2"] == "psycopg2" + + +# --------------------------------------------------------------------------- +# _missing_driver_message() +# --------------------------------------------------------------------------- + +def test_missing_driver_message_for_psycopg(): + exc = 
_make_module_not_found("psycopg") + msg = _missing_driver_message("postgresql+psycopg://host/db", exc) + + assert msg is not None + assert "psycopg" in msg + assert "postgres" in msg.lower() + + +def test_missing_driver_message_for_bare_postgresql_url(): + """Bare postgresql:// is now aliased to psycopg; missing psycopg gives a helpful error.""" + exc = _make_module_not_found("psycopg") + msg = _missing_driver_message("postgresql://host/db", exc) + + assert msg is not None + assert "psycopg" in msg + + +def test_missing_driver_message_for_psycopg2(): + exc = _make_module_not_found("psycopg2") + msg = _missing_driver_message("postgresql+psycopg2://host/db", exc) + + assert msg is not None + assert "psycopg2" in msg + + +def test_missing_driver_message_returns_none_for_unrelated_module(): + """A ModuleNotFoundError for an unrelated package should not be intercepted.""" + exc = _make_module_not_found("pandas") + msg = _missing_driver_message("postgresql+psycopg://host/db", exc) + + assert msg is None + + +def test_missing_driver_message_returns_none_for_sqlite_url(): + exc = _make_module_not_found("psycopg") + msg = _missing_driver_message("sqlite:///test.db", exc) + + assert msg is None + + +# --------------------------------------------------------------------------- +# create_engine_with_dependencies() +# --------------------------------------------------------------------------- + +def test_sqlite_url_not_intercepted(): + """create_engine_with_dependencies should work for sqlite without wrapping errors.""" + engine = create_engine_with_dependencies("sqlite:///:memory:", future=True) + engine.dispose() + + +def test_create_engine_raises_runtime_for_missing_postgres_driver(monkeypatch): + """When psycopg is missing, create_engine_with_dependencies raises RuntimeError with install hint.""" + import sqlalchemy as sa + + exc = _make_module_not_found("psycopg") + + def raising_create_engine(url, **kwargs): + raise exc + + monkeypatch.setattr(sa, "create_engine", 
raising_create_engine) + + with pytest.raises(RuntimeError, match="psycopg"): + create_engine_with_dependencies("postgresql+psycopg://host/db") diff --git a/tests/test_load_vocab.py b/tests/test_load_vocab.py index 735c8a8..80b75bc 100644 --- a/tests/test_load_vocab.py +++ b/tests/test_load_vocab.py @@ -1,28 +1,17 @@ import pytest -from pathlib import Path from orm_loader.helpers import bootstrap import sqlalchemy as sa from sqlalchemy.orm import sessionmaker from omop_alchemy.cdm.model.vocabulary import ( Domain, - Vocabulary, - Concept_Class, Relationship, Concept, Concept_Ancestor, Concept_Relationship, ) - -ATHENA_LOAD_ORDER = [ - Domain, - Vocabulary, - Concept_Class, - Relationship, - Concept, - Concept_Ancestor, - Concept_Relationship, -] +from pathlib import Path +from tests.conftest import ATHENA_LOAD_ORDER, _ATHENA_FIXTURE_DATA, _write_fixture_csv @pytest.fixture(scope="session") @@ -58,25 +47,19 @@ def db_session(connection): @pytest.fixture(scope="session") -def athena_vocab(connection): +def athena_vocab(connection, tmp_path_factory): """ - Load a minimal, internally consistent Athena vocabulary - using the real ORM CSV loader. + Load the minimal Athena vocabulary fixture using the real ORM CSV loader. + + Writes in-memory fixture data to a temp directory so no static CSV files + on disk are required. 
""" + base_path: Path = tmp_path_factory.mktemp("athena_vocab") Session = sessionmaker(bind=connection, future=True) session = Session() - base_path = ( - Path(__file__).parent - / "fixtures" - / "athena_source" - ) - for model in ATHENA_LOAD_ORDER: - csv_path = base_path / f"{model.__tablename__}.csv" - if not csv_path.exists(): - raise RuntimeError(f"Missing vocab CSV: {csv_path}") - + csv_path = _write_fixture_csv(base_path, model.__tablename__, _ATHENA_FIXTURE_DATA[model.__tablename__]) model.load_csv(session, csv_path) session.commit() @@ -84,26 +67,22 @@ def athena_vocab(connection): yield + def test_concept_loaded(db_session, athena_vocab): - """Test concept loaded.""" - concept = db_session.get(Concept, 1) + """Test that vocabulary concepts load and are accessible by primary key.""" + # MALE (concept_id=8507) is a known row in the minimal fixture. + concept = db_session.get(Concept, 8507) assert concept is not None - assert concept.concept_name == "Domain" - assert concept.domain_id == "Metadata" + assert concept.concept_name == "MALE" + assert concept.domain_id == "Gender" + def test_concept_ancestor(db_session, athena_vocab): - """Test concept ancestor.""" - ancestors = ( - # running tests with metadata concepts so that they are definitely present - # assuming the logic to produce test db is stable - db_session.query(Concept_Ancestor) - .filter_by(descendant_concept_id=1147371) - .all() - ) - assert len(ancestors) == 2 - a = [a.ancestor_concept_id for a in ancestors] - assert 1147371 in a - assert 1147423 in a + """Test that the concept_ancestor table loads without error.""" + # Minimal fixtures have no ancestor rows; table must be accessible and empty. 
+ count = db_session.query(Concept_Ancestor).count() + assert count == 0 + def test_all_concepts_reference_valid_domain(db_session, athena_vocab): """Test all concepts reference valid domain.""" @@ -116,15 +95,17 @@ def test_all_concepts_reference_valid_domain(db_session, athena_vocab): assert invalid == 0 + def test_relationship_vocab_loaded(db_session, athena_vocab): """Test relationship vocab loaded.""" rel = ( db_session.query(Relationship) - .filter_by(relationship_id="Has type") + .filter_by(relationship_id="Is a") .one() ) - assert rel.reverse_relationship_id == "Type of" + assert rel.reverse_relationship_id == "Subsumes" + def test_expected_domains_exist(db_session, athena_vocab): """Test expected domains exist.""" @@ -134,31 +115,34 @@ def test_expected_domains_exist(db_session, athena_vocab): } assert "Condition" in domains - assert "Procedure" in domains - assert "Drug" in domains + assert "Gender" in domains + assert "Race" in domains + def test_domains_are_consistent(db_session, athena_vocab): - """Test domains are consistent.""" + """Test concepts reference domains that exist in the domain table.""" concepts = ( db_session.query(Concept) - .filter(Concept.domain_id.in_(["Condition", "Procedure"])) + .filter(Concept.domain_id.in_(["Condition", "Gender"])) .all() ) - assert concepts + assert concepts for c in concepts: - assert c.domain_id in {"Condition", "Procedure"} + assert c.domain_id in {"Condition", "Gender"} -def test_procedure_concepts_exist(db_session, athena_vocab): - """Test procedure concepts exist.""" + +def test_condition_concepts_exist(db_session, athena_vocab): + """Test condition concepts exist.""" assert ( db_session.query(Concept) - .filter(Concept.domain_id == "Procedure") + .filter(Concept.domain_id == "Condition") .count() > 0 ) + def test_relationships_reference_valid_concepts(db_session, athena_vocab): """Test relationships reference valid concepts.""" rels = db_session.query(Concept_Relationship).limit(50).all() diff --git 
a/tests/test_load_vocab_postgres.py b/tests/test_load_vocab_postgres.py new file mode 100644 index 0000000..d8a2f87 --- /dev/null +++ b/tests/test_load_vocab_postgres.py @@ -0,0 +1,246 @@ +""" +PostgreSQL integration tests for OMOP_Alchemy vocabulary loading. + +These tests require a running PostgreSQL container. Start one with: + docker compose -f tests/docker-compose.yaml up -d + +Then run: + pytest -m postgres +""" +from pathlib import Path + +import sqlalchemy as sa + +from omop_alchemy.cdm.model.vocabulary import Concept +from omop_alchemy.maintenance.load_vocab import ( + _load_vocab_model_csv, + load_vocab_source, +) +from tests.conftest import _ATHENA_FIXTURE_DATA, _write_fixture_csv + + +def _copy_fixture_source(base_dir: Path) -> Path: + """Write the shared in-memory Athena fixture set into an isolated per-test source dir.""" + source_path = base_dir / "athena_source" + source_path.mkdir(parents=True) + for table_name, data in _ATHENA_FIXTURE_DATA.items(): + _write_fixture_csv(source_path, table_name, data) + return source_path + + +def _make_concept_source( + base_dir: Path, + *, + concept_id: int, + concept_name: str, +) -> Path: + """ + Build a minimal vocabulary source where CONCEPT.csv contains exactly one + test concept with a Gender domain reference, and all other required tables + are written from the shared in-memory fixture. 
+ """ + source_path = base_dir / "athena_source" + source_path.mkdir(parents=True) + + for table_name, data in _ATHENA_FIXTURE_DATA.items(): + if table_name != "concept": + _write_fixture_csv(source_path, table_name, data) + + concept_cols = list(_ATHENA_FIXTURE_DATA["concept"].keys()) + concept_row = [concept_id, concept_name, "Gender", "Gender", "Gender", "S", "TEST", "19700101", "20991231", None] + _write_fixture_csv(source_path, "concept", {col: (val,) for col, val in zip(concept_cols, concept_row)}) + return source_path + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +def test_end_to_end_vocab_load_on_postgres(pg_session, pg_engine, tmp_path): + """load_vocab_source() completes end-to-end on real Postgres via orm-loader>=0.4.0.""" + source_path = _copy_fixture_source(tmp_path) + report = load_vocab_source(pg_engine, source_path=source_path) + + assert report.merge_strategy == "replace" + assert all(r.status == "loaded" for r in report.results if r.required) + assert all(r.status == "skipped" for r in report.results if not r.required) + + count = pg_session.execute(sa.text("SELECT COUNT(*) FROM concept")).scalar() + assert count == 7 + + + +def test_quote_mode_auto_regression_on_postgres(pg_session, pg_engine, tmp_path): + """ + quote_mode='auto' strips RFC-4180 double-quotes via PostgreSQL COPY. + + Under the old quote_mode='literal' a concept_name of exactly 255 chars + wrapped in double-quotes would be stored as 257 chars and violate the + VARCHAR(255) constraint. This test would fail under literal mode. + """ + source_path = tmp_path / "athena_source" + source_path.mkdir() + + long_name = "A" * 255 # exactly at VARCHAR(255) limit when unquoted + + # All tables except concept get the standard fixture data. 
+ for table_name, data in _ATHENA_FIXTURE_DATA.items(): + if table_name != "concept": + _write_fixture_csv(source_path, table_name, data) + + # Concept gets a single row whose name is wrapped in RFC-4180 double-quotes + # so the raw file value is 257 chars. quote_mode='auto' must strip them. + concept_cols = list(_ATHENA_FIXTURE_DATA["concept"].keys()) + concept_row = [1, f'"{long_name}"', "Gender", "Gender", "Gender", "S", "TEST", "19700101", "20991231", None] + _write_fixture_csv(source_path, "concept", {col: (val,) for col, val in zip(concept_cols, concept_row)}) + + # Should not raise: literal mode would produce a 257-char value and fail. + load_vocab_source(pg_engine, source_path=source_path) + + concept_name = pg_session.execute( + sa.text("SELECT concept_name FROM concept WHERE concept_id = 1") + ).scalar() + assert concept_name is not None + assert len(concept_name) == 255, ( + f"Expected 255-char name; got {len(concept_name)}: {concept_name!r}" + ) + assert not concept_name.startswith('"'), "Surrounding quotes were not stripped" + + + +def test_load_vocab_model_csv_on_postgres(pg_session, tmp_path): + """ + _load_vocab_model_csv loads data correctly on a real PostgreSQL session. + + orm-loader>=0.4.0 handles staging-table creation internally, so we test + the end-to-end path: CSV → staging → concept table on real Postgres. 
+ """ + source_path = _copy_fixture_source(tmp_path) + csv_path = source_path / "CONCEPT.csv" + + row_count = _load_vocab_model_csv( + pg_session, + model=Concept, + csv_path=csv_path, + merge_strategy="replace", + ) + pg_session.commit() + + assert row_count == 7 + count = pg_session.execute(sa.text("SELECT COUNT(*) FROM concept")).scalar() + assert count == 7 + + + +def test_replace_strategy_overwrites_existing_rows(pg_session, pg_engine, tmp_path): + """merge_strategy='replace' fully replaces rows with the same PKs on second load.""" + concept_id = 99999 + source_v1 = _make_concept_source( + tmp_path / "v1", concept_id=concept_id, concept_name="name_v1" + ) + source_v2 = _make_concept_source( + tmp_path / "v2", concept_id=concept_id, concept_name="name_v2" + ) + + load_vocab_source(pg_engine, source_path=source_v1, merge_strategy="replace") + load_vocab_source(pg_engine, source_path=source_v2, merge_strategy="replace") + + name = pg_session.execute( + sa.text("SELECT concept_name FROM concept WHERE concept_id = :cid"), + {"cid": concept_id}, + ).scalar() + assert name == "name_v2", f"Expected 'name_v2' after replace, got {name!r}" + + + +def test_upsert_strategy_is_non_destructive(pg_session, pg_engine, tmp_path): + """merge_strategy='upsert' preserves existing rows on second load with same PKs.""" + concept_id = 99998 + source_v1 = _make_concept_source( + tmp_path / "v1", concept_id=concept_id, concept_name="name_v1" + ) + source_v2 = _make_concept_source( + tmp_path / "v2", concept_id=concept_id, concept_name="name_v2" + ) + + load_vocab_source(pg_engine, source_path=source_v1, merge_strategy="upsert") + load_vocab_source(pg_engine, source_path=source_v2, merge_strategy="upsert") + + name = pg_session.execute( + sa.text("SELECT concept_name FROM concept WHERE concept_id = :cid"), + {"cid": concept_id}, + ).scalar() + assert name == "name_v1", ( + f"Expected 'name_v1' after upsert (existing row preserved), got {name!r}" + ) + + + +def 
test_chunksize_forwarded_to_loader(pg_session, pg_engine, monkeypatch, tmp_path): + """chunksize is forwarded from load_vocab_source through to _load_vocab_model_csv.""" + from omop_alchemy.maintenance import load_vocab as _lv_module + + source_path = _copy_fixture_source(tmp_path) + received_chunksizes: list[int | None] = [] + original = _lv_module._load_vocab_model_csv + + def tracking_load(session, *, model, csv_path, merge_strategy, quote_mode="auto", chunksize=None): + received_chunksizes.append(chunksize) + return original( + session, + model=model, + csv_path=csv_path, + merge_strategy=merge_strategy, + quote_mode=quote_mode, + chunksize=chunksize, + ) + + monkeypatch.setattr(_lv_module, "_load_vocab_model_csv", tracking_load) + + load_vocab_source(pg_engine, source_path=source_path, chunksize=500) + + assert received_chunksizes, "Expected at least one table to be loaded" + assert all(c == 500 for c in received_chunksizes), ( + f"Expected chunksize=500 for all tables, got: {received_chunksizes}" + ) + + + +def test_db_schema_search_path_on_postgres(pg_engine, tmp_path): + """ + load_vocab_source with db_schema creates vocabulary tables in the requested + PostgreSQL schema and loads data into them correctly. 
+ """ + schema = 'VocabTest' + source_path = _copy_fixture_source(tmp_path) + quoted_schema = '"' + schema.replace('"', '""') + '"' + + with pg_engine.connect() as conn: + conn.execute(sa.text(f"DROP SCHEMA IF EXISTS {quoted_schema} CASCADE")) + conn.execute(sa.text(f"CREATE SCHEMA {quoted_schema}")) + conn.commit() + + try: + report = load_vocab_source( + pg_engine, + source_path=source_path, + db_schema=schema, + ) + + assert any(r.status == "loaded" for r in report.results if r.required) + + inspector = sa.inspect(pg_engine) + assert inspector.has_table("concept", schema=schema), ( + f"Expected concept table in schema '{schema}'" + ) + + with pg_engine.connect() as conn: + count = conn.execute( + sa.text(f"SELECT COUNT(*) FROM {quoted_schema}.concept") + ).scalar() + assert count == 7 + finally: + with pg_engine.connect() as conn: + conn.execute(sa.text(f"DROP SCHEMA IF EXISTS {quoted_schema} CASCADE")) + conn.commit() diff --git a/tests/test_load_vocab_source.py b/tests/test_load_vocab_source.py index a6fa1bc..fbb8a59 100644 --- a/tests/test_load_vocab_source.py +++ b/tests/test_load_vocab_source.py @@ -10,6 +10,7 @@ from omop_alchemy.maintenance.load_vocab import ( OPTIONAL_VOCAB_MODELS, REQUIRED_VOCAB_MODELS, + MergeStrategy, _load_vocab_model_csv, load_vocab_source, ) @@ -67,7 +68,8 @@ def fake_load_vocab_model_csv( model, csv_path, merge_strategy, - quote_mode="csv", + quote_mode="auto", + chunksize=None, ) -> int: loaded_tables.append((model.__tablename__, merge_strategy, quote_mode, csv_path)) return 1 @@ -88,7 +90,7 @@ def fake_load_vocab_model_csv( assert all(result_by_name[model.__tablename__].status == "loaded" for model in REQUIRED_VOCAB_MODELS) assert all(result_by_name[model.__tablename__].status == "skipped" for model in OPTIONAL_VOCAB_MODELS) assert all(merge_strategy == "replace" for _, merge_strategy, _, _ in loaded_tables) - assert all(quote_mode == "literal" for _, _, quote_mode, _ in loaded_tables) + assert all(quote_mode == "auto" for _, _, 
quote_mode, _ in loaded_tables) assert {table_name for table_name, _, _, _ in loaded_tables} == { model.__tablename__ for model in REQUIRED_VOCAB_MODELS @@ -102,10 +104,15 @@ def test_load_vocab_source_requires_full_required_athena_fixture(tmp_path): """Test load vocab source requires full required athena fixture.""" engine = sa.create_engine(f"sqlite:///{tmp_path / 'load_vocab_source_missing_required.db'}", future=True) + # Build a source with only a subset of required models to trigger the missing-files error. + partial_source = tmp_path / "partial_athena" + partial_source.mkdir() + _write_athena_csv(partial_source, REQUIRED_VOCAB_MODELS[0].__tablename__) + with pytest.raises(RuntimeError) as exc_info: load_vocab_source( engine, - source_path=_athena_source_path(), + source_path=partial_source, ) assert "Missing required Athena vocabulary CSV files" in str(exc_info.value) @@ -162,7 +169,8 @@ def fake_load_vocab_source( source_path: str | Path, db_schema: str | None = None, dry_run: bool = False, - merge_strategy: str = "upsert", + merge_strategy: MergeStrategy = "replace", + chunksize: int | None = None, progress_callback=None, ): from omop_alchemy.maintenance.load_vocab import VocabularyLoadReport, VocabularyLoadResult @@ -233,7 +241,7 @@ def fake_load_vocab_source( assert result.exit_code == 0 assert calls["engine"] == "ENGINE" assert calls["source_path"] == expected_source_path - assert calls["merge_strategy"] == "upsert" + assert calls["merge_strategy"] == "replace" assert "load-vocab-source" in result.stdout assert "concept" in result.stdout @@ -302,7 +310,8 @@ def fake_load_vocab_model_csv( model, csv_path, merge_strategy, - quote_mode="csv", + quote_mode="auto", + chunksize=None, ) -> int: loaded_order.append(model.__tablename__) return 1 @@ -333,7 +342,8 @@ def fake_load_vocab_model_csv( model, csv_path, merge_strategy, - quote_mode="csv", + quote_mode="auto", + chunksize=None, ) -> int: return 1 @@ -360,7 +370,7 @@ def 
test_load_vocab_source_wraps_failed_table_load(monkeypatch, tmp_path): engine = sa.create_engine(f"sqlite:///{tmp_path / 'load_vocab_source_error.db'}", future=True) source_path = _build_required_athena_source(tmp_path) - def fake_load_vocab_model_csv(session, *, model, csv_path, merge_strategy, quote_mode="csv"): + def fake_load_vocab_model_csv(session, *, model, csv_path, merge_strategy, quote_mode="auto", chunksize=None): if model.__tablename__ == "domain": raise sa.exc.ProgrammingError( "COPY domain FROM STDIN", @@ -470,3 +480,57 @@ def fail_load_vocab_source(*args, **kwargs): assert result.exit_code == 1 assert "Database operation failed: ProgrammingError." in result.stdout assert "value too long for type character varying(255)" in result.stdout + + +def test_load_vocab_source_uses_auto_not_literal_quote_mode(monkeypatch, tmp_path): + """Regression: Athena load must use auto quote mode so that quoted concept_name + values are not padded with surrounding double-quote characters, which would + cause 'value too long for type character varying(255)' on CONCEPT.csv.""" + engine = sa.create_engine(f"sqlite:///{tmp_path / 'quote_mode_regression.db'}", future=True) + + # Build a tab-delimited CSV where concept_name is exactly 255 chars when + # unquoted, but would be 257 chars if the surrounding CSV quotes were kept + # as literal characters (the literal-mode bug). 
+ source_path = tmp_path / "athena_source" + source_path.mkdir() + + long_name = "A" * 255 + for model in REQUIRED_VOCAB_MODELS: + table_name = model.__tablename__.upper() + csv_path = source_path / f"{table_name}.csv" + if table_name == "CONCEPT": + csv_path.write_text( + "concept_id\tconcept_name\tdomain_id\tvocabulary_id\t" + "concept_class_id\tstandard_concept\tconcept_code\t" + "valid_start_date\tvalid_end_date\tinvalid_reason\n" + f'4715176\t"{long_name}"\t...\t...\t...\t\t...\t20000101\t20991231\t\n', + encoding="utf-8", + ) + else: + csv_path.write_text("stub\n", encoding="utf-8") + + received_quote_modes: list[str] = [] + + def fake_load_vocab_model_csv( + session, + *, + model, + csv_path, + merge_strategy, + quote_mode="auto", + chunksize=None, + ) -> int: + received_quote_modes.append(quote_mode) + return 1 + + monkeypatch.setattr( + "omop_alchemy.maintenance.load_vocab._load_vocab_model_csv", + fake_load_vocab_model_csv, + ) + + load_vocab_source(engine, source_path=source_path) + + assert all(mode == "auto" for mode in received_quote_modes), ( + f"Expected all tables to use quote_mode='auto', got: {received_quote_modes}" + ) + assert "literal" not in received_quote_modes diff --git a/uv.lock b/uv.lock index 861910c..1f9887b 100644 --- a/uv.lock +++ b/uv.lock @@ -862,7 +862,7 @@ wheels = [ [[package]] name = "omop-alchemy" -version = "0.6.2" +version = "0.6.3" source = { editable = "." 
} dependencies = [ { name = "orm-loader" }, @@ -893,7 +893,6 @@ docs = [ ] postgres = [ { name = "psycopg", extra = ["binary"] }, - { name = "psycopg2-binary" }, ] [package.metadata] @@ -905,10 +904,9 @@ requires-dist = [ { name = "mkdocstrings-python", marker = "extra == 'dev'", specifier = ">=2.0.1" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.8" }, { name = "myst-parser", marker = "extra == 'docs'" }, - { name = "orm-loader", specifier = ">=0.3.27,<4.0" }, + { name = "orm-loader", specifier = ">=0.4.1" }, { name = "pandas", specifier = ">=2.0" }, { name = "psycopg", extras = ["binary"], marker = "extra == 'postgres'", specifier = ">=3.2" }, - { name = "psycopg2-binary", marker = "extra == 'postgres'", specifier = ">=2.9" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.3" }, { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=4.0" }, { name = "python-dotenv", specifier = ">=1.2.2" }, @@ -924,7 +922,7 @@ provides-extras = ["postgres", "dev", "docs"] [[package]] name = "orm-loader" -version = "0.3.27" +version = "0.4.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "chardet" }, @@ -932,9 +930,9 @@ dependencies = [ { name = "pyarrow" }, { name = "sqlalchemy" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ca/72/f5ae8aafb2868301da88c71f6ee095cac14bf4405648c935b533cf1550b6/orm_loader-0.3.27.tar.gz", hash = "sha256:51de60177bb45572329899d883414ba47ed42034a782d49bf05d0dc5d1e9f58c", size = 33014, upload-time = "2026-05-06T07:04:59.088Z" } +sdist = { url = "https://files.pythonhosted.org/packages/2a/6a/007e6eef497753702d5a53444842ee6cc38bcbf7c5c422857c0671bfc727/orm_loader-0.4.1.tar.gz", hash = "sha256:434b6c3436c05bf3ad43774b46476e7f324db05a18bf34ad9f9692e4f02bcb7e", size = 39449, upload-time = "2026-05-19T12:56:29.572Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/83/f8/8f16b0123ea3438a084125d7450ef1250e4780edf0934f79e14a924578bc/orm_loader-0.3.27-py3-none-any.whl", hash = "sha256:7e2bbd7f6935aff1710a99d9d8f550d691307c446e75c04cb59cd67f1e64b16d", size = 44815, upload-time = "2026-05-06T07:04:57.509Z" }, + { url = "https://files.pythonhosted.org/packages/98/d7/37f82f8748a91fdb14d41f314ddc829806f596dec409196c037e59d3a5a7/orm_loader-0.4.1-py3-none-any.whl", hash = "sha256:03131b5d4b7b787ea446e110684b7256b5690313503626939b83984953174825", size = 54472, upload-time = "2026-05-19T12:56:27.959Z" }, ] [[package]] @@ -1120,47 +1118,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/98/5a/291d89f44d3820fffb7a04ebc8f3ef5dda4f542f44a5daea0c55a84abf45/psycopg_binary-3.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:165f22ab5a9513a3d7425ffb7fcc7955ed8ccaeef6d37e369d6cc1dff1582383", size = 3652796, upload-time = "2026-02-18T16:52:14.02Z" }, ] -[[package]] -name = "psycopg2-binary" -version = "2.9.11" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ac/6c/8767aaa597ba424643dc87348c6f1754dd9f48e80fdc1b9f7ca5c3a7c213/psycopg2-binary-2.9.11.tar.gz", hash = "sha256:b6aed9e096bf63f9e75edf2581aa9a7e7186d97ab5c177aa6c87797cd591236c", size = 379620, upload-time = "2025-10-10T11:14:48.041Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d8/91/f870a02f51be4a65987b45a7de4c2e1897dd0d01051e2b559a38fa634e3e/psycopg2_binary-2.9.11-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:be9b840ac0525a283a96b556616f5b4820e0526addb8dcf6525a0fa162730be4", size = 3756603, upload-time = "2025-10-10T11:11:52.213Z" }, - { url = "https://files.pythonhosted.org/packages/27/fa/cae40e06849b6c9a95eb5c04d419942f00d9eaac8d81626107461e268821/psycopg2_binary-2.9.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f090b7ddd13ca842ebfe301cd587a76a4cf0913b1e429eb92c1be5dbeb1a19bc", size = 3864509, upload-time = "2025-10-10T11:11:56.452Z" }, - { 
url = "https://files.pythonhosted.org/packages/2d/75/364847b879eb630b3ac8293798e380e441a957c53657995053c5ec39a316/psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ab8905b5dcb05bf3fb22e0cf90e10f469563486ffb6a96569e51f897c750a76a", size = 4411159, upload-time = "2025-10-10T11:12:00.49Z" }, - { url = "https://files.pythonhosted.org/packages/6f/a0/567f7ea38b6e1c62aafd58375665a547c00c608a471620c0edc364733e13/psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:bf940cd7e7fec19181fdbc29d76911741153d51cab52e5c21165f3262125685e", size = 4468234, upload-time = "2025-10-10T11:12:04.892Z" }, - { url = "https://files.pythonhosted.org/packages/30/da/4e42788fb811bbbfd7b7f045570c062f49e350e1d1f3df056c3fb5763353/psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fa0f693d3c68ae925966f0b14b8edda71696608039f4ed61b1fe9ffa468d16db", size = 4166236, upload-time = "2025-10-10T11:12:11.674Z" }, - { url = "https://files.pythonhosted.org/packages/3c/94/c1777c355bc560992af848d98216148be5f1be001af06e06fc49cbded578/psycopg2_binary-2.9.11-cp312-cp312-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a1cf393f1cdaf6a9b57c0a719a1068ba1069f022a59b8b1fe44b006745b59757", size = 3983083, upload-time = "2025-10-30T02:55:15.73Z" }, - { url = "https://files.pythonhosted.org/packages/bd/42/c9a21edf0e3daa7825ed04a4a8588686c6c14904344344a039556d78aa58/psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ef7a6beb4beaa62f88592ccc65df20328029d721db309cb3250b0aae0fa146c3", size = 3652281, upload-time = "2025-10-10T11:12:17.713Z" }, - { url = "https://files.pythonhosted.org/packages/12/22/dedfbcfa97917982301496b6b5e5e6c5531d1f35dd2b488b08d1ebc52482/psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:31b32c457a6025e74d233957cc9736742ac5a6cb196c6b68499f6bb51390bd6a", size = 3298010, upload-time = 
"2025-10-10T11:12:22.671Z" }, - { url = "https://files.pythonhosted.org/packages/66/ea/d3390e6696276078bd01b2ece417deac954dfdd552d2edc3d03204416c0c/psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:edcb3aeb11cb4bf13a2af3c53a15b3d612edeb6409047ea0b5d6a21a9d744b34", size = 3044641, upload-time = "2025-10-30T02:55:19.929Z" }, - { url = "https://files.pythonhosted.org/packages/12/9a/0402ded6cbd321da0c0ba7d34dc12b29b14f5764c2fc10750daa38e825fc/psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:62b6d93d7c0b61a1dd6197d208ab613eb7dcfdcca0a49c42ceb082257991de9d", size = 3347940, upload-time = "2025-10-10T11:12:26.529Z" }, - { url = "https://files.pythonhosted.org/packages/b1/d2/99b55e85832ccde77b211738ff3925a5d73ad183c0b37bcbbe5a8ff04978/psycopg2_binary-2.9.11-cp312-cp312-win_amd64.whl", hash = "sha256:b33fabeb1fde21180479b2d4667e994de7bbf0eec22832ba5d9b5e4cf65b6c6d", size = 2714147, upload-time = "2025-10-10T11:12:29.535Z" }, - { url = "https://files.pythonhosted.org/packages/ff/a8/a2709681b3ac11b0b1786def10006b8995125ba268c9a54bea6f5ae8bd3e/psycopg2_binary-2.9.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b8fb3db325435d34235b044b199e56cdf9ff41223a4b9752e8576465170bb38c", size = 3756572, upload-time = "2025-10-10T11:12:32.873Z" }, - { url = "https://files.pythonhosted.org/packages/62/e1/c2b38d256d0dafd32713e9f31982a5b028f4a3651f446be70785f484f472/psycopg2_binary-2.9.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:366df99e710a2acd90efed3764bb1e28df6c675d33a7fb40df9b7281694432ee", size = 3864529, upload-time = "2025-10-10T11:12:36.791Z" }, - { url = "https://files.pythonhosted.org/packages/11/32/b2ffe8f3853c181e88f0a157c5fb4e383102238d73c52ac6d93a5c8bffe6/psycopg2_binary-2.9.11-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8c55b385daa2f92cb64b12ec4536c66954ac53654c7f15a203578da4e78105c0", size = 4411242, upload-time = "2025-10-10T11:12:42.388Z" }, - { url = 
"https://files.pythonhosted.org/packages/10/04/6ca7477e6160ae258dc96f67c371157776564679aefd247b66f4661501a2/psycopg2_binary-2.9.11-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c0377174bf1dd416993d16edc15357f6eb17ac998244cca19bc67cdc0e2e5766", size = 4468258, upload-time = "2025-10-10T11:12:48.654Z" }, - { url = "https://files.pythonhosted.org/packages/3c/7e/6a1a38f86412df101435809f225d57c1a021307dd0689f7a5e7fe83588b1/psycopg2_binary-2.9.11-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5c6ff3335ce08c75afaed19e08699e8aacf95d4a260b495a4a8545244fe2ceb3", size = 4166295, upload-time = "2025-10-10T11:12:52.525Z" }, - { url = "https://files.pythonhosted.org/packages/f2/7d/c07374c501b45f3579a9eb761cbf2604ddef3d96ad48679112c2c5aa9c25/psycopg2_binary-2.9.11-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:84011ba3109e06ac412f95399b704d3d6950e386b7994475b231cf61eec2fc1f", size = 3983133, upload-time = "2025-10-30T02:55:24.329Z" }, - { url = "https://files.pythonhosted.org/packages/82/56/993b7104cb8345ad7d4516538ccf8f0d0ac640b1ebd8c754a7b024e76878/psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ba34475ceb08cccbdd98f6b46916917ae6eeb92b5ae111df10b544c3a4621dc4", size = 3652383, upload-time = "2025-10-10T11:12:56.387Z" }, - { url = "https://files.pythonhosted.org/packages/2d/ac/eaeb6029362fd8d454a27374d84c6866c82c33bfc24587b4face5a8e43ef/psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:b31e90fdd0f968c2de3b26ab014314fe814225b6c324f770952f7d38abf17e3c", size = 3298168, upload-time = "2025-10-10T11:13:00.403Z" }, - { url = "https://files.pythonhosted.org/packages/2b/39/50c3facc66bded9ada5cbc0de867499a703dc6bca6be03070b4e3b65da6c/psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:d526864e0f67f74937a8fce859bd56c979f5e2ec57ca7c627f5f1071ef7fee60", size = 3044712, upload-time = "2025-10-30T02:55:27.975Z" }, - { 
url = "https://files.pythonhosted.org/packages/9c/8e/b7de019a1f562f72ada81081a12823d3c1590bedc48d7d2559410a2763fe/psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:04195548662fa544626c8ea0f06561eb6203f1984ba5b4562764fbeb4c3d14b1", size = 3347549, upload-time = "2025-10-10T11:13:03.971Z" }, - { url = "https://files.pythonhosted.org/packages/80/2d/1bb683f64737bbb1f86c82b7359db1eb2be4e2c0c13b947f80efefa7d3e5/psycopg2_binary-2.9.11-cp313-cp313-win_amd64.whl", hash = "sha256:efff12b432179443f54e230fdf60de1f6cc726b6c832db8701227d089310e8aa", size = 2714215, upload-time = "2025-10-10T11:13:07.14Z" }, - { url = "https://files.pythonhosted.org/packages/64/12/93ef0098590cf51d9732b4f139533732565704f45bdc1ffa741b7c95fb54/psycopg2_binary-2.9.11-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:92e3b669236327083a2e33ccfa0d320dd01b9803b3e14dd986a4fc54aa00f4e1", size = 3756567, upload-time = "2025-10-10T11:13:11.885Z" }, - { url = "https://files.pythonhosted.org/packages/7c/a9/9d55c614a891288f15ca4b5209b09f0f01e3124056924e17b81b9fa054cc/psycopg2_binary-2.9.11-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:e0deeb03da539fa3577fcb0b3f2554a97f7e5477c246098dbb18091a4a01c16f", size = 3864755, upload-time = "2025-10-10T11:13:17.727Z" }, - { url = "https://files.pythonhosted.org/packages/13/1e/98874ce72fd29cbde93209977b196a2edae03f8490d1bd8158e7f1daf3a0/psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b52a3f9bb540a3e4ec0f6ba6d31339727b2950c9772850d6545b7eae0b9d7c5", size = 4411646, upload-time = "2025-10-10T11:13:24.432Z" }, - { url = "https://files.pythonhosted.org/packages/5a/bd/a335ce6645334fb8d758cc358810defca14a1d19ffbc8a10bd38a2328565/psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:db4fd476874ccfdbb630a54426964959e58da4c61c9feba73e6094d51303d7d8", size = 4468701, upload-time = "2025-10-10T11:13:29.266Z" }, - { url = 
"https://files.pythonhosted.org/packages/44/d6/c8b4f53f34e295e45709b7568bf9b9407a612ea30387d35eb9fa84f269b4/psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:47f212c1d3be608a12937cc131bd85502954398aaa1320cb4c14421a0ffccf4c", size = 4166293, upload-time = "2025-10-10T11:13:33.336Z" }, - { url = "https://files.pythonhosted.org/packages/4b/e0/f8cc36eadd1b716ab36bb290618a3292e009867e5c97ce4aba908cb99644/psycopg2_binary-2.9.11-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e35b7abae2b0adab776add56111df1735ccc71406e56203515e228a8dc07089f", size = 3983184, upload-time = "2025-10-30T02:55:32.483Z" }, - { url = "https://files.pythonhosted.org/packages/53/3e/2a8fe18a4e61cfb3417da67b6318e12691772c0696d79434184a511906dc/psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:fcf21be3ce5f5659daefd2b3b3b6e4727b028221ddc94e6c1523425579664747", size = 3652650, upload-time = "2025-10-10T11:13:38.181Z" }, - { url = "https://files.pythonhosted.org/packages/76/36/03801461b31b29fe58d228c24388f999fe814dfc302856e0d17f97d7c54d/psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:9bd81e64e8de111237737b29d68039b9c813bdf520156af36d26819c9a979e5f", size = 3298663, upload-time = "2025-10-10T11:13:44.878Z" }, - { url = "https://files.pythonhosted.org/packages/97/77/21b0ea2e1a73aa5fa9222b2a6b8ba325c43c3a8d54272839c991f2345656/psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:32770a4d666fbdafab017086655bcddab791d7cb260a16679cc5a7338b64343b", size = 3044737, upload-time = "2025-10-30T02:55:35.69Z" }, - { url = "https://files.pythonhosted.org/packages/67/69/f36abe5f118c1dca6d3726ceae164b9356985805480731ac6712a63f24f0/psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:c3cb3a676873d7506825221045bd70e0427c905b9c8ee8d6acd70cfcbd6e576d", size = 3347643, upload-time = "2025-10-10T11:13:53.499Z" }, - { url = 
"https://files.pythonhosted.org/packages/e1/36/9c0c326fe3a4227953dfb29f5d0c8ae3b8eb8c1cd2967aa569f50cb3c61f/psycopg2_binary-2.9.11-cp314-cp314-win_amd64.whl", hash = "sha256:4012c9c954dfaccd28f94e84ab9f94e12df76b4afb22331b1f0d3154893a6316", size = 2803913, upload-time = "2025-10-10T11:13:57.058Z" }, -] - [[package]] name = "ptyprocess" version = "0.7.0"