diff --git a/docs/usage/interface-guide.md b/docs/usage/interface-guide.md index d576577..fcf7af4 100644 --- a/docs/usage/interface-guide.md +++ b/docs/usage/interface-guide.md @@ -51,7 +51,7 @@ writer = EmbeddingWriterInterface( backend=backend, metric_type=MetricType.COSINE, embedding_client=embedding_client, - omop_cdm_engine=cdm_engine, # optional; required for embed_and_upsert_concepts + omop_cdm_engine=cdm_engine, # optional; used to enrich search results ) ``` @@ -71,13 +71,16 @@ is safe and returns the existing record. ### Generate and store embeddings ```python -# Generate embeddings from CDM concepts and upsert in one step. -# omop_cdm_engine is used to fetch domain_id, vocabulary_id, standard_concept, -# and invalid_reason from the CDM and store them as filter metadata. -writer.embed_and_upsert_concepts( +# Fetch candidate concepts from the CDM, then pass the returned rows back as +# concept_meta so filter columns can be stored alongside the embeddings. +missing = writer.get_concepts_without_embedding( omop_cdm_engine=cdm_engine, - concept_ids=(1, 2, 3), - concept_texts=("Hypertension", "Diabetes mellitus", "Aspirin"), +) + +writer.embed_and_upsert_concepts( + concept_ids=tuple(missing.keys()), + concept_texts=tuple(row.concept_name for row in missing.values()), + concept_meta=missing, ) ``` diff --git a/pyproject.toml b/pyproject.toml index f010ccc..ff912e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,8 +33,7 @@ classifiers = [ dependencies = [ "numpy>=1.26", - "omop-alchemy>=0.5.7", - "orm-loader>=0.3.15", + "omop-alchemy>=0.6.3", "sqlalchemy>=2.0.45", "typing-extensions>=4.15.0", "sqlite-vec>=0.1.9", diff --git a/src/omop_emb/cli/cli_embeddings.py b/src/omop_emb/cli/cli_embeddings.py index 25227af..45d0c1a 100644 --- a/src/omop_emb/cli/cli_embeddings.py +++ b/src/omop_emb/cli/cli_embeddings.py @@ -8,9 +8,8 @@ from dotenv import load_dotenv from tqdm import tqdm -from orm_loader.helpers import create_db - from .utils import configure_logging_level, resolve_omop_cdm_engine +from omop_emb.utils.cdm import check_concept_cdm from omop_emb.backends.index_config import index_config_from_index_type from omop_emb.backends import resolve_backend from omop_emb.config import IndexType, MetricType @@ -104,11 +103,6 @@ def add_embeddings( help="Limit the number of concepts to embed. Useful for testing.", rich_help_panel="Concept Filters", )] = None, - cdm_batch_size: Annotated[int, typer.Option( - "--cdm-batch-size", - help="Batch size for fetching concept metadata from the CDM during ingestion. Adjust if you encounter performance issues or database limits during ingestion.", - rich_help_panel="CDM Fetch Options", - )] = 50_000, verbosity: Annotated[int, typer.Option( "--verbose", "-v", count=True, help="Increase verbosity (up to two levels)", @@ -124,7 +118,6 @@ def add_embeddings( backend = resolve_backend() omop_cdm_engine = resolve_omop_cdm_engine() - create_db(omop_cdm_engine) embedding_client = EmbeddingClient( model=model, @@ -139,38 +132,49 @@ def add_embeddings( metric_type=MetricType.COSINE, embedding_client=embedding_client, ) - embedding_writer.register_model() - - concept_filter = EmbeddingConceptFilter( - require_standard=standard_only, - domains=tuple(domains) if domains else None, - vocabularies=tuple(vocabularies) if vocabularies else None, - ) + check_concept_cdm(omop_cdm_engine) - missing = embedding_writer.get_concepts_without_embedding( - omop_cdm_engine=omop_cdm_engine, - concept_filter=concept_filter, - ) - if num_embeddings is not None: - missing = dict(list(missing.items())[:num_embeddings]) + try: + embedding_writer.register_model() - total_concepts = len(missing) - typer.echo(f"Total concepts to process: {total_concepts:,}") + # Filter concepts + concept_filter = EmbeddingConceptFilter( + require_standard=standard_only, + domains=tuple(domains) if domains else None, + vocabularies=tuple(vocabularies) if vocabularies else None, + ) + n_missing = embedding_writer.count_concepts_without_embedding( + omop_cdm_engine=omop_cdm_engine, + concept_filter=concept_filter, + ) + n_total = min(n_missing, num_embeddings) if num_embeddings is not None else n_missing + typer.echo(f"Total concepts to process: {n_total:,}") - from itertools import batched as _batched - with tqdm(total=total_concepts, desc="Processing", unit="concept") as pbar: - for batch in _batched(missing.items(), batch_size): - batch_dict = dict(batch) - embedding_writer.embed_and_upsert_concepts( + n_processed = 0 + with tqdm(total=n_total, desc="Processing", unit="concept") as pbar: + for batch_dict in embedding_writer.get_concepts_without_embedding_batched( omop_cdm_engine=omop_cdm_engine, - concept_ids=tuple(batch_dict.keys()), - concept_texts=tuple(batch_dict.values()), + concept_filter=concept_filter, batch_size=batch_size, - cdm_batch_size=cdm_batch_size, - ) - pbar.update(len(batch_dict)) - - logger.info("Completed embedding generation and storage.") + limit=num_embeddings, + ): + embedding_writer.embed_and_upsert_concepts( + concept_ids=tuple(batch_dict.keys()), + concept_texts=tuple(row.concept_name for row in batch_dict.values()), + concept_meta=batch_dict, + batch_size=batch_size, + ) + n_processed += len(batch_dict) + pbar.update(len(batch_dict)) + + typer.echo(f"Processed {n_processed:,} concepts.") + logger.info("Completed embedding generation and storage.") + except Exception as e: + logger.exception(f"Error during embedding generation and storage.\n{e}") + if not embedding_writer.has_any_embeddings(): + logger.info("No embeddings were stored. Cleaning up model registration.") + embedding_writer.delete_model() + raise typer.Exit(code=1) @app.command() @@ -305,11 +309,6 @@ def add_embeddings_with_index( help="Limit the number of concepts to embed. Useful for testing.", rich_help_panel="Concept Filters", )] = None, - cdm_batch_size: Annotated[int, typer.Option( - "--cdm-batch-size", - help="Batch size for fetching concept metadata from the CDM during ingestion. Adjust if you encounter performance issues or database limits during ingestion.", - rich_help_panel="CDM Fetch Options", - )] = 50_000, index_hnsw_num_neighbors: Annotated[Optional[int], typer.Option( "--index-hnsw-num-neighbors", help="HNSW: number of neighbors per graph node.", @@ -344,7 +343,6 @@ def add_embeddings_with_index( vocabularies=vocabularies, domains=domains, num_embeddings=num_embeddings, - cdm_batch_size=cdm_batch_size, verbosity=verbosity, ) diff --git a/src/omop_emb/cli/cli_legacy.py b/src/omop_emb/cli/cli_legacy.py index 2ff9d8a..90b209c 100644 --- a/src/omop_emb/cli/cli_legacy.py +++ b/src/omop_emb/cli/cli_legacy.py @@ -9,13 +9,11 @@ from tqdm import tqdm import sqlalchemy as sa -from sqlalchemy.orm import sessionmaker - from .utils import configure_logging_level from omop_emb.backends import resolve_backend from omop_emb.backends.index_config import FlatIndexConfig from omop_emb.config import MetricType, ProviderType -from omop_emb.interface import _fetch_cdm_concepts_for_ingestion +from omop_emb.utils.cdm import fetch_cdm_concepts_for_ingestion from omop_emb.utils.embedding_utils import ConceptEmbeddingRecord logger = logging.getLogger(__name__) @@ -132,7 +130,6 @@ def add_embeddings_from_h5( typer.echo(f"Registered model '{model}' ({dimensions}d, metric={metric_type.value}).") cdm_engine = sa.create_engine(omop_cdm_db_url, future=True, echo=False) - cdm_factory = sessionmaker(cdm_engine) n_batches = (total + batch_size - 1) // batch_size typer.echo(f"Ingesting {total:,} embeddings in {n_batches} batch(es) of {batch_size:,}...") @@ -142,8 +139,9 @@ def add_embeddings_from_h5( end = min(start + batch_size, total) batch_cids: np.ndarray = np.asarray(cid_ds[start:end]) batch_emb = np.asarray(emb_ds[start:end], dtype=np.float32) - meta = _fetch_cdm_concepts_for_ingestion( - {int(cid) for cid in batch_cids}, cdm_factory, + meta = fetch_cdm_concepts_for_ingestion( + {int(cid) for cid in batch_cids}, + cdm_engine, batch_size=cdm_batch_size, ) records = [] diff --git a/src/omop_emb/interface.py b/src/omop_emb/interface.py index ee1bd06..da58795 100644 --- a/src/omop_emb/interface.py +++ b/src/omop_emb/interface.py @@ -18,16 +18,13 @@ import logging import os from dataclasses import replace as dc_replace -from itertools import batched from typing import TYPE_CHECKING, Iterable, List, Mapping, Optional, Sequence, Tuple, Union import numpy as np from numpy import ndarray -from sqlalchemy import Engine, select, Row -from sqlalchemy.orm import sessionmaker - -from omop_alchemy.cdm.model.vocabulary import Concept +from sqlalchemy import Engine, Row +from omop_emb.utils.cdm import count_missing_concepts, fetch_cdm_concepts_for_filter, iter_cdm_concepts_for_filter from omop_emb.embeddings import ( EmbeddingClient, EmbeddingRole, @@ -79,54 +76,6 @@ def list_registered_models( ) -# --------------------------------------------------------------------------- -# CDM fetch helpers (private) -# --------------------------------------------------------------------------- - -def _fetch_cdm_concepts_for_ingestion( - concept_ids: set[int], - cdm_session_factory: sessionmaker, - batch_size: int = 50_000, -) -> dict[int, Row]: - """Return CDM rows needed to build ``ConceptEmbeddingRecord`` filter columns. - - Fetches ``domain_id``, ``vocabulary_id``, and ``standard_concept`` - (the three columns written into the embedding table at upsert time.) - - Warnings - -------- - This method allows sub-batching to avoid bind-parameter limits when fetching large numbers of concepts from the CDM. This is not designed for high performance retrieval of concept metadata; it is intended for ingestion workflows where the number of concepts is large and the CDM database may have an unknown dialect. - """ - if not concept_ids: - return {} - id_list = list(concept_ids) - result: dict[int, Row] = {} - for start in range(0, len(id_list), batch_size): - chunk = id_list[start : start + batch_size] - query = select( - Concept.concept_id, - Concept.domain_id, - Concept.vocabulary_id, - Concept.standard_concept, - Concept.invalid_reason, - ).where(Concept.concept_id.in_(chunk)) - with cdm_session_factory() as session: - result.update({row.concept_id: row for row in session.execute(query)}) - return result - - -def _fetch_cdm_concepts_for_filter( - concept_filter: Optional[EmbeddingConceptFilter], - cdm_session_factory: sessionmaker, -) -> dict[int, str]: - """Return {concept_id: concept_name} from CDM matching the filter.""" - query = select(Concept.concept_id, Concept.concept_name) - if concept_filter is not None: - query = concept_filter.apply(query, Concept) - with cdm_session_factory() as session: - return {row.concept_id: row.concept_name for row in session.execute(query)} - - # --------------------------------------------------------------------------- # Reader interface # --------------------------------------------------------------------------- @@ -195,7 +144,6 @@ def __init__( self._canonical_model_name = canonical_model_name self._k = k self._cdm_engine = omop_cdm_engine - self._cdm_session_factory = sessionmaker(omop_cdm_engine) if omop_cdm_engine else None # FAISS fast path activated at construction, not mid-search _faiss_dir = faiss_cache_dir or os.getenv(ENV_OMOP_EMB_FAISS_CACHE_DIR) @@ -366,18 +314,35 @@ def get_concepts_without_embedding( omop_cdm_engine: Engine, *, concept_filter: Optional[EmbeddingConceptFilter] = None, - ) -> Mapping[int, str]: - """Return ``{concept_id: concept_name}`` for concepts lacking embeddings. + ) -> Mapping[int, Row]: + """Return CDM rows for concepts lacking embeddings, keyed by concept_id. - Requires *omop_cdm_engine* to query the CDM for candidate concepts. + Each row contains concept_name, domain_id, vocabulary_id, + standard_concept, and invalid_reason — all columns needed for both + text lookup and embedding-record metadata. """ - cdm_factory = sessionmaker(omop_cdm_engine) - all_concepts = _fetch_cdm_concepts_for_filter(concept_filter, cdm_factory) + all_concepts = fetch_cdm_concepts_for_filter( + concept_filter=concept_filter, + cdm_engine=omop_cdm_engine, + ) + embedded_ids = self._backend.get_all_stored_concept_ids( + model_name=self.canonical_model_name, + metric_type=self._metric_type, + ) + return {cid: row for cid, row in all_concepts.items() if cid not in embedded_ids} + + def count_concepts_without_embedding( + self, + omop_cdm_engine: Engine, + *, + concept_filter: Optional[EmbeddingConceptFilter] = None, + ) -> int: + """Return how many CDM concepts match *concept_filter* but lack an embedding.""" embedded_ids = self._backend.get_all_stored_concept_ids( model_name=self.canonical_model_name, metric_type=self._metric_type, ) - return {cid: name for cid, name in all_concepts.items() if cid not in embedded_ids} + return count_missing_concepts(concept_filter, omop_cdm_engine, embedded_ids) def get_concepts_without_embedding_batched( self, @@ -385,14 +350,36 @@ def get_concepts_without_embedding_batched( *, batch_size: int, concept_filter: Optional[EmbeddingConceptFilter] = None, - ) -> Iterable[Mapping[int, str]]: - missing = self.get_concepts_without_embedding( - omop_cdm_engine=omop_cdm_engine, - concept_filter=concept_filter, + limit: Optional[int] = None, + ) -> Iterable[Mapping[int, Row]]: + """Yield ``{concept_id: Row}`` batches for concepts lacking embeddings. + + Streams CDM rows and filters against already-embedded IDs on-the-fly, + so only one batch of CDM rows is in memory at a time. + """ + embedded_ids = self._backend.get_all_stored_concept_ids( + model_name=self.canonical_model_name, + metric_type=self._metric_type, ) - items = list(missing.items()) - for batch in batched(items, batch_size): - yield dict(batch) + batch: dict[int, Row] = {} + n_yielded = 0 + for row in iter_cdm_concepts_for_filter(concept_filter, omop_cdm_engine): + if row.concept_id in embedded_ids: + continue + batch[row.concept_id] = row + if len(batch) >= batch_size: + yield batch + n_yielded += len(batch) + batch = {} + if limit is not None and n_yielded >= limit: + return + if batch: + if limit is not None: + trimmed = dict(list(batch.items())[: limit - n_yielded]) + if trimmed: + yield trimmed + else: + yield batch # ------------------------------------------------------------------ # CDM enrichment (internal) @@ -407,16 +394,16 @@ def _enrich( Only ``concept_name`` is populated here. ``is_standard`` is already set by the backend from the embedding table filter columns. """ - if not self._cdm_session_factory: + if not self._cdm_engine: return raw unique_ids = {r.concept_id for results in raw for r in results} concept_filter = EmbeddingConceptFilter(concept_ids=tuple(unique_ids)) - names = _fetch_cdm_concepts_for_filter(concept_filter, self._cdm_session_factory) + rows = fetch_cdm_concepts_for_filter(concept_filter=concept_filter, cdm_engine=self._cdm_engine) return tuple( tuple( - dc_replace(r, concept_name=names.get(r.concept_id)) + dc_replace(r, concept_name=rows[r.concept_id].concept_name if r.concept_id in rows else None) for r in query_results ) for query_results in raw @@ -567,23 +554,25 @@ def bulk_upsert_concept_embeddings( def embed_and_upsert_concepts( self, *, - omop_cdm_engine: Engine, concept_ids: Sequence[int], concept_texts: Sequence[str], + concept_meta: Mapping[int, Row], batch_size: Optional[int] = None, - cdm_batch_size: int = 50_000, ) -> ndarray: """Generate embeddings from CDM concepts and upsert with filter metadata. + Concept_ids, concept_texts and concept_meta must be aligned (same length, same order). + `fetch_cdm_concepts_for_filter` can be used to get aligned concept_meta for a set of concept_ids. Parameters ---------- - omop_cdm_engine : Engine - CDM engine used to fetch concept filter attributes - (domain_id, vocabulary_id, standard_concept) for each concept_id. concept_ids : Sequence[int] OMOP concept IDs to embed. concept_texts : Sequence[str] Text strings to embed (aligned with *concept_ids*). + concept_meta : Mapping[int, Row] + CDM rows keyed by concept_id, as returned by + ``get_concepts_without_embedding``. Used to populate + domain_id, vocabulary_id, is_standard, and is_valid. """ if len(concept_ids) != len(concept_texts): raise ValueError( @@ -591,21 +580,13 @@ def embed_and_upsert_concepts( "must have the same length." ) - # Fetch concept metadata from CDM for filter columns - cdm_factory = sessionmaker(omop_cdm_engine) - meta = _fetch_cdm_concepts_for_ingestion( - set(concept_ids), - cdm_factory, - batch_size=cdm_batch_size, - ) - records = [ ConceptEmbeddingRecord( concept_id=cid, - domain_id=meta[cid].domain_id if cid in meta else "", - vocabulary_id=meta[cid].vocabulary_id if cid in meta else "", - is_standard=meta[cid].standard_concept in ("S", "C") if cid in meta else False, - is_valid=meta[cid].invalid_reason not in ("D", "U") if cid in meta else True, + domain_id=concept_meta[cid].domain_id if cid in concept_meta else "", + vocabulary_id=concept_meta[cid].vocabulary_id if cid in concept_meta else "", + is_standard=concept_meta[cid].standard_concept in ("S", "C") if cid in concept_meta else False, + is_valid=concept_meta[cid].invalid_reason not in ("D", "U") if cid in concept_meta else True, ) for cid in concept_ids ] @@ -629,15 +610,18 @@ def embed_and_upsert_concepts( def get_nearest_concepts_from_query_texts( self, query_texts: Union[str, Tuple[str, ...], List[str]], + embedding_client: Optional[EmbeddingClient] = None, *, concept_filter: Optional[EmbeddingConceptFilter] = None, batch_size: Optional[int] = None, k: Optional[int] = None, + faiss_index_config: Optional[IndexConfig] = None, ) -> Tuple[Tuple[NearestConceptMatch, ...], ...]: return super().get_nearest_concepts_from_query_texts( query_texts=query_texts, - embedding_client=self._embedding_client, + embedding_client=embedding_client or self._embedding_client, concept_filter=concept_filter, batch_size=batch_size, k=k, + faiss_index_config=faiss_index_config, ) diff --git a/src/omop_emb/utils/cdm.py b/src/omop_emb/utils/cdm.py new file mode 100644 index 0000000..3c5c532 --- /dev/null +++ b/src/omop_emb/utils/cdm.py @@ -0,0 +1,143 @@ +"""OMOP CDM utilities. Since embeddings and OMOP CDM are separate, we have this helper utility in case we need to query the OMOP CDM.""" +from __future__ import annotations + +import logging +from contextlib import contextmanager +from typing import Generator, Iterator, Optional + +from sqlalchemy import Engine, Row, select +from sqlalchemy.exc import DBAPIError +from sqlalchemy.orm import Session, sessionmaker + +from omop_alchemy.cdm.model.vocabulary import Concept +from omop_emb.utils.embedding_utils import EmbeddingConceptFilter + +logger = logging.getLogger(__name__) + +@contextmanager +def cdm_session(cdm_engine: Engine) -> Generator[Session, None, None]: + """Context manager yielding a single CDM session from *cdm_engine*.""" + with sessionmaker(cdm_engine)() as session: + yield session + + +def check_concept_cdm(cdm_engine: Engine) -> None: + """Verify the OMOP CDM Concept table is reachable. + + Raises RuntimeError with a human-friendly message when the schema is + missing, so callers can fail fast before expensive setup (e.g. model + registration). + """ + try: + with cdm_session(cdm_engine) as session: + session.execute(select(Concept.concept_id).limit(1)) + except DBAPIError as e: + error_msg = str(e).lower() + if "does not exist" in error_msg or "no such table" in error_msg: + logger.error("Database schema is missing! Did you forget to run the ingestion CLI?") + raise RuntimeError("Database not initialized.") from e + raise + + +def fetch_cdm_concepts_for_filter( + concept_filter: Optional[EmbeddingConceptFilter], + cdm_engine: Engine, +) -> dict[int, Row]: + """Return CDM rows matching *concept_filter*, keyed by concept_id. + + Selects all columns needed for both concept name lookup and embedding + metadata (domain_id, vocabulary_id, standard_concept, invalid_reason), + so callers do not need a second CDM round-trip. + """ + query = select( + Concept.concept_id, + Concept.concept_name, + Concept.domain_id, + Concept.vocabulary_id, + Concept.standard_concept, + Concept.invalid_reason, + ) + if concept_filter is not None: + query = concept_filter.apply(query, Concept) + with cdm_session(cdm_engine) as session: + return {row.concept_id: row for row in session.execute(query)} + + +def iter_cdm_concepts_for_filter( + concept_filter: Optional[EmbeddingConceptFilter], + cdm_engine: Engine, + chunk_size: int = 5_000, +) -> Iterator[Row]: + """Stream CDM concept rows matching *concept_filter*, server-side chunked. + + Uses ``yield_per`` so the database driver fetches *chunk_size* rows at a + time instead of buffering the full result set. The session is held open + for the lifetime of the generator. + """ + query = select( + Concept.concept_id, + Concept.concept_name, + Concept.domain_id, + Concept.vocabulary_id, + Concept.standard_concept, + Concept.invalid_reason, + ) + if concept_filter is not None: + query = concept_filter.apply(query, Concept) + with cdm_session(cdm_engine) as session: + yield from session.execute( + query.execution_options(stream_results=True, yield_per=chunk_size) + ) + + +def count_missing_concepts( + concept_filter: Optional[EmbeddingConceptFilter], + cdm_engine: Engine, + embedded_ids: set[int], + chunk_size: int = 10_000, +) -> int: + """Return how many CDM concepts match *concept_filter* but lack an embedding. + + Streams only ``concept_id`` (one integer column) and checks each against + *embedded_ids* via O(1) set lookup — far cheaper than fetching full rows. + """ + query = select(Concept.concept_id) + if concept_filter is not None: + query = concept_filter.apply(query, Concept) + count = 0 + with cdm_session(cdm_engine) as session: + for row in session.execute( + query.execution_options(stream_results=True, yield_per=chunk_size) + ): + if row.concept_id not in embedded_ids: + count += 1 + return count + + +def fetch_cdm_concepts_for_ingestion( + concept_ids: set[int], + cdm_engine: Engine, + batch_size: int = 50_000, +) -> dict[int, Row]: + """Return CDM rows needed to build ``ConceptEmbeddingRecord`` filter columns. + + Sub-batches to avoid bind-parameter limits on large concept sets. + Fetches ``domain_id``, ``vocabulary_id``, ``standard_concept``, and + ``invalid_reason`` for each concept_id. + """ + if not concept_ids: + return {} + id_list = list(concept_ids) + result: dict[int, Row] = {} + for start in range(0, len(id_list), batch_size): + chunk = id_list[start : start + batch_size] + query = select( + Concept.concept_id, + Concept.domain_id, + Concept.vocabulary_id, + Concept.standard_concept, + Concept.invalid_reason, + ).where(Concept.concept_id.in_(chunk)) + with cdm_session(cdm_engine) as session: + result.update({row.concept_id: row for row in session.execute(query)}) + return result diff --git a/tests/conftest.py b/tests/conftest.py index e709e2c..2e6e800 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,16 +4,20 @@ import os import time +from pathlib import Path from typing import Generator +from dotenv import load_dotenv + import numpy as np import pytest import sqlalchemy as sa from omop_emb.backends.base_backend import ConceptEmbeddingRecord from omop_emb.backends.sqlitevec import SQLiteVecEmbeddingBackend, create_sqlitevec_engine -from omop_emb.config import MetricType, ProviderType +from omop_emb.config import ProviderType +load_dotenv(Path(__file__).resolve().parents[1] / ".env", override=False) # --------------------------------------------------------------------------- # Test data constants diff --git a/uv.lock b/uv.lock index 2e7f414..0320cdf 100644 --- a/uv.lock +++ b/uv.lock @@ -348,6 +348,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ea/ab/1608e5a7578e62113506740b88066bf09888322a311cff602105e619bd87/greenlet-3.3.2-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:ac8d61d4343b799d1e526db579833d72f23759c71e07181c2d2944e429eb09cd", size = 280358, upload-time = "2026-02-20T20:17:43.971Z" }, { url = "https://files.pythonhosted.org/packages/a5/23/0eae412a4ade4e6623ff7626e38998cb9b11e9ff1ebacaa021e4e108ec15/greenlet-3.3.2-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3ceec72030dae6ac0c8ed7591b96b70410a8be370b6a477b1dbc072856ad02bd", size = 601217, upload-time = "2026-02-20T20:47:31.462Z" }, { url = "https://files.pythonhosted.org/packages/f8/16/5b1678a9c07098ecb9ab2dd159fafaf12e963293e61ee8d10ecb55273e5e/greenlet-3.3.2-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a2a5be83a45ce6188c045bcc44b0ee037d6a518978de9a5d97438548b953a1ac", size = 611792, upload-time = "2026-02-20T20:55:58.423Z" }, + { url = "https://files.pythonhosted.org/packages/5c/c5/cc09412a29e43406eba18d61c70baa936e299bc27e074e2be3806ed29098/greenlet-3.3.2-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ae9e21c84035c490506c17002f5c8ab25f980205c3e61ddb3a2a2a2e6c411fcb", size = 626250, upload-time = "2026-02-20T21:02:46.596Z" }, { url = "https://files.pythonhosted.org/packages/50/1f/5155f55bd71cabd03765a4aac9ac446be129895271f73872c36ebd4b04b6/greenlet-3.3.2-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:43e99d1749147ac21dde49b99c9abffcbc1e2d55c67501465ef0930d6e78e070", size = 613875, upload-time = "2026-02-20T20:21:01.102Z" }, { url = "https://files.pythonhosted.org/packages/fc/dd/845f249c3fcd69e32df80cdab059b4be8b766ef5830a3d0aa9d6cad55beb/greenlet-3.3.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4c956a19350e2c37f2c48b336a3afb4bff120b36076d9d7fb68cb44e05d95b79", size = 1571467, upload-time = "2026-02-20T20:49:33.495Z" }, { url = "https://files.pythonhosted.org/packages/2a/50/2649fe21fcc2b56659a452868e695634722a6655ba245d9f77f5656010bf/greenlet-3.3.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6c6f8ba97d17a1e7d664151284cb3315fc5f8353e75221ed4324f84eb162b395", size = 1640001, upload-time = "2026-02-20T20:21:09.154Z" }, @@ -356,6 +357,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ac/48/f8b875fa7dea7dd9b33245e37f065af59df6a25af2f9561efa8d822fde51/greenlet-3.3.2-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:aa6ac98bdfd716a749b84d4034486863fd81c3abde9aa3cf8eff9127981a4ae4", size = 279120, upload-time = "2026-02-20T20:19:01.9Z" }, { url = "https://files.pythonhosted.org/packages/49/8d/9771d03e7a8b1ee456511961e1b97a6d77ae1dea4a34a5b98eee706689d3/greenlet-3.3.2-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ab0c7e7901a00bc0a7284907273dc165b32e0d109a6713babd04471327ff7986", size = 603238, upload-time = "2026-02-20T20:47:32.873Z" }, { url = "https://files.pythonhosted.org/packages/59/0e/4223c2bbb63cd5c97f28ffb2a8aee71bdfb30b323c35d409450f51b91e3e/greenlet-3.3.2-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d248d8c23c67d2291ffd47af766e2a3aa9fa1c6703155c099feb11f526c63a92", size = 614219, upload-time = "2026-02-20T20:55:59.817Z" }, + { url = "https://files.pythonhosted.org/packages/94/2b/4d012a69759ac9d77210b8bfb128bc621125f5b20fc398bce3940d036b1c/greenlet-3.3.2-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ccd21bb86944ca9be6d967cf7691e658e43417782bce90b5d2faeda0ff78a7dd", size = 628268, upload-time = "2026-02-20T21:02:48.024Z" }, { url = "https://files.pythonhosted.org/packages/7a/34/259b28ea7a2a0c904b11cd36c79b8cef8019b26ee5dbe24e73b469dea347/greenlet-3.3.2-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b6997d360a4e6a4e936c0f9625b1c20416b8a0ea18a8e19cabbefc712e7397ab", size = 616774, upload-time = "2026-02-20T20:21:02.454Z" }, { url = "https://files.pythonhosted.org/packages/0a/03/996c2d1689d486a6e199cb0f1cf9e4aa940c500e01bdf201299d7d61fa69/greenlet-3.3.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:64970c33a50551c7c50491671265d8954046cb6e8e2999aacdd60e439b70418a", size = 1571277, upload-time = "2026-02-20T20:49:34.795Z" }, { url = "https://files.pythonhosted.org/packages/d9/c4/2570fc07f34a39f2caf0bf9f24b0a1a0a47bc2e8e465b2c2424821389dfc/greenlet-3.3.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1a9172f5bf6bd88e6ba5a84e0a68afeac9dc7b6b412b245dd64f52d83c81e55b", size = 1640455, upload-time = "2026-02-20T20:21:10.261Z" }, @@ -364,6 +366,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3f/ae/8bffcbd373b57a5992cd077cbe8858fff39110480a9d50697091faea6f39/greenlet-3.3.2-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:8d1658d7291f9859beed69a776c10822a0a799bc4bfe1bd4272bb60e62507dab", size = 279650, upload-time = "2026-02-20T20:18:00.783Z" }, { url = "https://files.pythonhosted.org/packages/d1/c0/45f93f348fa49abf32ac8439938726c480bd96b2a3c6f4d949ec0124b69f/greenlet-3.3.2-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:18cb1b7337bca281915b3c5d5ae19f4e76d35e1df80f4ad3c1a7be91fadf1082", size = 650295, upload-time = "2026-02-20T20:47:34.036Z" }, { url = "https://files.pythonhosted.org/packages/b3/de/dd7589b3f2b8372069ab3e4763ea5329940fc7ad9dcd3e272a37516d7c9b/greenlet-3.3.2-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c2e47408e8ce1c6f1ceea0dffcdf6ebb85cc09e55c7af407c99f1112016e45e9", size = 662163, upload-time = "2026-02-20T20:56:01.295Z" }, + { url = "https://files.pythonhosted.org/packages/cd/ac/85804f74f1ccea31ba518dcc8ee6f14c79f73fe36fa1beba38930806df09/greenlet-3.3.2-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:e3cb43ce200f59483eb82949bf1835a99cf43d7571e900d7c8d5c62cdf25d2f9", size = 675371, upload-time = "2026-02-20T21:02:49.664Z" }, { url = "https://files.pythonhosted.org/packages/d2/d8/09bfa816572a4d83bccd6750df1926f79158b1c36c5f73786e26dbe4ee38/greenlet-3.3.2-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:63d10328839d1973e5ba35e98cccbca71b232b14051fd957b6f8b6e8e80d0506", size = 664160, upload-time = "2026-02-20T20:21:04.015Z" }, { url = "https://files.pythonhosted.org/packages/48/cf/56832f0c8255d27f6c35d41b5ec91168d74ec721d85f01a12131eec6b93c/greenlet-3.3.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8e4ab3cfb02993c8cc248ea73d7dae6cec0253e9afa311c9b37e603ca9fad2ce", size = 1619181, upload-time = "2026-02-20T20:49:36.052Z" }, { url = "https://files.pythonhosted.org/packages/0a/23/b90b60a4aabb4cec0796e55f25ffbfb579a907c3898cd2905c8918acaa16/greenlet-3.3.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:94ad81f0fd3c0c0681a018a976e5c2bd2ca2d9d94895f23e7bb1af4e8af4e2d5", size = 1687713, upload-time = "2026-02-20T20:21:11.684Z" }, @@ -372,6 +375,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/98/6d/8f2ef704e614bcf58ed43cfb8d87afa1c285e98194ab2cfad351bf04f81e/greenlet-3.3.2-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:e26e72bec7ab387ac80caa7496e0f908ff954f31065b0ffc1f8ecb1338b11b54", size = 286617, upload-time = "2026-02-20T20:19:29.856Z" }, { url = "https://files.pythonhosted.org/packages/5e/0d/93894161d307c6ea237a43988f27eba0947b360b99ac5239ad3fe09f0b47/greenlet-3.3.2-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b466dff7a4ffda6ca975979bab80bdadde979e29fc947ac3be4451428d8b0e4", size = 655189, upload-time = "2026-02-20T20:47:35.742Z" }, { url = "https://files.pythonhosted.org/packages/f5/2c/d2d506ebd8abcb57386ec4f7ba20f4030cbe56eae541bc6fd6ef399c0b41/greenlet-3.3.2-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b8bddc5b73c9720bea487b3bffdb1840fe4e3656fba3bd40aa1489e9f37877ff", size = 658225, upload-time = "2026-02-20T20:56:02.527Z" }, + { url = "https://files.pythonhosted.org/packages/d1/67/8197b7e7e602150938049d8e7f30de1660cfb87e4c8ee349b42b67bdb2e1/greenlet-3.3.2-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:59b3e2c40f6706b05a9cd299c836c6aa2378cabe25d021acd80f13abf81181cf", size = 666581, upload-time = "2026-02-20T21:02:51.526Z" }, { url = "https://files.pythonhosted.org/packages/8e/30/3a09155fbf728673a1dea713572d2d31159f824a37c22da82127056c44e4/greenlet-3.3.2-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b26b0f4428b871a751968285a1ac9648944cea09807177ac639b030bddebcea4", size = 657907, upload-time = "2026-02-20T20:21:05.259Z" }, { url = "https://files.pythonhosted.org/packages/f3/fd/d05a4b7acd0154ed758797f0a43b4c0962a843bedfe980115e842c5b2d08/greenlet-3.3.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:1fb39a11ee2e4d94be9a76671482be9398560955c9e568550de0224e41104727", size = 1618857, upload-time = "2026-02-20T20:49:37.309Z" }, { url = "https://files.pythonhosted.org/packages/6f/e1/50ee92a5db521de8f35075b5eff060dd43d39ebd46c2181a2042f7070385/greenlet-3.3.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:20154044d9085151bc309e7689d6f7ba10027f8f5a8c0676ad398b951913d89e", size = 1680010, upload-time = "2026-02-20T20:21:13.427Z" }, @@ -1026,29 +1030,30 @@ wheels = [ [[package]] name = "omop-alchemy" -version = "0.5.12" +version = "0.6.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "orm-loader" }, { name = "pandas" }, { name = "python-dotenv" }, { name = "pyyaml" }, + { name = "rich" }, { name = "sqlalchemy" }, + { name = "typer" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b4/8f/96074945b47bf60550496df43e60c3f378ee1e57d431756cabd00ecd7774/omop_alchemy-0.5.12.tar.gz", hash = "sha256:4ff57dbba7830a3ddc6888995d4e647a4923cb44fbe77f5b59c87a7bd01dda71", size = 42704, upload-time = "2026-02-23T04:40:32.635Z" } +sdist = { url = "https://files.pythonhosted.org/packages/e2/46/9adc53d5348d1b80b3115d5a94f56209abf04ff33fd5c9aeebf88443f7f2/omop_alchemy-0.6.3.tar.gz", hash = "sha256:6db5548fe36d36c036a07e3ff1d93ad6d69402075b576d8331befb5962f82d21", size = 101101, upload-time = "2026-05-25T04:36:59.836Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/1d/84/8d99fc5ff78d1013e1d2829a80f903a1a738f1bbada32b1ef8ef63c17962/omop_alchemy-0.5.12-py3-none-any.whl", hash = "sha256:d0e4c45f731a192406e76d9d018a0f354188a46479653e06ecf40b8767a894ec", size = 67122, upload-time = "2026-02-23T04:40:31.099Z" }, + { url = "https://files.pythonhosted.org/packages/79/f9/5a4b5ef436f284f630890f528406a20dcc6b5983d2d19d1ceee1f09d3f2d/omop_alchemy-0.6.3-py3-none-any.whl", hash = "sha256:0ff9e68d986bff26a0ebf22f20edb80a8fe502f52591aec56676b252b1948145", size = 123933, upload-time = "2026-05-25T04:36:58.192Z" }, ] [[package]] name = "omop-emb" -version = "0.4.1" +version = "1.0.0" source = { editable = "." } dependencies = [ { name = "numpy" }, { name = "omop-alchemy" }, { name = "openai" }, - { name = "orm-loader" }, { name = "requests" }, { name = "sqlalchemy" }, { name = "sqlite-vec" }, @@ -1089,9 +1094,8 @@ dev = [ requires-dist = [ { name = "faiss-cpu", marker = "extra == 'faiss-cpu'", specifier = ">=1.8.0" }, { name = "numpy", specifier = ">=1.26" }, - { name = "omop-alchemy", specifier = ">=0.5.7" }, + { name = "omop-alchemy", specifier = ">=0.6.3" }, { name = "openai" }, - { name = "orm-loader", specifier = ">=0.3.15" }, { name = "pgvector", marker = "extra == 'pgvector'", specifier = ">=0.3.0" }, { name = "psycopg", extras = ["binary"], marker = "extra == 'pgvector'", specifier = ">=3.1.0" }, { name = "requests" }, @@ -1143,18 +1147,17 @@ wheels = [ [[package]] name = "orm-loader" -version = "0.3.25" +version = "0.4.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "chardet" }, { name = "pandas" }, { name = "pyarrow" }, { name = "sqlalchemy" }, - { name = "sqlalchemy-utils" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b1/c6/e9f43390d2a752370d0977a782a35293130c1619f031902fc5eb2dbed3de/orm_loader-0.3.25.tar.gz", hash = "sha256:b0293c373269949f70ffdc672c0719cecbac82d8805855d816ffe2a1dd751f1d", size = 32990, upload-time = "2026-03-23T12:30:33.611Z" } +sdist = { url = "https://files.pythonhosted.org/packages/2a/6a/007e6eef497753702d5a53444842ee6cc38bcbf7c5c422857c0671bfc727/orm_loader-0.4.1.tar.gz", hash = "sha256:434b6c3436c05bf3ad43774b46476e7f324db05a18bf34ad9f9692e4f02bcb7e", size = 39449, upload-time = "2026-05-19T12:56:29.572Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/df/25/6a7bb87820a59f89c112ec214d77c2c4d03d1f16d0d5076e2a0c60d27be8/orm_loader-0.3.25-py3-none-any.whl", hash = "sha256:b5b2ac5b41edce81e534edb5cc954b0a3c44dcb341a8d2f22b6fc70e62ff9696", size = 44805, upload-time = "2026-03-23T12:30:32.047Z" }, + { url = "https://files.pythonhosted.org/packages/98/d7/37f82f8748a91fdb14d41f314ddc829806f596dec409196c037e59d3a5a7/orm_loader-0.4.1-py3-none-any.whl", hash = "sha256:03131b5d4b7b787ea446e110684b7256b5690313503626939b83984953174825", size = 54472, upload-time = "2026-05-19T12:56:27.959Z" }, ] [[package]] @@ -1728,18 +1731,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/46/2c/9664130905f03db57961b8980b05cab624afd114bf2be2576628a9f22da4/sqlalchemy-2.0.48-py3-none-any.whl", hash = "sha256:a66fe406437dd65cacd96a72689a3aaaecaebbcd62d81c5ac1c0fdbeac835096", size = 1940202, upload-time = "2026-03-02T15:52:43.285Z" }, ] -[[package]] -name = "sqlalchemy-utils" -version = "0.42.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "sqlalchemy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/0f/7d/eb9565b6a49426552a5bf5c57e7c239c506dc0e4e5315aec6d1e8241dc7c/sqlalchemy_utils-0.42.1.tar.gz", hash = "sha256:881f9cd9e5044dc8f827bccb0425ce2e55490ce44fc0bb848c55cc8ee44cc02e", size = 130789, upload-time = "2025-12-13T03:14:13.591Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7c/25/7400c18c3ee97914cc99c90007795c00a4ec5b60c853b49db7ba24d11179/sqlalchemy_utils-0.42.1-py3-none-any.whl", hash = "sha256:243cfe1b3a1dae3c74118ae633f1d1e0ed8c787387bc33e556e37c990594ac80", size = 91761, upload-time = "2025-12-13T03:14:15.014Z" }, -] - [[package]] name = "sqlite-vec" version = "0.1.9"