diff --git a/.env b/.env new file mode 100644 index 0000000..7c34513 --- /dev/null +++ b/.env @@ -0,0 +1 @@ +ARGO_USER=jplfaria diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..90b1e75 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,36 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + lint: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11"] + steps: + - uses: actions/checkout@v3 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + pip install --upgrade pip + pip install black isort flake8 mypy pytest mkdocs mkdocs-material + - name: Black check + run: black --check . + - name: isort check + run: isort --check-only . + - name: flake8 lint + run: flake8 . + - name: mypy type check + run: mypy . + - name: pytest + run: pytest -q + - name: Build docs + run: mkdocs build --strict diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml new file mode 100644 index 0000000..f3ee552 --- /dev/null +++ b/.github/workflows/deploy-docs.yml @@ -0,0 +1,26 @@ +name: Deploy Docs + +on: + push: + branches: [main] + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.x' + - name: Install MKDocs + run: | + pip install --upgrade pip + pip install mkdocs mkdocs-material + - name: Build documentation + run: mkdocs build --strict + - name: Deploy to GitHub Pages + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./site diff --git a/.gitignore b/.gitignore index a5214b7..1a293f0 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,11 @@ __pycache__/ # ------------------------------------------------------------------ *.log *.concordia.csv # result files regenerated each run +example_data/test_results/ # benchmark test results directory +eval/results/ # ignore new benchmark results directory +eval/benchmark_results/ # ignore benchmark suite output +eval/scripts/eval/results/ # ignore nested eval script results +*.png # ignore plot files # Hugging Face cache (weights are huge; re-downloaded on demand) .cache/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..9b2cd73 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,19 @@ +repos: + - repo: https://github.com/psf/black + rev: 25.1.0 + hooks: + - id: black + language_version: python3 + - repo: https://github.com/pycqa/isort + rev: 6.0.1 + hooks: + - id: isort + - repo: https://github.com/myint/autoflake + rev: v1.4 + hooks: + - id: autoflake + args: [--remove-all-unused-imports, --remove-unused-variables, --in-place, --recursive] + - repo: https://github.com/charliermarsh/ruff-pre-commit + rev: v0.11.8 + hooks: + - id: ruff diff --git a/README.md b/README.md index 0340842..96a4bde 100644 --- a/README.md +++ b/README.md @@ -1,164 +1,185 @@ -# CONCORDIA +# CONCORDIA *CONcordance of Curated & Original Raw Descriptions In Annotations* -Concordia compares two functional-annotation sources—old vs new, RAST vs UniProt, manual vs AI—and writes a tidy table with - -* **`similarity_Pubmedbert`** – cosine similarity from the PubMedBERT sentence-embedding model -* **`label`** – a one-word judgement from an LLM (`o3-mini` by default) drawn from a 7-class ontology -* **`note`** – the LLM’s ultra-short reason (blank when the heuristic label is used) - -You can choose from **four processing modes**: - -| Mode | What happens | -|------|--------------| -| **llm** | LLM only — cheapest if you already trust the model | -| **local** | PubMedBERT embeddings → cosine → heuristic label (no LLM) | -| **dual** | Embeddings **and** LLM (baseline) | -| **simhint** | Same as **dual** **plus** the cosine similarity is prefixed to the prompt as a weak prior | - ---- +A toolkit for annotation concordance and entity relationship classification using embeddings and LLMs. + +## Features +- **gateway-check**: Argo Gateway API connectivity check on startup with prod/dev endpoint fallback +- **local**: PubMedBERT embeddings → cosine similarity → heuristic labels +- **zero-shot**: Single LLM call with optional similarity hints +- **vote**: Multiple LLM calls with majority vote (with vote tracking) +- **rac** (Beta): Retrieval-Augmented Classification with example memory +- **fallback**: Safe local fallback on errors +- Template-driven prompt management with versioned external templates (v1.x, v2, v2.1, v3.0, v3.1) +- Ad-hoc mode for quick two-sentence comparisons (without requiring a CSV file) +- **list-templates**: List available prompt templates +- **batch processing**: Control both file chunking and LLM batch sizes +- **verbose**: Show detailed evidence and explanations ## Installation + +### Using Poetry (recommended) ```bash git clone https://github.com/you/concordia.git cd concordia -poetry install # installs deps + CLI -poetry shell # activate the virtualenv +poetry install # install dependencies & CLI entry-point +poetry shell # activate the virtual environment ``` ---- - -## Quick-start recipes +### Alternative via pip +```bash +pip install concordia +``` -| Goal | Command | -|------|---------| -| CSV with default LLM (o3-mini → dev) | `concord example_data/annotations_test.csv` | -| TSV with GPT-4o (prod) | `concord example_data/annotations_test.tsv --llm-model gpt4o` | -| Embed-only (no LLM) | `concord …csv --mode local` | -| Dual (baseline) | `concord …csv --mode dual` | -| Similarity-hint | `concord …csv --mode simhint` | -| Two ad-hoc strings | `concord --text-a "RecA" --text-b "DNA recombinase A"` | -| **Overwrite** existing output | add `--force` | +### Syncing Local Dependencies +If you've installed additional Python packages in your environment, you can compare them with Poetry-managed dependencies: +```bash +# export current environment packages +pip freeze > env-requirements.txt ---- +# export Poetry-managed requirements +poetry export -f requirements.txt --without-hashes > poetry-requirements.txt -## Accepted input formats +# view differences +diff env-requirements.txt poetry-requirements.txt +``` +Manually add any missing packages to `pyproject.toml` under `[tool.poetry.dependencies]` and run `poetry update`. -| Extension | Loader | Note | -|-----------|--------|------| -| `.csv` | `pandas.read_csv(sep=',')` | default | -| `.tsv` / `.tab` | `pandas.read_csv(sep='\t')`| or `--sep "\t"` | -| `.json` | `pandas.read_json()` | list-of-objects **or** column-orient | +## Quickstart +**CLI** +```bash +# Simplified command structure (single invocation) +concord example_data/annotations_test.csv --mode zero-shot --llm-model gpt4o +concord example_data/annotations_test.csv --mode local --output local.csv +concord example_data/annotations_test.csv --mode vote --output results_vote.csv +concord example_data/annotations_test.csv --mode rac --output results_rac.csv -If you do **not** pass `--col-a / --col-b`, the **first two textual columns that don’t end with `id`** are taken. +# Direct text comparison (no CSV required) +concord --text-a "Entity A" --text-b "Entity B" --mode zero-shot ---- +# List available templates +concord --list-templates -## Minimal sample CSV -```csv -annotation_a,annotation_b -DNA repair protein RecA,Recombinase A -Hypothetical protein,Uncharacterized protein +# Control batch processing +concord example_data/annotations_test.csv --batch-size 32 --llm-batch-size 12 ``` -Any extra columns (e.g. `gene_id`) are preserved in the output. - ---- -## CLI options +**Python** +```python +from concord.pipeline import run_pair, run_file +label, sim, evidence = run_pair("Entity A", "Entity B", "config.yaml") +print(label, sim, evidence) +``` -| Flag | Description | -|------|-------------| -| **`FILE`** | Input table (`.csv`, `.tsv`, `.json`) | -| `--text-a / --text-b` | Compare two strings instead of a file | -| `--mode` | `llm` (default) | `local` | `dual` | `simhint` | -| `--llm-model` | Gateway LLM (`gpto3mini`, `gpt4o`, …) | -| `--retry` | Automatic blank-reply retries (default 5) | -| `--force` | Overwrite existing output instead of appending | -| `--output` | Destination path (file-mode only) | -| `--cfg` | Alternate YAML config | -| `--col-a / --col-b` | Explicit annotation columns | -| `--sep` | Custom delimiter for text files (e.g. `"\t"`) | +## Evaluation +After generating predictions (e.g., from a benchmark run), evaluate them against the gold standard using `eval/evaluate_suite.py`. +For detailed instructions on running benchmark suites and evaluation, see the [Benchmarking Workflow](docs/benchmarking.md). ---- +Example evaluation command: +```bash +python eval/evaluate_suite.py \ + --gold eval/datasets/Benchmark_subset__200_pairs_v1.csv \ + --pred-dir eval/results/your_benchmark_run_timestamp_dir \ + --pattern "**/*.csv" \ + --out eval/results/your_benchmark_run_timestamp_dir/evaluation_output \ + --plot +``` +Replace `your_benchmark_run_timestamp_dir` with the specific output directory of your benchmark run. -## Modes in detail +## Configuration (`config.yaml`) +```yaml +engine: + mode: zero-shot # local | zero-shot | vote | rac + sim_hint: false # Optional: prefix similarity hint to prompts -| Mode | Pipeline | Extra columns | -|------|----------|---------------| -| **llm** | Skip embeddings; every pair → LLM | `label`, `note` | -| **local** | PubMedBERT embeddings → cosine → heuristic | `similarity_Pubmedbert`, `label` | -| **dual** | Embeddings **and** LLM for every row | `similarity_Pubmedbert`, `label`, `note` | -| **simhint**| Same as **dual**, but cosine similarity is sent to the LLM prompt | `similarity_Pubmedbert`, `label`, `note` | +llm: + model: gpt4o # use without hyphens + stream: false + user: ${ARGO_USER} -> **Embedding model** [`NeuML/pubmedbert-base-embeddings`](https://huggingface.co/NeuML/pubmedbert-base-embeddings) (Apache-2.0) +local: + model_id: NeuML/pubmedbert-base-embeddings + device: cpu # cpu or cuda ---- +# RAC mode settings (Beta) +rac: + example_limit: 3 # Number of examples to include in prompts + similarity_threshold: 0.6 # Minimum similarity to include example + auto_store: true # Auto-save classifications to vector store -## 7-label ontology +data_dir: "./data" # Where to store the vector database +``` -| Label | Meaning | -|-------|---------| -| **Exact** | Same function; wording/punctuation only | -| **Synonym** | Semantically equivalent paraphrase | -| **Broader** | A ⊃ B (A more general) | -| **Narrower** | A ⊂ B (A more specific) | -| **Related** | Same pathway / complex / family but not parent–child | -| **Uninformative**| Placeholder or extremely generic | -| **Different** | No functional overlap | +### Configuration Fields +- `engine.mode`: select mode (`local`, `zero-shot`, `vote`, `rac`) +- `engine.sim_hint`: boolean flag to prefix cosine similarity hint to LLM prompts (default: false) +- `engine.sim_threshold`: similarity threshold for local mode (default: 0.98) +- `engine.vote_temps`: list of temperatures for vote mode LLM calls (default: `[0.8, 0.2, 0.0]`) +- `llm.model`: Gateway model name (e.g. `gpt4o`, `gpt35`, `gpto3mini`) +- `llm.stream`: `true` to use streaming `/streamchat/` endpoint +- `llm.user`: Argo Gateway username (via `ARGO_USER`) +- `llm.api_key`: Argo Gateway API key (via `ARGO_API_KEY`) +- `prompt_ver`: explicit prompt version to use (overrides config `prompt_ver` and bucket routing) +- `local.model_id`: embedding model ID (PubMedBERT or SPECTER2) +- `local.device`: device for embeddings (`cpu` or `cuda`) +- `local.batch_size`: batch size for file processing +- `rac.example_limit`: number of similar examples to retrieve (for RAC mode) +- `rac.similarity_threshold`: minimum similarity score for examples (0-1) +- `rac.auto_store`: whether to automatically store successful classifications +- `data_dir`: directory for storing vector database and other data -### Alias system — why you rarely see **Unknown** +## RAC Mode (Beta) -Older checkpoints reply with tokens like *Identical* or *Partial*. -`llm_label()` holds a tiny alias map so such answers are remapped automatically; genuine blanks are the only source of `Unknown`. +The Retrieval-Augmented Classification (RAC) mode is currently in beta development. This mode enhances classification by retrieving similar previously classified examples and including them in the prompt for context. ---- +### Current Limitations -## Prompting — tweak in one place +RAC mode currently has several limitations being actively worked on: -All prompt text lives in **`concord/llm/prompts.py`**. -Edit the few-shot examples or definitions to experiment; no other code must change. +1. **All Classifications Get Stored**: Currently, all successful LLM classifications are stored in the vector database if `auto_store` is enabled, regardless of quality or accuracy. ---- +2. **Planned Improvements**: + - Human validation before storing examples + - Confidence thresholds from the LLM responses + - Selective storage based on specific characteristics or patterns + - Improved embedding methods for better similarity matching -## Config snapshot (`concord/config.yaml`) -```yaml -engine: - mode: llm +### Using RAC Mode -llm: - model: gpto3mini # auto-routes to apps-dev - stream: false - user: ${ARGO_USER} +```bash +# First time setup - create data directory +mkdir -p data -local: - model_id: NeuML/pubmedbert-base-embeddings +# Run with RAC mode (will build up examples over time) +concord example_data/annotations_test.csv --mode rac --output results_rac.csv ``` -*(Leave `env` unset — o-series → apps-dev, GPT-4* → apps-prod.)* - ---- - -## Progress, recovery & overwrite -* Output is **appended row-by-row** – abort with Ctrl-C and rerun; finished pairs are skipped. - Add `--force` to **replace** an existing file instead. -* Live progress bar example: - ``` - Processing 73%|██████████████▋ | 730/1000 [00:14<00:05, 49.2it/s] - ``` - o3-mini ≈ 0.10 s per pair; embeddings ≈ 1 ms per string on Apple M-series. +## Documentation +```bash +mkdocs serve +``` +Published site: https://.github.io/concordia/ ---- +## Environment Variables +- `ARGO_USER`: ANL login for Argo Gateway (required) +- `ARGO_API_KEY`: API key for private Argo Gateway (optional) -## FAQ +## Contributing +See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines. -| Q | A | -|---|---| -| **Why keep similarity in LLM mode?** | Free sanity check (~ 2 ms) | -| **Still see “Unknown”?** | Reply didn’t start with any known token; tweak alias or prompt | -| **Where are model weights?** | Hugging Face cache (`~/.cache/huggingface`) | -| **Crash recovery?** | Append-only output; rerun resumes automatically | -| **Overwrite output?** | Use `--force` to replace an existing file | +## Testing +Run all tests via `pytest`: +```bash +pytest +``` ---- +## Development +We enforce formatting and linting with pre-commit hooks: +```bash +pip install pre-commit +pre-commit install +pre-commit run --all-files +``` -*Happy concording! – Stars ⭐, issues 🐞, and PRs 💡 welcome.* \ No newline at end of file +## License +Apache-2.0 \ No newline at end of file diff --git a/bin/eval.py b/bin/eval.py deleted file mode 100644 index d8196b1..0000000 --- a/bin/eval.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python3 -""" -bin/eval.py -=========== - -Simple scorer for Concordia outputs. - -Usage ------ - python bin/eval.py GOLD.csv RUN.csv [--col LABEL_COL] - -Assumptions ------------ -* GOLD.csv and RUN.csv share a key column (default: `gene_id`) **and** - a label column (default: `label`). -* All other columns are ignored. -""" - -from __future__ import annotations -import argparse, sys -import pandas as pd -from sklearn.metrics import ( - precision_recall_fscore_support as prfs, - accuracy_score, -) - -# ---------------------------------------------------------------------- -def load(path: str, key: str, label: str) -> pd.Series: - df = pd.read_csv(path, usecols=[key, label]) - if df[key].duplicated().any(): - dup = df[df[key].duplicated()][key].tolist() - sys.exit(f"[error] duplicate keys in {path}: {dup[:5]} …") - return df.set_index(key)[label] - - -def main(): - p = argparse.ArgumentParser() - p.add_argument("gold", help="gold-standard CSV") - p.add_argument("run", help="system output CSV") - p.add_argument("--key", "--id", default="gene_id", - help="column used to align rows (default: gene_id)") - p.add_argument("--col", "--label", default="label", - help="column holding the predicted label (default: label)") - args = p.parse_args() - - gold = load(args.gold, args.key, args.col) - run = load(args.run, args.key, args.col) - - # ----- align & sanity ------------------------------------------------ - common = gold.index.intersection(run.index) - if len(common) == 0: - sys.exit("[error] no matching keys between gold and run!]") - - y_true = gold.loc[common] - y_pred = run.loc[common].reindex(common) - - # ----- metrics ------------------------------------------------------- - labels = sorted(y_true.unique()) - P, R, F, _ = prfs(y_true, y_pred, labels=labels, - average=None, zero_division=0) - acc = accuracy_score(y_true, y_pred) - - print(f"Samples evaluated: {len(common)}") - print(f"Micro-accuracy : {acc:6.3f}") - - macroP, macroR, macroF, _ = prfs( - y_true, y_pred, labels=labels, - average="macro", zero_division=0) - print(f"Macro P/R/F : {macroP:6.3f} {macroR:6.3f} {macroF:6.3f}\n") - - print(" Per-class scores") - print(" label P R F n") - for lbl, p_, r_, f_ in zip(labels, P, R, F): - n = (y_true == lbl).sum() - print(f" {lbl:<10} {p_:6.3f} {r_:6.3f} {f_:6.3f} {n:5d}") - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/concord/__init__.py b/concord/__init__.py index c5f12dd..53565b0 100644 --- a/concord/__init__.py +++ b/concord/__init__.py @@ -1,3 +1,70 @@ -from importlib.metadata import version +""" +concord +======= +CONCORDIA annotation engine. + +v1.2 (2025-05-02) + • ADD centralized logging configuration + • ADD shared constants + • IMPROVE overall package structure +""" + +from __future__ import annotations + +import logging +import logging.config + +# Constants moved from pipeline.py +EVIDENCE_FIELD = "evidence" +SIM_FIELD = "similarity_Pubmedbert" +CONFLICT_FIELD = "duo_conflict" + + +# Setup logging +def setup_logging(level: str = "INFO", log_file: str = None) -> None: + """ + Configure logging for the entire package. + + Args: + level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) + log_file: Optional path to log file + """ + log_config = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "standard": {"format": "%(asctime)s [%(levelname)s] %(name)s: %(message)s"}, + }, + "handlers": { + "console": { + "class": "logging.StreamHandler", + "level": level, + "formatter": "standard", + "stream": "ext://sys.stdout", + }, + }, + "loggers": { + "concord": {"level": level, "handlers": ["console"], "propagate": False} + }, + } + + # Add file handler if log_file is specified + if log_file: + log_config["handlers"]["file"] = { + "class": "logging.FileHandler", + "level": level, + "formatter": "standard", + "filename": log_file, + "mode": "a", + } + log_config["loggers"]["concord"]["handlers"].append("file") + + logging.config.dictConfig(log_config) + + +# Setup default logging +setup_logging() + +# Version +__version__ = "1.2.0" __all__ = ["pipeline"] -__version__ = version("concordia") diff --git a/concord/cli.py b/concord/cli.py index bcad94c..b87df58 100644 --- a/concord/cli.py +++ b/concord/cli.py @@ -1,108 +1,266 @@ """ -concord.cli -=========== -Typer wrapper around the pipeline. +concord.cli ─ Typer wrapper +=========================== +Command-line interface for the CONCORDIA annotation engine. -!! No `str | None` or `Path | None` annotations !! +v1.2 (2025-05-02) + • ADD logging options and configuration + • ADD batch processing controls + • ADD device selection for embeddings + • ADD model preloading option + • IMPROVE error reporting and handling """ from __future__ import annotations +import logging +import os import pathlib as P -import tempfile import sys -import yaml +import tempfile +from typing import Optional + import typer +import yaml from rich import print as echo +from rich.console import Console -from .pipeline import run_file # file workflow +from . import setup_logging +from .embedding import clear_cache, preload_model +from .pipeline import run_file, run_pair + +# Create console for rich output +console = Console() app = typer.Typer( add_completion=False, help="Concordia – annotation concordance engine", + invoke_without_command=True, + context_settings={"allow_interspersed_args": True}, ) -@app.command(name="concord") -def main( # noqa: C901 (CLI only) +@app.callback(invoke_without_command=True) +def concord( # noqa: C901 file: str = typer.Argument( - None, metavar="[FILE]", show_default=False, - help="Input table (.csv /.tsv /.json)"), - text_a: str = typer.Option( - None, help="Free-text annotation A"), - text_b: str = typer.Option( - None, help="Free-text annotation B"), - col_a: str = typer.Option( - None, help="Column name for annotation A"), - col_b: str = typer.Option( - None, help="Column name for annotation B"), - cfg: str = typer.Option( - "concord/config.yaml", help="YAML config"), - mode: str = typer.Option( - None, help="llm | local | simhint | dual (overrides config)"), - llm_model: str = typer.Option( - None, help="Override gateway model – e.g. gpt4o"), - output: str = typer.Option( - None, help="Destination CSV"), - sep: str = typer.Option( - None, help="Custom delimiter for text files (e.g. '\\t')"), - force: bool = typer.Option( - False, "--force", - help="Overwrite existing OUTPUT instead of resuming/appending"), + None, metavar="[FILE]", help="Input table (.csv/.tsv/.json)" + ), + text_a: str = typer.Option(None, help="Free-text annotation A"), + text_b: str = typer.Option(None, help="Free-text annotation B"), + col_a: str = typer.Option(None, help="Column name for annotation A"), + col_b: str = typer.Option(None, help="Column name for annotation B"), + cfg: str = typer.Option("config.yaml", help="YAML config at repo root"), + mode: str = typer.Option(None, help="local | zero-shot | vote"), + llm_model: str = typer.Option(None, help="Override gateway model"), + prompt_ver: str = typer.Option(None, help="Freeze a prompt version"), + output: str = typer.Option(None, help="Destination CSV"), + overwrite: bool = typer.Option(False, help="Overwrite existing output file"), + sep: str = typer.Option(None, help="Custom delimiter (e.g. '\\t')"), + batch_size: int = typer.Option(32, help="Batch size for processing"), + llm_batch_size: int = typer.Option( + 1, "--llm-batch-size", help="Aggregate N pairs into a single LLM request" + ), + llm_stream: Optional[bool] = typer.Option( + None, + "--llm-stream/--no-llm-stream", + help="Force /streamchat (on) or /chat (off). Omit flag for auto mode.", + ), + llm_debug: bool = typer.Option( + False, "--llm-debug", help="Enable verbose Argo Gateway debugging" + ), + device: str = typer.Option("cpu", help="Device for embedding model (cpu/cuda)"), + preload: bool = typer.Option(False, help="Preload embedding model"), + log_level: str = typer.Option( + "INFO", help="Logging level (DEBUG/INFO/WARNING/ERROR)" + ), + log_file: str = typer.Option(None, help="Log to file in addition to console"), + list_templates: bool = typer.Option(False, help="List available prompt templates"), + verbose: bool = typer.Option( + False, "--verbose", "-v", help="Show detailed evidence and explanations" + ), + sim_hint: bool = typer.Option( + False, "--sim-hint", help="Prefix similarity hint to LLM prompt" + ), ): - # ── sanity ─────────────────────────────────────────────── - if file and (text_a or text_b): - echo("[red]Give a FILE *or* two strings, not both.[/red]") + """ + Run the CONCORDIA annotation engine on a file or a pair of texts. + + Examples: + concord data.csv --mode llm + concord --text-a "Protein A" --text-b "Protein B" --mode duo + """ + try: + # Setup logging based on CLI options + setup_logging(level=log_level.upper(), log_file=log_file) + logger = logging.getLogger("concord.cli") + + # List available templates if requested + if list_templates: + from .llm.prompts import list_available_templates + + templates = list_available_templates() + console.print("[bold]Available templates:[/bold]") + for template in templates: + console.print(f" • {template}") + return + + # ── sanity ─────────────────────────────────────────────── + if file and (text_a or text_b): + echo("[red]Give a FILE *or* two strings – not both.[/red]") + raise typer.Exit(1) + if not file and not (text_a and text_b): + echo("[red]Need FILE *or* --text-a + --text-b.[/red]") + raise typer.Exit(1) + + # ── patch config (temporary copy) ──────────────────────── + try: + with open(cfg) as f: + cfg_dict = yaml.safe_load(f) + except (IOError, yaml.YAMLError) as e: + echo(f"[red]Error loading config file: {e}[/red]") + raise typer.Exit(1) + + # Update config with CLI options + if mode: + cfg_dict.setdefault("engine", {})["mode"] = mode + if llm_model: + # Normalize model name by removing hyphens to match Argo API requirements + if llm_model == "gpt-4o": + llm_model = "gpt4o" + cfg_dict.setdefault("llm", {})["model"] = llm_model + if prompt_ver: + cfg_dict["prompt_ver"] = prompt_ver + + # Add sim_hint flag to engine config + cfg_dict.setdefault("engine", {})["sim_hint"] = sim_hint + + # Add embedding options + cfg_dict.setdefault("embedding", {})["device"] = device + cfg_dict["embedding"]["batch_size"] = batch_size + + # propagate llm_debug to config/environment + if llm_debug: + cfg_dict.setdefault("llm", {})["debug"] = True + # Set env var so ArgoGatewayClient picks it up even if constructed elsewhere + os.environ["ARGO_DEBUG"] = "1" + + # propagate llm_stream override if provided + if llm_stream is not None: + cfg_dict.setdefault("llm", {})["stream"] = llm_stream + + # Write updated config to temporary file + with tempfile.NamedTemporaryFile("w", delete=False, suffix=".yml") as tmp: + yaml.safe_dump(cfg_dict, tmp) + cfg_path = P.Path(tmp.name) + + # Check Argo Gateway connectivity + from .llm.argo_gateway import ArgoGatewayClient + + client = ArgoGatewayClient(**cfg_dict.get("llm", {})) + if client.ping(): + echo("[green]✓ Argo Gateway API reachable[/green]") + else: + echo("[yellow]⚠️ Argo Gateway API unreachable; LLM calls may fail[/yellow]") + + # Preload embedding model if requested + if preload: + with console.status( + "[bold green]Preloading embedding model...[/bold green]" + ): + preload_model(cfg_dict) + console.print("[bold green]✓ Model preloaded[/bold green]") + + # ── run ───────────────────────────────────────────────── + if file: + # Run on file + try: + out = run_file( + P.Path(file), + cfg_path, + col_a, + col_b, + out_path=P.Path(output) if output else None, + overwrite=overwrite, + sep=sep, + batch_size=batch_size, + llm_batch_size=llm_batch_size, + ) + echo(f"[green]✓ wrote {out}[/green]") + except Exception as e: + logger.error(f"Error processing file: {e}", exc_info=True) + echo(f"[red]Error processing file: {e}[/red]") + raise typer.Exit(1) + else: + # Run on a single pair + try: + with console.status("[bold green]Processing...[/bold green]"): + label, sim, note = run_pair(text_a, text_b, cfg_path) + + # Display results + console.print("[bold]Results:[/bold]") + console.print(f" • [bold]Label:[/bold] {label}") + if sim is not None: + console.print(f" • [bold]Similarity:[/bold] {sim:.3f}") + if note: + if verbose: + # Show the full evidence with proper formatting in verbose mode + console.print(" • [bold]Evidence:[/bold]") + console.print(f" {note}") + else: + # In non-verbose mode, show a shortened version if it's too long + if len(note) > 50: + console.print( + f" • [bold]Evidence:[/bold] {note[:50]}... (use --verbose for full details)" + ) + else: + console.print(f" • [bold]Evidence:[/bold] {note}") + except Exception as e: + logger.error(f"Error processing text pair: {e}", exc_info=True) + echo(f"[red]Error processing text pair: {e}[/red]") + raise typer.Exit(1) + + # Clean up after run + clear_cache() # Free memory from embedding cache + + except Exception as e: + logger.error(f"Unhandled exception: {e}", exc_info=True) + echo(f"[red]Error: {e}[/red]") raise typer.Exit(1) - if not file and not (text_a and text_b): - echo("[red]Need FILE *or* --text-a + --text-b.[/red]") + + +@app.command() +def validate( + cfg: str = typer.Option("config.yaml", help="YAML config to validate"), +): + """Validate a configuration file.""" + try: + with open(cfg) as f: + cfg_dict = yaml.safe_load(f) + + # Check for required sections + errors = [] + + if "engine" not in cfg_dict: + errors.append("Missing 'engine' section") + elif "mode" not in cfg_dict["engine"]: + errors.append("Missing 'engine.mode' setting") + + if "llm" not in cfg_dict: + errors.append("Missing 'llm' section") + + # Report results + if errors: + echo("[red]Configuration validation failed:[/red]") + for error in errors: + echo(f" • [red]{error}[/red]") + raise typer.Exit(1) + else: + echo("[green]✓ Configuration is valid[/green]") + + except (IOError, yaml.YAMLError) as e: + echo(f"[red]Error loading config file: {e}[/red]") raise typer.Exit(1) - # ── patch config ───────────────────────────────────────── - cfg_dict = yaml.safe_load(open(cfg)) - if mode: - cfg_dict.setdefault("engine", {})["mode"] = mode - if llm_model: - cfg_dict.setdefault("llm", {})["model"] = llm_model - - with tempfile.NamedTemporaryFile("w", delete=False, suffix=".yml") as tmp: - yaml.safe_dump(cfg_dict, tmp) - cfg_path = P.Path(tmp.name) - - # ── run ───────────────────────────────────────────────── - if file: - out_path = P.Path(output) if output else None - if out_path and out_path.exists(): - if force: - out_path.unlink() - echo(f"[red]✗ removed existing {out_path}[/red]") - else: - echo( - f"[yellow]⚠ Output {out_path} already exists — " - "rerun will *resume* and write only missing rows.\n" - "Use --force to overwrite.[/yellow]" - ) - out = run_file( - P.Path(file), - cfg_path, - col_a, col_b, - out_path=out_path, - sep=sep, - ) - echo(f"[green]✓ wrote {out}[/green]") - - else: - # ad-hoc pair (import lazily to avoid union signature) - from .pipeline import run_pair - label, sim, note = run_pair(text_a, text_b, cfg_path) - msg = f"label={label}" - if sim is not None: - msg += f" similarity={sim:.3f}" - if note: - msg += f" note={note}" - echo(msg) - - -if __name__ == "__main__": - sys.exit(app()) \ No newline at end of file +if __name__ == "__main__": # python -m concord.cli + sys.exit(app()) diff --git a/concord/constants.py b/concord/constants.py new file mode 100644 index 0000000..866def9 --- /dev/null +++ b/concord/constants.py @@ -0,0 +1,56 @@ +""" +concord.constants +================ +Centralized constants for the CONCORDIA engine. + +This module contains shared constants used throughout the codebase. +""" + +from __future__ import annotations + +from typing import Final, List, Set + +# Output field names +EVIDENCE_FIELD: Final[str] = "evidence" +SIM_FIELD: Final[str] = "similarity_Pubmedbert" +CONFLICT_FIELD: Final[str] = "duo_conflict" + +# Embedding model constants +EMBEDDING_MODEL_ID: Final[str] = "NeuML/pubmedbert-base-embeddings" +DEFAULT_DEVICE: Final[str] = "cpu" +DEFAULT_BATCH_SIZE: Final[int] = 32 +MAX_CACHE_SIZE: Final[int] = 10000 + +# Prompt constants +DEFAULT_PROMPT_VERSION: Final[str] = "v1.0" +EXAMPLES_PER_BUCKET: Final[int] = 3 + +# Similarity thresholds +EXACT_SIMILARITY_THRESHOLD: Final[float] = 0.98 + +# Label set +LABEL_SET: Final[Set[str]] = { + "Exact", + "Synonym", + "Broader", + "Narrower", + "Related", + "Uninformative", + "Different", +} + +# Engine modes +VALID_ENGINE_MODES: Final[List[str]] = [ + "local", + "llm", + "dual", + "bucket", + "duo", +] + +# Default configuration paths +DEFAULT_CONFIG_PATH: Final[str] = "concord/config.yaml" + +# API retries +MAX_RETRIES: Final[int] = 3 +RETRY_BACKOFF_FACTOR: Final[float] = 0.5 diff --git a/concord/embedding.py b/concord/embedding.py new file mode 100644 index 0000000..5a676a3 --- /dev/null +++ b/concord/embedding.py @@ -0,0 +1,292 @@ +# concord/embedding.py +""" +Light wrapper around the PubMedBERT sentence-embedding model +(keeps old function names/arg-lists so pipeline keeps working). + +v1.1 (2025-05-02) + • ADD batch processing + • ADD error handling with retries + • ADD caching for better performance + • IMPROVE device handling from config +""" + +from __future__ import annotations + +import functools +import logging +import threading +import time +from typing import Any, Callable, Dict, List, Optional + +import torch + +# Configure logging +logger = logging.getLogger(__name__) + +_MODEL_ID = "NeuML/pubmedbert-base-embeddings" +_model: SentenceTransformer | None = None +_embedding_cache: Dict[str, torch.Tensor] = {} +_MAX_CACHE_SIZE = 10000 # Maximum number of cached embeddings +_cache_lock = threading.RLock() # Lock for thread safety + +try: + from sentence_transformers import SentenceTransformer, util +except ImportError: + # Fallback for environments without sentence_transformers (e.g., tests) + logger = logging.getLogger(__name__) + logger.warning( + "sentence_transformers not installed; using dummy SentenceTransformer" + ) + + class SentenceTransformer: + def __init__(self, model_id): + pass + + def to(self, device): + pass + + def encode(self, texts, convert_to_tensor=True, device=None): + if isinstance(texts, (list, tuple)): + return [torch.zeros(768) for _ in texts] + return torch.zeros(768) + + util = None + + +# ── internal helpers ─────────────────────────────────────────────── +def _get_model(device: Optional[str] = None) -> SentenceTransformer: + """ + Get or initialize the embedding model. + + Args: + device: The device to load the model on ('cpu', 'cuda', etc.) + + Returns: + Initialized SentenceTransformer model + """ + global _model + if _model is None: + try: + start_time = time.time() + logger.info(f"Loading embedding model {_MODEL_ID}...") + + # Use provided device or default to CPU + device = device or "cpu" + + # Handle device availability + if device.startswith("cuda") and not torch.cuda.is_available(): + logger.warning("CUDA requested but not available, falling back to CPU") + device = "cpu" + + _model = SentenceTransformer(_MODEL_ID) + _model.to(device) + + elapsed = time.time() - start_time + logger.info(f"Model loaded in {elapsed:.2f}s on {device}") + except Exception as e: + logger.error(f"Failed to load embedding model: {e}") + raise RuntimeError(f"Failed to load embedding model: {e}") + + return _model + + +def _manage_cache() -> None: + """ + Manage the embedding cache size, removing least recently used items if needed. + Thread-safe with lock protection. + """ + global _embedding_cache + with _cache_lock: + if len(_embedding_cache) > _MAX_CACHE_SIZE: + # Remove 20% of the cache (the oldest entries) + remove_count = int(0.2 * len(_embedding_cache)) + keys_to_remove = list(_embedding_cache.keys())[:remove_count] + for key in keys_to_remove: + del _embedding_cache[key] + logger.debug(f"Cache cleaned: removed {remove_count} entries") + + +def with_retries(max_retries: int = 3, backoff_factor: float = 0.5) -> Callable: + """Decorator to retry functions on exception with exponential backoff.""" + + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + for attempt in range(max_retries): + try: + return func(*args, **kwargs) + except Exception as e: + if attempt == max_retries - 1: + logger.error(f"Failed after {max_retries} attempts: {e}") + raise + wait_time = backoff_factor * (2**attempt) + logger.warning( + f"Attempt {attempt+1} failed: {e}. Retrying in {wait_time:.2f}s" + ) + time.sleep(wait_time) + + return wrapper + + return decorator + + +# ── public API (legacy + modern) ─────────────────────────────────── +@with_retries(max_retries=3) +def embed_sentence(text: str, cfg: Dict[str, Any] | None = None) -> torch.Tensor: + """ + Return a 768-d tensor for *text*. + + Uses caching to avoid recomputing embeddings for identical text. + + Args: + text: Text to embed + cfg: Configuration dictionary with optional embedding settings + + Returns: + Tensor embedding of the text + """ + # Check if embedding is already in cache (thread-safe) + with _cache_lock: + if text in _embedding_cache: + return _embedding_cache[text] + + # Extract device from config or use default + device = "cpu" + if cfg and "embedding" in cfg: + device = cfg.get("embedding", {}).get("device", "cpu") + + try: + model = _get_model(device) + embedding = model.encode(text, convert_to_tensor=True, device=device) + + # Cache the result (thread-safe) + with _cache_lock: + _embedding_cache[text] = embedding + _manage_cache() + + return embedding + except Exception as e: + logger.error(f"Error embedding text: {e}") + raise RuntimeError(f"Embedding failed: {e}") from e + + +@with_retries(max_retries=3) +def batch_embed( + texts: List[str], cfg: Dict[str, Any] | None = None, batch_size: int = 32 +) -> List[torch.Tensor]: + """ + Embed multiple texts efficiently in batches. + + Args: + texts: List of texts to embed + cfg: Configuration dictionary + batch_size: Size of batches for processing + + Returns: + List of tensor embeddings + """ + device = "cpu" + if cfg and "embedding" in cfg: + device = cfg.get("embedding", {}).get("device", "cpu") + + # Filter texts that are already cached (thread-safe) + new_texts = [] + new_indices = [] + results = [None] * len(texts) + + with _cache_lock: + for i, text in enumerate(texts): + if text in _embedding_cache: + results[i] = _embedding_cache[text] + else: + new_texts.append(text) + new_indices.append(i) + + # If all embeddings were cached, return early + if not new_texts: + return results + + try: + model = _get_model(device) + + # Process in batches + for i in range(0, len(new_texts), batch_size): + batch = new_texts[i : i + batch_size] + batch_indices = new_indices[i : i + batch_size] + + start_time = time.time() + embeddings = model.encode(batch, convert_to_tensor=True, device=device) + elapsed = time.time() - start_time + + # Fix #5: Check for unexpected batch length from model + if len(embeddings) != len(batch): + logger.warning( + f"Model returned {len(embeddings)} embeddings for {len(batch)} texts" + ) + # Handle mismatch by padding or truncating as needed + if len(embeddings) < len(batch): + # Pad with zeros if we got fewer embeddings than expected + logger.warning("Padding missing embeddings with zeros") + zeros = torch.zeros_like( + embeddings[0] if len(embeddings) > 0 else torch.zeros(768) + ) + embeddings = list(embeddings) + [zeros] * ( + len(batch) - len(embeddings) + ) + else: + # Truncate if we got more embeddings than expected + logger.warning("Truncating extra embeddings") + embeddings = embeddings[: len(batch)] + + logger.debug(f"Embedded batch of {len(batch)} texts in {elapsed:.2f}s") + + # Store results and update cache (thread-safe) + with _cache_lock: + for j, embedding in enumerate(embeddings): + idx = batch_indices[j] + text = new_texts[j - i] + results[idx] = embedding + _embedding_cache[text] = embedding + + _manage_cache() + return results + except Exception as e: + logger.error(f"Error batch embedding texts: {e}") + raise RuntimeError(f"Batch embedding failed: {e}") from e + + +def cosine_sim(vec1: torch.Tensor, vec2: torch.Tensor) -> float: + """ + Calculate cosine similarity between two tensors. + """ + try: + sim = ( + util.pytorch_cos_sim(vec1, vec2).item() + if util + else torch.nn.functional.cosine_similarity( + vec1.unsqueeze(0), vec2.unsqueeze(0) + ).item() + ) + return sim + except Exception as e: + logger.error(f"Cosine similarity failed: {e}") + raise RuntimeError(f"Cosine similarity calculation failed: {e}") from e + + +def similarity(text1: str, text2: str, cfg: Dict[str, Any] | None = None) -> float: + """Compute cosine similarity between two texts.""" + return cosine_sim(embed_sentence(text1, cfg), embed_sentence(text2, cfg)) + + +def preload_model(cfg: Dict[str, Any] | None = None): + """Load embedding model into global cache.""" + device = None + if cfg and "embedding" in cfg: + device = cfg["embedding"].get("device") + return _get_model(device) + + +def clear_cache(): + """Clear the embedding cache.""" + global _embedding_cache + _embedding_cache.clear() diff --git a/concord/io/loader.py b/concord/io/loader.py index 5595675..815a221 100644 --- a/concord/io/loader.py +++ b/concord/io/loader.py @@ -5,9 +5,12 @@ """ from __future__ import annotations + import pathlib as P + import pandas as pd + def load_table(path: P.Path, *, sep: str | None = None) -> pd.DataFrame: """ Parameters @@ -28,6 +31,7 @@ def load_table(path: P.Path, *, sep: str | None = None) -> pd.DataFrame: if ext in {".tsv", ".tab"}: return pd.read_csv(path, sep=sep or "\t") if ext == ".json": - return pd.read_json(path) # list-of-objects *or* column-orient - raise ValueError(f"Unsupported file type '{ext}'. " - "Accepted: .csv .tsv .tab .json") + return pd.read_json(path) # list-of-objects *or* column-orient + raise ValueError( + f"Unsupported file type '{ext}'. " "Accepted: .csv .tsv .tab .json" + ) diff --git a/concord/llm/argo_gateway.py b/concord/llm/argo_gateway.py index a1dff56..74eba94 100644 --- a/concord/llm/argo_gateway.py +++ b/concord/llm/argo_gateway.py @@ -1,13 +1,13 @@ -# concord/llm/argo_gateway.py """ -Networking layer for Concordia → Argo Gateway. +Networking helper + LLM label parser. -• ArgoGatewayClient.chat() → one-shot prompt with heavy-duty retry -• llm_label() → convenience helper returning +* ArgoGatewayClient – simple JSON POST wrapper with retries +* llm_label – build prompt (template may be injected) & parse """ from __future__ import annotations +import logging import os import random import re @@ -15,172 +15,431 @@ from typing import Optional, Tuple import httpx +from dotenv import load_dotenv -from .prompts import build_annotation_prompt, LABEL_SET +from .prompts import LABEL_SET, build_annotation_prompt, get_prompt_template -# ────────────────────────────────────────── -_SYSTEM_MSG = ( - "You are a bioinformatics assistant. " - "Reply **only** with '