From d528db3f927065db8539369e23b0b8d54d6637c8 Mon Sep 17 00:00:00 2001 From: "Jose P. Faria" Date: Mon, 5 May 2025 00:36:44 -0500 Subject: [PATCH 01/15] Fix Argo Gateway API connection, enhance evidence display, buckest mode and duo mode and update documentation --- README.md | 118 +++- concord/__init__.py | 74 ++- concord/cli.py | 269 ++++++--- concord/config.yaml | 2 +- concord/constants.py | 46 ++ concord/embedding.py | 259 +++++++++ concord/llm/argo_gateway.py | 214 ++++---- concord/llm/prompt_buckets.py | 109 ++++ concord/llm/prompts.py | 466 +++++++++++++--- .../llm/templates/template_v1.2-general.txt | 26 + concord/llm/templates/template_v1.3-test.txt | 17 + concord/local/embeddings.py | 14 - concord/metrics.py | 217 ++++++++ concord/pipeline.py | 514 ++++++++++++------ concord/utils.py | 156 ++++++ debug_llm_call.py | 65 +++ example_data/annotations_buckets_gpt4o_v4.csv | 25 + example_data/annotations_buckets_o3mini.csv | 9 + example_data/annotations_gpt4o.csv | 25 + example_data/annotations_gpt4o_v2.csv | 25 + example_data/annotations_gpt4o_v3.csv | 25 + example_data/annotations_simhint_o3mini.csv | 25 + example_data/annotations_simhint_v1.csv | 25 + .../annotations_test.simhint_gpt4o_v3.csv | 25 + .../annotations_test.simhint_o3 copy.csv | 25 + example_data/annotations_test.simhint_o3.csv | 3 + .../annotations_test_buckets_gpt4o.csv | 25 + .../annotations_test_buckets_gpt4o.csv--force | 25 + example_data/annotations_test_buckets_o3.csv | 3 + example_data/gpt4o_refactor_test.csv | 25 + example_data/my_results_duo_gpt4o.csv | 21 + example_data/my_results_gpt4o.csv | 21 + example_data/my_results_o3.csv | 24 + test_argo_gateway.py | 178 ++++++ test_verbose_response.py | 77 +++ tests/__init__.py | 6 + tests/test_embedding.py | 127 +++++ tests/test_pipeline.py | 211 +++++++ tests/test_prompts.py | 139 +++++ 39 files changed, 3169 insertions(+), 491 deletions(-) create mode 100644 concord/constants.py create mode 100644 concord/embedding.py create mode 100644 concord/llm/prompt_buckets.py create mode 100644 concord/llm/templates/template_v1.2-general.txt create mode 100644 concord/llm/templates/template_v1.3-test.txt delete mode 100644 concord/local/embeddings.py create mode 100644 concord/metrics.py create mode 100644 concord/utils.py create mode 100644 debug_llm_call.py create mode 100644 example_data/annotations_buckets_gpt4o_v4.csv create mode 100644 example_data/annotations_buckets_o3mini.csv create mode 100644 example_data/annotations_gpt4o.csv create mode 100644 example_data/annotations_gpt4o_v2.csv create mode 100644 example_data/annotations_gpt4o_v3.csv create mode 100644 example_data/annotations_simhint_o3mini.csv create mode 100644 example_data/annotations_simhint_v1.csv create mode 100644 example_data/annotations_test.simhint_gpt4o_v3.csv create mode 100644 example_data/annotations_test.simhint_o3 copy.csv create mode 100644 example_data/annotations_test.simhint_o3.csv create mode 100644 example_data/annotations_test_buckets_gpt4o.csv create mode 100644 example_data/annotations_test_buckets_gpt4o.csv--force create mode 100644 example_data/annotations_test_buckets_o3.csv create mode 100644 example_data/gpt4o_refactor_test.csv create mode 100644 example_data/my_results_duo_gpt4o.csv create mode 100644 example_data/my_results_gpt4o.csv create mode 100644 example_data/my_results_o3.csv create mode 100755 test_argo_gateway.py create mode 100644 test_verbose_response.py create mode 100644 tests/__init__.py create mode 100644 tests/test_embedding.py create mode 100644 tests/test_pipeline.py create mode 100644 tests/test_prompts.py diff --git a/README.md b/README.md index 0340842..670eab6 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,11 @@ Concordia compares two functional-annotation sources—old vs new, RAST vs UniProt, manual vs AI—and writes a tidy table with -* **`similarity_Pubmedbert`** – cosine similarity from the PubMedBERT sentence-embedding model -* **`label`** – a one-word judgement from an LLM (`o3-mini` by default) drawn from a 7-class ontology -* **`note`** – the LLM’s ultra-short reason (blank when the heuristic label is used) +* **`similarity_Pubmedbert`** – cosine similarity from the PubMedBERT sentence-embedding model +* **`label`** – a one-word judgement from an LLM (GPT-4o or gpto3mini) drawn from a 7-class ontology +* **`evidence`** – detailed reasoning from the LLM explaining the relationship classification -You can choose from **four processing modes**: +You can choose from **six processing modes**: | Mode | What happens | |------|--------------| @@ -15,6 +15,8 @@ You can choose from **four processing modes**: | **local** | PubMedBERT embeddings → cosine → heuristic label (no LLM) | | **dual** | Embeddings **and** LLM (baseline) | | **simhint** | Same as **dual** **plus** the cosine similarity is prefixed to the prompt as a weak prior | +| **bucket** | Choose template based on embedding similarity bucket | +| **duo** | Runs the LLM 3 times with different temperatures and takes a majority vote for more robust results | --- @@ -26,19 +28,31 @@ poetry install # installs deps + CLI poetry shell # activate the virtualenv ``` +### Environment Setup +Concordia requires setting the ARGO_USER environment variable for accessing the Argo Gateway API: + +```bash +export ARGO_USER= +``` + --- ## Quick-start recipes | Goal | Command | |------|---------| -| CSV with default LLM (o3-mini → dev) | `concord example_data/annotations_test.csv` | -| TSV with GPT-4o (prod) | `concord example_data/annotations_test.tsv --llm-model gpt4o` | -| Embed-only (no LLM) | `concord …csv --mode local` | -| Dual (baseline) | `concord …csv --mode dual` | -| Similarity-hint | `concord …csv --mode simhint` | -| Two ad-hoc strings | `concord --text-a "RecA" --text-b "DNA recombinase A"` | -| **Overwrite** existing output | add `--force` | +| CSV with default LLM (gpto3mini → dev) | `python -m concord.cli concord example_data/annotations_test.csv` | +| CSV with GPT-4o | `python -m concord.cli concord example_data/annotations_test.csv --llm-model gpt4o` | +| TSV with detailed evidence | `python -m concord.cli concord example_data/annotations_test.tsv --llm-model gpt4o --verbose` | +| Embed-only (no LLM) | `python -m concord.cli concord example_data/annotations_test.csv --mode local` | +| Dual (baseline) | `python -m concord.cli concord example_data/annotations_test.csv --mode dual` | +| Similarity-hint | `python -m concord.cli concord example_data/annotations_test.csv --mode simhint` | +| Template buckets | `python -m concord.cli concord example_data/annotations_test.csv --mode bucket` | +| Voting-based consensus | `python -m concord.cli concord example_data/annotations_test.csv --mode duo` | +| Two ad-hoc strings | `python -m concord.cli concord --text-a "RecA" --text-b "DNA recombinase A"` | +| List available templates | `python -m concord.cli concord --list-templates` | +| **Overwrite** existing output | `python -m concord.cli concord example_data/test.csv --output results.csv --overwrite` | +| Debug with verbose logging | `python -m concord.cli concord example_data/test.csv --log-level DEBUG` | --- @@ -50,7 +64,7 @@ poetry shell # activate the virtualenv | `.tsv` / `.tab` | `pandas.read_csv(sep='\t')`| or `--sep "\t"` | | `.json` | `pandas.read_json()` | list-of-objects **or** column-orient | -If you do **not** pass `--col-a / --col-b`, the **first two textual columns that don’t end with `id`** are taken. +If you do **not** pass `--col-a / --col-b`, the **first two textual columns that don't end with `id`** are taken. --- @@ -70,14 +84,21 @@ Any extra columns (e.g. `gene_id`) are preserved in the output. |------|-------------| | **`FILE`** | Input table (`.csv`, `.tsv`, `.json`) | | `--text-a / --text-b` | Compare two strings instead of a file | -| `--mode` | `llm` (default) | `local` | `dual` | `simhint` | -| `--llm-model` | Gateway LLM (`gpto3mini`, `gpt4o`, …) | +| `--mode` | `llm` (default) \| `local` \| `dual` \| `simhint` \| `bucket` \| `duo` | +| `--llm-model` | Gateway LLM (`gpto3mini`, `gpt4o`) - Note: use without hyphens | | `--retry` | Automatic blank-reply retries (default 5) | -| `--force` | Overwrite existing output instead of appending | +| `--overwrite` | Overwrite existing output instead of appending | | `--output` | Destination path (file-mode only) | | `--cfg` | Alternate YAML config | | `--col-a / --col-b` | Explicit annotation columns | | `--sep` | Custom delimiter for text files (e.g. `"\t"`) | +| `--verbose / -v` | Show detailed evidence and explanations | +| `--log-level` | Set logging level (DEBUG/INFO/WARNING/ERROR) | +| `--log-file` | Log to file in addition to console | +| `--list-templates` | List available prompt templates | +| `--prompt-ver` | Specify template version to use (e.g., "v1.3-test") | +| `--batch-size` | Batch size for processing (default: 32) | +| `--device` | Device for embedding model (cpu/cuda) | --- @@ -85,12 +106,14 @@ Any extra columns (e.g. `gene_id`) are preserved in the output. | Mode | Pipeline | Extra columns | |------|----------|---------------| -| **llm** | Skip embeddings; every pair → LLM | `label`, `note` | +| **llm** | Skip embeddings; every pair → LLM | `label`, `evidence` | | **local** | PubMedBERT embeddings → cosine → heuristic | `similarity_Pubmedbert`, `label` | -| **dual** | Embeddings **and** LLM for every row | `similarity_Pubmedbert`, `label`, `note` | -| **simhint**| Same as **dual**, but cosine similarity is sent to the LLM prompt | `similarity_Pubmedbert`, `label`, `note` | +| **dual** | Embeddings **and** LLM for every row | `similarity_Pubmedbert`, `label`, `evidence` | +| **simhint**| Same as **dual**, but cosine similarity is sent to the LLM prompt | `similarity_Pubmedbert`, `label`, `evidence` | +| **bucket** | Choose template based on embedding similarity bucket | `similarity_Pubmedbert`, `label`, `evidence` | +| **duo** | Runs the LLM 3 times with different temperatures and uses majority vote | `label`, `evidence`, `conflict` | -> **Embedding model** [`NeuML/pubmedbert-base-embeddings`](https://huggingface.co/NeuML/pubmedbert-base-embeddings) (Apache-2.0) +> **Embedding model** [`NeuML/pubmedbert-base-embeddings`](https://huggingface.co/NeuML/pubmedbert-base-embeddings) (Apache-2.0) --- @@ -113,11 +136,21 @@ Older checkpoints reply with tokens like *Identical* or *Partial*. --- -## Prompting — tweak in one place +## Prompting — Templates and Customization -All prompt text lives in **`concord/llm/prompts.py`**. +All prompt text lives in **`concord/llm/prompts.py`** and external template files in **`concord/llm/templates/`**. Edit the few-shot examples or definitions to experiment; no other code must change. +Available templates can be listed with: +```bash +python -m concord.cli concord --list-templates +``` + +You can specify a template version with: +```bash +python -m concord.cli concord example_data/test.csv --prompt-ver "v1.3-test" +``` + --- ## Config snapshot (`concord/config.yaml`) @@ -126,26 +159,52 @@ engine: mode: llm llm: - model: gpto3mini # auto-routes to apps-dev + model: gpt4o # Use without hyphens stream: false - user: ${ARGO_USER} + user: ${ARGO_USER} # Must set ARGO_USER environment variable local: model_id: NeuML/pubmedbert-base-embeddings ``` -*(Leave `env` unset — o-series → apps-dev, GPT-4* → apps-prod.)* + +--- + +## Viewing Detailed Evidence + +By default, CONCORDIA provides abbreviated evidence for concise output. To see the full biological context: + +```bash +python -m concord.cli concord example_data/test.csv --verbose +``` + +This shows detailed reasoning behind each classification, which is particularly valuable for complex biomedical entity relationships. + +--- + +## Debugging Argo Gateway API Connections + +For troubleshooting API connections: + +1. Ensure the ARGO_USER environment variable is set +2. Use the correct model name format (e.g., "gpt4o" without hyphen) +3. Run with DEBUG logging level: + ```bash + python -m concord.cli concord --text-a "Test" --text-b "Test entity" --log-level DEBUG + ``` + +The `debug_llm_call.py` utility script can also be used to test direct API connections. --- ## Progress, recovery & overwrite * Output is **appended row-by-row** – abort with Ctrl-C and rerun; finished pairs are skipped. - Add `--force` to **replace** an existing file instead. + Add `--overwrite` to **replace** an existing file instead. * Live progress bar example: ``` Processing 73%|██████████████▋ | 730/1000 [00:14<00:05, 49.2it/s] ``` - o3-mini ≈ 0.10 s per pair; embeddings ≈ 1 ms per string on Apple M-series. + gpto3mini ≈ 0.10 s per pair; GPT-4o ≈ 1.0 s per pair; embeddings ≈ 1 ms per string on Apple M-series. --- @@ -154,10 +213,13 @@ local: | Q | A | |---|---| | **Why keep similarity in LLM mode?** | Free sanity check (~ 2 ms) | -| **Still see “Unknown”?** | Reply didn’t start with any known token; tweak alias or prompt | +| **Still see "Unknown"?** | Reply didn't start with any known token; tweak alias or prompt | | **Where are model weights?** | Hugging Face cache (`~/.cache/huggingface`) | | **Crash recovery?** | Append-only output; rerun resumes automatically | -| **Overwrite output?** | Use `--force` to replace an existing file | +| **Overwrite output?** | Use `--overwrite` to replace an existing file | +| **How to get detailed explanations?** | Use `--verbose` flag | +| **API errors?** | Ensure ARGO_USER is set and model names don't have hyphens | +| **What model should I use?** | gpto3mini is faster, gpt4o provides better biological context | --- diff --git a/concord/__init__.py b/concord/__init__.py index c5f12dd..478838f 100644 --- a/concord/__init__.py +++ b/concord/__init__.py @@ -1,3 +1,75 @@ +""" +concord +======= +CONCORDIA annotation engine. + +v1.2 (2025-05-02) + • ADD centralized logging configuration + • ADD shared constants + • IMPROVE overall package structure +""" + +from __future__ import annotations +import logging +import logging.config +from typing import Dict, Any, Optional from importlib.metadata import version + +# Constants moved from pipeline.py +EVIDENCE_FIELD = "evidence" +SIM_FIELD = "similarity_Pubmedbert" +CONFLICT_FIELD = "duo_conflict" + +# Setup logging +def setup_logging(level: str = "INFO", log_file: Optional[str] = None) -> None: + """ + Configure logging for the entire package. + + Args: + level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) + log_file: Optional path to log file + """ + log_config = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "standard": { + "format": "%(asctime)s [%(levelname)s] %(name)s: %(message)s" + }, + }, + "handlers": { + "console": { + "class": "logging.StreamHandler", + "level": level, + "formatter": "standard", + "stream": "ext://sys.stdout", + }, + }, + "loggers": { + "concord": { + "level": level, + "handlers": ["console"], + "propagate": False + } + } + } + + # Add file handler if log_file is specified + if log_file: + log_config["handlers"]["file"] = { + "class": "logging.FileHandler", + "level": level, + "formatter": "standard", + "filename": log_file, + "mode": "a", + } + log_config["loggers"]["concord"]["handlers"].append("file") + + logging.config.dictConfig(log_config) + +# Setup default logging +setup_logging() + +# Version +__version__ = "1.2.0" __all__ = ["pipeline"] -__version__ = version("concordia") diff --git a/concord/cli.py b/concord/cli.py index bcad94c..1d32532 100644 --- a/concord/cli.py +++ b/concord/cli.py @@ -1,108 +1,201 @@ """ -concord.cli -=========== -Typer wrapper around the pipeline. +concord.cli ─ Typer wrapper +=========================== +Command-line interface for the CONCORDIA annotation engine. -!! No `str | None` or `Path | None` annotations !! +v1.2 (2025-05-02) + • ADD logging options and configuration + • ADD batch processing controls + • ADD device selection for embeddings + • ADD model preloading option + • IMPROVE error reporting and handling """ from __future__ import annotations - -import pathlib as P -import tempfile -import sys -import yaml -import typer +import pathlib as P, tempfile, sys, yaml, typer, logging from rich import print as echo +from rich.console import Console +from rich.progress import Progress + +from . import setup_logging +from .pipeline import run_file, run_pair +from .embedding import preload_model, clear_cache -from .pipeline import run_file # file workflow +# Create console for rich output +console = Console() -app = typer.Typer( - add_completion=False, - help="Concordia – annotation concordance engine", -) +app = typer.Typer(add_completion=False, + help="Concordia – annotation concordance engine") -@app.command(name="concord") -def main( # noqa: C901 (CLI only) - file: str = typer.Argument( - None, metavar="[FILE]", show_default=False, - help="Input table (.csv /.tsv /.json)"), - text_a: str = typer.Option( - None, help="Free-text annotation A"), - text_b: str = typer.Option( - None, help="Free-text annotation B"), - col_a: str = typer.Option( - None, help="Column name for annotation A"), - col_b: str = typer.Option( - None, help="Column name for annotation B"), - cfg: str = typer.Option( - "concord/config.yaml", help="YAML config"), - mode: str = typer.Option( - None, help="llm | local | simhint | dual (overrides config)"), - llm_model: str = typer.Option( - None, help="Override gateway model – e.g. gpt4o"), - output: str = typer.Option( - None, help="Destination CSV"), - sep: str = typer.Option( - None, help="Custom delimiter for text files (e.g. '\\t')"), - force: bool = typer.Option( - False, "--force", - help="Overwrite existing OUTPUT instead of resuming/appending"), +@app.command() +def concord( # noqa: C901 + file: str = typer.Argument(None, metavar="[FILE]", + help="Input table (.csv/.tsv/.json)"), + text_a: str = typer.Option(None, help="Free-text annotation A"), + text_b: str = typer.Option(None, help="Free-text annotation B"), + col_a: str = typer.Option(None, help="Column name for annotation A"), + col_b: str = typer.Option(None, help="Column name for annotation B"), + cfg: str = typer.Option("concord/config.yaml", help="YAML config"), + mode: str = typer.Option(None, help="llm | local | simhint | bucket | dual | duo"), + llm_model: str = typer.Option(None, help="Override gateway model"), + prompt_ver: str = typer.Option(None, help="Freeze a prompt version"), + output: str = typer.Option(None, help="Destination CSV"), + overwrite: bool = typer.Option(False, help="Overwrite existing output file"), + sep: str = typer.Option(None, help="Custom delimiter (e.g. '\\t')"), + batch_size: int = typer.Option(32, help="Batch size for processing"), + device: str = typer.Option("cpu", help="Device for embedding model (cpu/cuda)"), + preload: bool = typer.Option(False, help="Preload embedding model"), + log_level: str = typer.Option("INFO", help="Logging level (DEBUG/INFO/WARNING/ERROR)"), + log_file: str = typer.Option(None, help="Log to file in addition to console"), + list_templates: bool = typer.Option(False, help="List available prompt templates"), + verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed evidence and explanations"), ): - # ── sanity ─────────────────────────────────────────────── - if file and (text_a or text_b): - echo("[red]Give a FILE *or* two strings, not both.[/red]") - raise typer.Exit(1) - if not file and not (text_a and text_b): - echo("[red]Need FILE *or* --text-a + --text-b.[/red]") - raise typer.Exit(1) + """ + Run the CONCORDIA annotation engine on a file or a pair of texts. + + Examples: + concord data.csv --mode llm + concord --text-a "Protein A" --text-b "Protein B" --mode duo + """ + try: + # Setup logging based on CLI options + setup_logging(level=log_level.upper(), log_file=log_file) + logger = logging.getLogger("concord.cli") + + # List available templates if requested + if list_templates: + from .llm.prompts import list_available_templates + templates = list_available_templates() + console.print("[bold]Available templates:[/bold]") + for template in templates: + console.print(f" • {template}") + return + + # ── sanity ─────────────────────────────────────────────── + if file and (text_a or text_b): + echo("[red]Give a FILE *or* two strings – not both.[/red]") + raise typer.Exit(1) + if not file and not (text_a and text_b): + echo("[red]Need FILE *or* --text-a + --text-b.[/red]") + raise typer.Exit(1) - # ── patch config ───────────────────────────────────────── - cfg_dict = yaml.safe_load(open(cfg)) - if mode: - cfg_dict.setdefault("engine", {})["mode"] = mode - if llm_model: - cfg_dict.setdefault("llm", {})["model"] = llm_model + # ── patch config (temporary copy) ──────────────────────── + try: + with open(cfg) as f: + cfg_dict = yaml.safe_load(f) + except (IOError, yaml.YAMLError) as e: + echo(f"[red]Error loading config file: {e}[/red]") + raise typer.Exit(1) + + # Update config with CLI options + if mode: cfg_dict.setdefault("engine", {})["mode"] = mode + if llm_model: + # Normalize model name by removing hyphens to match Argo API requirements + if llm_model == "gpt-4o": + llm_model = "gpt4o" + cfg_dict.setdefault("llm", {})["model"] = llm_model + if prompt_ver: cfg_dict["prompt_ver"] = prompt_ver + + # Add embedding options + cfg_dict.setdefault("embedding", {})["device"] = device + cfg_dict["embedding"]["batch_size"] = batch_size - with tempfile.NamedTemporaryFile("w", delete=False, suffix=".yml") as tmp: - yaml.safe_dump(cfg_dict, tmp) - cfg_path = P.Path(tmp.name) + # Write updated config to temporary file + with tempfile.NamedTemporaryFile("w", delete=False, suffix=".yml") as tmp: + yaml.safe_dump(cfg_dict, tmp) + cfg_path = P.Path(tmp.name) + + # Preload embedding model if requested + if preload: + with console.status("[bold green]Preloading embedding model...[/bold green]"): + preload_model(cfg_dict) + console.print("[bold green]✓ Model preloaded[/bold green]") - # ── run ───────────────────────────────────────────────── - if file: - out_path = P.Path(output) if output else None - if out_path and out_path.exists(): - if force: - out_path.unlink() - echo(f"[red]✗ removed existing {out_path}[/red]") - else: - echo( - f"[yellow]⚠ Output {out_path} already exists — " - "rerun will *resume* and write only missing rows.\n" - "Use --force to overwrite.[/yellow]" + # ── run ───────────────────────────────────────────────── + if file: + # Run on file + try: + out = run_file( + P.Path(file), cfg_path, col_a, col_b, + out_path=P.Path(output) if output else None, + overwrite=overwrite, sep=sep, + batch_size=batch_size ) + echo(f"[green]✓ wrote {out}[/green]") + except Exception as e: + logger.error(f"Error processing file: {e}", exc_info=True) + echo(f"[red]Error processing file: {e}[/red]") + raise typer.Exit(1) + else: + # Run on a single pair + try: + with console.status("[bold green]Processing...[/bold green]"): + label, sim, note = run_pair(text_a, text_b, cfg_path) + + # Display results + console.print("[bold]Results:[/bold]") + console.print(f" • [bold]Label:[/bold] {label}") + if sim is not None: + console.print(f" • [bold]Similarity:[/bold] {sim:.3f}") + if note: + if verbose: + # Show the full evidence with proper formatting in verbose mode + console.print(f" • [bold]Evidence:[/bold]") + console.print(f" {note}") + else: + # In non-verbose mode, show a shortened version if it's too long + if len(note) > 50: + console.print(f" • [bold]Evidence:[/bold] {note[:50]}... (use --verbose for full details)") + else: + console.print(f" • [bold]Evidence:[/bold] {note}") + except Exception as e: + logger.error(f"Error processing text pair: {e}", exc_info=True) + echo(f"[red]Error processing text pair: {e}[/red]") + raise typer.Exit(1) + + # Clean up after run + clear_cache() # Free memory from embedding cache + + except Exception as e: + logger.error(f"Unhandled exception: {e}", exc_info=True) + echo(f"[red]Error: {e}[/red]") + raise typer.Exit(1) - out = run_file( - P.Path(file), - cfg_path, - col_a, col_b, - out_path=out_path, - sep=sep, - ) - echo(f"[green]✓ wrote {out}[/green]") - else: - # ad-hoc pair (import lazily to avoid union signature) - from .pipeline import run_pair - label, sim, note = run_pair(text_a, text_b, cfg_path) - msg = f"label={label}" - if sim is not None: - msg += f" similarity={sim:.3f}" - if note: - msg += f" note={note}" - echo(msg) +@app.command() +def validate( + cfg: str = typer.Option("concord/config.yaml", help="YAML config to validate"), +): + """Validate a configuration file.""" + try: + with open(cfg) as f: + cfg_dict = yaml.safe_load(f) + + # Check for required sections + errors = [] + + if "engine" not in cfg_dict: + errors.append("Missing 'engine' section") + elif "mode" not in cfg_dict["engine"]: + errors.append("Missing 'engine.mode' setting") + + if "llm" not in cfg_dict: + errors.append("Missing 'llm' section") + + # Report results + if errors: + echo("[red]Configuration validation failed:[/red]") + for error in errors: + echo(f" • [red]{error}[/red]") + raise typer.Exit(1) + else: + echo("[green]✓ Configuration is valid[/green]") + + except (IOError, yaml.YAMLError) as e: + echo(f"[red]Error loading config file: {e}[/red]") + raise typer.Exit(1) -if __name__ == "__main__": +if __name__ == "__main__": # python -m concord.cli sys.exit(app()) \ No newline at end of file diff --git a/concord/config.yaml b/concord/config.yaml index 2f89a0d..9a421eb 100644 --- a/concord/config.yaml +++ b/concord/config.yaml @@ -12,7 +12,7 @@ engine: # ────────────────────────────────────────────────────────────── llm: # Argo Gateway settings - model: gpto3mini # o-series ⇒ auto-env = apps-dev + model: gpt4o # o-series ⇒ auto-env = apps-dev (using correct name format without hyphen) stream: false # set true to use /streamchat/ endpoint user: ${ARGO_USER} # export ARGO_USER= diff --git a/concord/constants.py b/concord/constants.py new file mode 100644 index 0000000..7f4684f --- /dev/null +++ b/concord/constants.py @@ -0,0 +1,46 @@ +""" +concord.constants +================ +Centralized constants for the CONCORDIA engine. + +This module contains shared constants used throughout the codebase. +""" + +from __future__ import annotations +from typing import Set, Dict, Any, List, Final + +# Output field names +EVIDENCE_FIELD: Final[str] = "evidence" +SIM_FIELD: Final[str] = "similarity_Pubmedbert" +CONFLICT_FIELD: Final[str] = "duo_conflict" + +# Embedding model constants +EMBEDDING_MODEL_ID: Final[str] = "NeuML/pubmedbert-base-embeddings" +DEFAULT_DEVICE: Final[str] = "cpu" +DEFAULT_BATCH_SIZE: Final[int] = 32 +MAX_CACHE_SIZE: Final[int] = 10000 + +# Prompt constants +DEFAULT_PROMPT_VERSION: Final[str] = "v1.0" +EXAMPLES_PER_BUCKET: Final[int] = 3 + +# Similarity thresholds +EXACT_SIMILARITY_THRESHOLD: Final[float] = 0.98 + +# Label set +LABEL_SET: Final[Set[str]] = { + "Exact", "Synonym", "Broader", "Narrower", + "Related", "Uninformative", "Different", +} + +# Engine modes +VALID_ENGINE_MODES: Final[List[str]] = [ + "local", "llm", "dual", "simhint", "bucket", "duo" +] + +# Default configuration paths +DEFAULT_CONFIG_PATH: Final[str] = "concord/config.yaml" + +# API retries +MAX_RETRIES: Final[int] = 3 +RETRY_BACKOFF_FACTOR: Final[float] = 0.5 \ No newline at end of file diff --git a/concord/embedding.py b/concord/embedding.py new file mode 100644 index 0000000..48b6e2e --- /dev/null +++ b/concord/embedding.py @@ -0,0 +1,259 @@ +# concord/embedding.py +""" +Light wrapper around the PubMedBERT sentence-embedding model +(keeps old function names/arg-lists so pipeline keeps working). + +v1.1 (2025-05-02) + • ADD batch processing + • ADD error handling with retries + • ADD caching for better performance + • IMPROVE device handling from config +""" + +from __future__ import annotations +import logging +import time +import functools +import torch +from typing import List, Dict, Any, Optional, Union, Tuple, Callable +from sentence_transformers import SentenceTransformer, util + +# Configure logging +logger = logging.getLogger(__name__) + +_MODEL_ID = "NeuML/pubmedbert-base-embeddings" +_model: SentenceTransformer | None = None +_embedding_cache: Dict[str, torch.Tensor] = {} +_MAX_CACHE_SIZE = 10000 # Maximum number of cached embeddings + +# ── internal helpers ─────────────────────────────────────────────── +def _get_model(device: Optional[str] = None) -> SentenceTransformer: + """ + Get or initialize the embedding model. + + Args: + device: The device to load the model on ('cpu', 'cuda', etc.) + + Returns: + Initialized SentenceTransformer model + """ + global _model + if _model is None: + try: + start_time = time.time() + logger.info(f"Loading embedding model {_MODEL_ID}...") + + # Use provided device or default to CPU + device = device or "cpu" + + # Handle device availability + if device.startswith("cuda") and not torch.cuda.is_available(): + logger.warning("CUDA requested but not available, falling back to CPU") + device = "cpu" + + _model = SentenceTransformer(_MODEL_ID) + _model.to(device) + + elapsed = time.time() - start_time + logger.info(f"Model loaded in {elapsed:.2f}s on {device}") + except Exception as e: + logger.error(f"Failed to load embedding model: {e}") + raise RuntimeError(f"Failed to load embedding model: {e}") + + return _model + + +def _manage_cache() -> None: + """ + Manage the embedding cache size, removing least recently used items if needed. + """ + global _embedding_cache + if len(_embedding_cache) > _MAX_CACHE_SIZE: + # Remove 20% of the cache (the oldest entries) + remove_count = int(0.2 * len(_embedding_cache)) + keys_to_remove = list(_embedding_cache.keys())[:remove_count] + for key in keys_to_remove: + del _embedding_cache[key] + logger.debug(f"Cache cleaned: removed {remove_count} entries") + + +def with_retries(max_retries: int = 3, backoff_factor: float = 0.5) -> Callable: + """Decorator to retry functions on exception with exponential backoff.""" + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + for attempt in range(max_retries): + try: + return func(*args, **kwargs) + except Exception as e: + if attempt == max_retries - 1: + logger.error(f"Failed after {max_retries} attempts: {e}") + raise + wait_time = backoff_factor * (2 ** attempt) + logger.warning(f"Attempt {attempt+1} failed: {e}. Retrying in {wait_time:.2f}s") + time.sleep(wait_time) + return wrapper + return decorator + + +# ── public API (legacy + modern) ─────────────────────────────────── +@with_retries(max_retries=3) +def embed_sentence(text: str, cfg: Dict[str, Any] | None = None) -> torch.Tensor: + """ + Return a 768-d tensor for *text*. + + Uses caching to avoid recomputing embeddings for identical text. + + Args: + text: Text to embed + cfg: Configuration dictionary with optional embedding settings + + Returns: + Tensor embedding of the text + """ + # Check if embedding is already in cache + if text in _embedding_cache: + return _embedding_cache[text] + + # Extract device from config or use default + device = "cpu" + if cfg and "embedding" in cfg: + device = cfg.get("embedding", {}).get("device", "cpu") + + try: + model = _get_model(device) + embedding = model.encode(text, convert_to_tensor=True, device=device) + + # Cache the result + _embedding_cache[text] = embedding + _manage_cache() + + return embedding + except Exception as e: + logger.error(f"Error embedding text: {e}") + raise RuntimeError(f"Embedding failed: {e}") from e + + +@with_retries(max_retries=3) +def batch_embed(texts: List[str], cfg: Dict[str, Any] | None = None, + batch_size: int = 32) -> List[torch.Tensor]: + """ + Embed multiple texts efficiently in batches. + + Args: + texts: List of texts to embed + cfg: Configuration dictionary + batch_size: Size of batches for processing + + Returns: + List of tensor embeddings + """ + device = "cpu" + if cfg and "embedding" in cfg: + device = cfg.get("embedding", {}).get("device", "cpu") + + # Filter texts that are already cached + new_texts = [] + new_indices = [] + results = [None] * len(texts) + + for i, text in enumerate(texts): + if text in _embedding_cache: + results[i] = _embedding_cache[text] + else: + new_texts.append(text) + new_indices.append(i) + + # If all embeddings were cached, return early + if not new_texts: + return results + + try: + model = _get_model(device) + + # Process in batches + for i in range(0, len(new_texts), batch_size): + batch = new_texts[i:i+batch_size] + batch_indices = new_indices[i:i+batch_size] + + start_time = time.time() + embeddings = model.encode(batch, convert_to_tensor=True, device=device) + elapsed = time.time() - start_time + + logger.debug(f"Embedded batch of {len(batch)} texts in {elapsed:.2f}s") + + # Store results and update cache + for j, embedding in enumerate(embeddings): + idx = batch_indices[j] + text = new_texts[j-i] + results[idx] = embedding + _embedding_cache[text] = embedding + + _manage_cache() + return results + except Exception as e: + logger.error(f"Error batch embedding texts: {e}") + raise RuntimeError(f"Batch embedding failed: {e}") from e + + +def cosine_sim(vec1: torch.Tensor, vec2: torch.Tensor) -> float: + """ + Calculate cosine similarity between two tensors. + + Args: + vec1: First tensor + vec2: Second tensor + + Returns: + Cosine similarity as float + """ + try: + return float(util.cos_sim(vec1, vec2)) + except Exception as e: + logger.error(f"Error calculating cosine similarity: {e}") + raise RuntimeError(f"Similarity calculation failed: {e}") from e + + +def similarity(a: str, b: str, cfg: Dict[str, Any] | None = None) -> float: + """ + One-liner convenience: encode both strings and return the cosine. + Uses batch embedding for efficiency. + + Args: + a: First text + b: Second text + cfg: Optional configuration + + Returns: + Cosine similarity between embeddings + """ + try: + embs = batch_embed([a, b], cfg) + return float(util.cos_sim(embs[0], embs[1])) + except Exception as e: + logger.error(f"Error calculating text similarity: {e}") + raise RuntimeError(f"Text similarity calculation failed: {e}") from e + + +def preload_model(cfg: Dict[str, Any] | None = None) -> None: + """ + Preload the embedding model to avoid delays on first use. + + Args: + cfg: Configuration dictionary with optional embedding settings + """ + device = "cpu" + if cfg and "embedding" in cfg: + device = cfg.get("embedding", {}).get("device", "cpu") + + logger.info("Preloading embedding model...") + _get_model(device) + logger.info("Embedding model preloaded and ready") + + +def clear_cache() -> None: + """Clear the embedding cache to free memory.""" + global _embedding_cache + cache_size = len(_embedding_cache) + _embedding_cache = {} + logger.info(f"Embedding cache cleared ({cache_size} entries)") \ No newline at end of file diff --git a/concord/llm/argo_gateway.py b/concord/llm/argo_gateway.py index a1dff56..2fb3990 100644 --- a/concord/llm/argo_gateway.py +++ b/concord/llm/argo_gateway.py @@ -1,82 +1,67 @@ -# concord/llm/argo_gateway.py """ -Networking layer for Concordia → Argo Gateway. +Networking helper + LLM label parser. -• ArgoGatewayClient.chat() → one-shot prompt with heavy-duty retry -• llm_label() → convenience helper returning +* ArgoGatewayClient – simple JSON POST wrapper with retries +* llm_label – build prompt (template may be injected) & parse """ from __future__ import annotations - -import os -import random -import re -import time +import os, time, random, httpx, re, logging from typing import Optional, Tuple -import httpx - -from .prompts import build_annotation_prompt, LABEL_SET +from .prompts import ( + build_annotation_prompt, + LABEL_SET, + get_prompt_template, +) -# ────────────────────────────────────────── -_SYSTEM_MSG = ( - "You are a bioinformatics assistant. " - "Reply **only** with '