diff --git a/hindsight-api-slim/hindsight_api/config.py b/hindsight-api-slim/hindsight_api/config.py index d18781309..d1ccfbaaf 100644 --- a/hindsight-api-slim/hindsight_api/config.py +++ b/hindsight-api-slim/hindsight_api/config.py @@ -178,6 +178,14 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]: ENV_EMBEDDINGS_OPENAI_MODEL = "HINDSIGHT_API_EMBEDDINGS_OPENAI_MODEL" ENV_EMBEDDINGS_OPENAI_BASE_URL = "HINDSIGHT_API_EMBEDDINGS_OPENAI_BASE_URL" +# Gemini/Vertex AI embeddings configuration +ENV_EMBEDDINGS_GEMINI_API_KEY = "HINDSIGHT_API_EMBEDDINGS_GEMINI_API_KEY" +ENV_EMBEDDINGS_GEMINI_MODEL = "HINDSIGHT_API_EMBEDDINGS_GEMINI_MODEL" +ENV_EMBEDDINGS_GEMINI_OUTPUT_DIMENSIONALITY = "HINDSIGHT_API_EMBEDDINGS_GEMINI_OUTPUT_DIMENSIONALITY" +ENV_EMBEDDINGS_VERTEXAI_PROJECT_ID = "HINDSIGHT_API_EMBEDDINGS_VERTEXAI_PROJECT_ID" +ENV_EMBEDDINGS_VERTEXAI_REGION = "HINDSIGHT_API_EMBEDDINGS_VERTEXAI_REGION" +ENV_EMBEDDINGS_VERTEXAI_SERVICE_ACCOUNT_KEY = "HINDSIGHT_API_EMBEDDINGS_VERTEXAI_SERVICE_ACCOUNT_KEY" + # Cohere configuration (separate for embeddings and reranker) ENV_EMBEDDINGS_COHERE_API_KEY = "HINDSIGHT_API_EMBEDDINGS_COHERE_API_KEY" ENV_EMBEDDINGS_COHERE_MODEL = "HINDSIGHT_API_EMBEDDINGS_COHERE_MODEL" @@ -231,6 +239,11 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]: ENV_RERANKER_ZEROENTROPY_MODEL = "HINDSIGHT_API_RERANKER_ZEROENTROPY_MODEL" ENV_RERANKER_ZEROENTROPY_BASE_URL = "HINDSIGHT_API_RERANKER_ZEROENTROPY_BASE_URL" +# Google Discovery Engine reranker configuration +ENV_RERANKER_GOOGLE_MODEL = "HINDSIGHT_API_RERANKER_GOOGLE_MODEL" +ENV_RERANKER_GOOGLE_PROJECT_ID = "HINDSIGHT_API_RERANKER_GOOGLE_PROJECT_ID" +ENV_RERANKER_GOOGLE_SERVICE_ACCOUNT_KEY = "HINDSIGHT_API_RERANKER_GOOGLE_SERVICE_ACCOUNT_KEY" + ENV_VECTOR_EXTENSION = "HINDSIGHT_API_VECTOR_EXTENSION" ENV_TEXT_SEARCH_EXTENSION = "HINDSIGHT_API_TEXT_SEARCH_EXTENSION" @@ -403,6 +416,8 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]: 
DEFAULT_EMBEDDINGS_LOCAL_FORCE_CPU = False # Force CPU mode for local embeddings (avoids MPS/XPC issues on macOS) DEFAULT_EMBEDDINGS_LOCAL_TRUST_REMOTE_CODE = False # Security: disabled by default, required for some models DEFAULT_EMBEDDINGS_OPENAI_MODEL = "text-embedding-3-small" +DEFAULT_EMBEDDINGS_GEMINI_MODEL = "gemini-embedding-001" +DEFAULT_EMBEDDINGS_GEMINI_OUTPUT_DIMENSIONALITY = 768 DEFAULT_EMBEDDING_DIMENSION = 384 DEFAULT_RERANKER_PROVIDER = "local" @@ -426,6 +441,8 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]: DEFAULT_RERANKER_ZEROENTROPY_MODEL = "zerank-2" +DEFAULT_RERANKER_GOOGLE_MODEL = "semantic-ranker-default-004" + # Vector extension (pgvector, vchord, or pgvectorscale) DEFAULT_VECTOR_EXTENSION = "pgvector" # Options: "pgvector", "vchord", "pgvectorscale" @@ -706,6 +723,13 @@ class HindsightConfig: embeddings_litellm_sdk_model: str embeddings_litellm_sdk_api_base: str | None embeddings_litellm_sdk_output_dimensions: int | None + # Gemini/Vertex AI embeddings + embeddings_gemini_api_key: str | None + embeddings_gemini_model: str + embeddings_gemini_output_dimensionality: int | None + embeddings_vertexai_project_id: str | None + embeddings_vertexai_region: str | None + embeddings_vertexai_service_account_key: str | None # Reranker reranker_provider: str @@ -733,6 +757,9 @@ class HindsightConfig: reranker_zeroentropy_api_key: str | None reranker_zeroentropy_model: str reranker_zeroentropy_base_url: str | None + reranker_google_model: str + reranker_google_project_id: str | None + reranker_google_service_account_key: str | None # Server host: str @@ -882,6 +909,10 @@ class HindsightConfig: "reranker_zeroentropy_base_url", # Service Account Keys "llm_vertexai_service_account_key", + "embeddings_vertexai_service_account_key", + "reranker_google_service_account_key", + # Embeddings API keys + "embeddings_gemini_api_key", # File storage credentials "file_storage_s3_access_key_id", "file_storage_s3_secret_access_key", @@ -1160,6 
+1191,20 @@ def from_env(cls) -> "HindsightConfig": embeddings_litellm_sdk_output_dimensions=int(v) if (v := os.getenv(ENV_EMBEDDINGS_LITELLM_SDK_OUTPUT_DIMENSIONS)) else None, + # Gemini/Vertex AI embeddings (with fallback to LLM keys) + embeddings_gemini_api_key=os.getenv(ENV_EMBEDDINGS_GEMINI_API_KEY) or os.getenv(ENV_LLM_API_KEY), + embeddings_gemini_model=os.getenv(ENV_EMBEDDINGS_GEMINI_MODEL, DEFAULT_EMBEDDINGS_GEMINI_MODEL), + embeddings_gemini_output_dimensionality=int( + os.getenv( + ENV_EMBEDDINGS_GEMINI_OUTPUT_DIMENSIONALITY, + str(DEFAULT_EMBEDDINGS_GEMINI_OUTPUT_DIMENSIONALITY), + ) + ), + embeddings_vertexai_project_id=os.getenv(ENV_EMBEDDINGS_VERTEXAI_PROJECT_ID) + or os.getenv(ENV_LLM_VERTEXAI_PROJECT_ID), + embeddings_vertexai_region=os.getenv(ENV_EMBEDDINGS_VERTEXAI_REGION) or os.getenv(ENV_LLM_VERTEXAI_REGION), + embeddings_vertexai_service_account_key=os.getenv(ENV_EMBEDDINGS_VERTEXAI_SERVICE_ACCOUNT_KEY) + or os.getenv(ENV_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY), # Reranker reranker_provider=os.getenv(ENV_RERANKER_PROVIDER, DEFAULT_RERANKER_PROVIDER), reranker_local_model=os.getenv(ENV_RERANKER_LOCAL_MODEL, DEFAULT_RERANKER_LOCAL_MODEL), @@ -1209,6 +1254,12 @@ def from_env(cls) -> "HindsightConfig": reranker_zeroentropy_api_key=os.getenv(ENV_RERANKER_ZEROENTROPY_API_KEY), reranker_zeroentropy_model=os.getenv(ENV_RERANKER_ZEROENTROPY_MODEL, DEFAULT_RERANKER_ZEROENTROPY_MODEL), reranker_zeroentropy_base_url=os.getenv(ENV_RERANKER_ZEROENTROPY_BASE_URL) or None, + # Google Discovery Engine reranker (with fallback to LLM Vertex AI keys) + reranker_google_model=os.getenv(ENV_RERANKER_GOOGLE_MODEL, DEFAULT_RERANKER_GOOGLE_MODEL), + reranker_google_project_id=os.getenv(ENV_RERANKER_GOOGLE_PROJECT_ID) + or os.getenv(ENV_LLM_VERTEXAI_PROJECT_ID), + reranker_google_service_account_key=os.getenv(ENV_RERANKER_GOOGLE_SERVICE_ACCOUNT_KEY) + or os.getenv(ENV_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY), # Server host=os.getenv(ENV_HOST, DEFAULT_HOST), 
class GoogleCrossEncoder(CrossEncoderModel):
    """
    Google Discovery Engine cross-encoder using the Ranking REST API.

    Uses httpx + google-auth for lightweight REST calls (no gRPC/protobuf).
    Supports ADC (Application Default Credentials) or a service account key file.

    Available models:
    - semantic-ranker-default-004: Best quality, 1024 tokens/record (recommended)
    - semantic-ranker-fast-004: Lower latency, 1024 tokens/record

    Requests are split into batches of at most MAX_RECORDS_PER_REQUEST (200)
    records. The ranking-config location defaults to "global".
    """

    MAX_RECORDS_PER_REQUEST = 200
    API_BASE = "https://discoveryengine.googleapis.com/v1"
    SCOPES = ["https://www.googleapis.com/auth/cloud-platform"]

    def __init__(
        self,
        project_id: str,
        model: str = DEFAULT_RERANKER_GOOGLE_MODEL,
        service_account_key: str | None = None,
        location: str = "global",
        timeout: float = 60.0,
    ):
        """
        Initialize Google Discovery Engine cross-encoder.

        No network or credential work happens here; call initialize() before predict().

        Args:
            project_id: Google Cloud project ID
            model: Ranking model name (default: semantic-ranker-default-004)
            service_account_key: Path to service account JSON key file.
                If None, uses Application Default Credentials (ADC).
            location: API location (default: "global")
            timeout: Request timeout in seconds (default: 60.0)
        """
        self.project_id = project_id
        self.model = model
        self.service_account_key = service_account_key
        self.location = location
        self.timeout = timeout
        self._credentials = None
        self._client: httpx.Client | None = None
        self._rank_url: str | None = None

    @property
    def provider_name(self) -> str:
        return "google"

    def _get_auth_headers(self) -> dict[str, str]:
        """
        Build the Authorization header, refreshing the access token if needed.

        Raises:
            RuntimeError: If credentials have not been loaded by initialize().
        """
        if self._credentials is None:
            # Fail with the same message as predict() instead of an opaque
            # AttributeError on None.
            raise RuntimeError("Reranker not initialized. Call initialize() first.")

        import google.auth.transport.requests

        if not self._credentials.valid:
            self._credentials.refresh(google.auth.transport.requests.Request())
        return {"Authorization": f"Bearer {self._credentials.token}"}

    async def initialize(self) -> None:
        """
        Load credentials and create the HTTP client. Calling it again is a no-op.

        Raises:
            ImportError: If google-auth is not installed.
        """
        if self._client is not None:
            return

        auth_method = "ADC" if not self.service_account_key else "service_account"
        logger.info(
            f"Reranker: initializing Google Discovery Engine provider "
            f"(project={self.project_id}, model={self.model}, auth={auth_method})"
        )
        if self.service_account_key:
            try:
                from google.oauth2 import service_account
            except ImportError as err:
                raise ImportError(
                    "google-auth is required for GoogleCrossEncoder. Install it with: pip install google-auth"
                ) from err
            self._credentials = service_account.Credentials.from_service_account_file(
                self.service_account_key,
                scopes=self.SCOPES,
            )
        else:
            try:
                import google.auth
            except ImportError as err:
                raise ImportError(
                    "google-auth is required for GoogleCrossEncoder. Install it with: pip install google-auth"
                ) from err
            self._credentials, _ = google.auth.default(scopes=self.SCOPES)

        ranking_config = f"projects/{self.project_id}/locations/{self.location}/rankingConfigs/default_ranking_config"
        self._rank_url = f"{self.API_BASE}/{ranking_config}:rank"
        # Assigned last so that a failure above leaves the instance retryable.
        self._client = httpx.Client(timeout=self.timeout)

        logger.info("Reranker: Google Discovery Engine provider initialized")

    def _predict_sync(self, pairs: list[tuple[str, str]]) -> list[float]:
        """
        Score pairs with blocking REST calls.

        Pairs are grouped by query because one rank request carries exactly one
        query; each group is then sent in batches of MAX_RECORDS_PER_REQUEST.
        Scores are written back at each pair's original position. Pairs the API
        does not return keep a score of 0.0.
        """
        if not pairs:
            return []

        # query -> [(position in `pairs`, document text), ...]
        grouped: dict[str, list[tuple[int, str]]] = {}
        for position, (query, text) in enumerate(pairs):
            grouped.setdefault(query, []).append((position, text))

        all_scores = [0.0] * len(pairs)
        limit = self.MAX_RECORDS_PER_REQUEST

        for query, members in grouped.items():
            for start in range(0, len(members), limit):
                batch = members[start : start + limit]

                # Record ids are batch-local indices so the response can be
                # mapped back regardless of the order the API returns them in.
                records = [{"id": str(i), "content": text} for i, (_, text) in enumerate(batch)]

                response = self._client.post(
                    self._rank_url,
                    headers=self._get_auth_headers(),
                    json={
                        "model": self.model,
                        "query": query,
                        "records": records,
                        "topN": len(records),
                    },
                )
                response.raise_for_status()
                result = response.json()

                for record in result.get("records", []):
                    position = batch[int(record["id"])][0]
                    # A zero-valued "score" may be omitted from the JSON
                    # payload; treat a missing key as 0.0 rather than failing.
                    all_scores[position] = float(record.get("score", 0.0))

        return all_scores

    async def predict(self, pairs: list[tuple[str, str]]) -> list[float]:
        """
        Score query-document pairs using Google Discovery Engine Ranking API.

        The blocking HTTP work runs in the default executor so the event loop
        is not stalled.

        Args:
            pairs: List of (query, document) tuples to score

        Returns:
            List of relevance scores (0-1, higher = more relevant), in the
            same order as `pairs`

        Raises:
            RuntimeError: If initialize() has not been called.
        """
        if self._client is None:
            raise RuntimeError("Reranker not initialized. Call initialize() first.")

        if not pairs:
            return []

        # get_running_loop() is the supported call inside a coroutine.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._predict_sync, pairs)
class GeminiEmbeddings(Embeddings):
    """
    Google embeddings via the google.genai SDK.

    Supports both:
    1. Gemini API (generativelanguage.googleapis.com) with API key authentication
    2. Vertex AI with service account or Application Default Credentials (ADC)

    Vertex AI mode is selected when a vertexai_project_id is supplied.

    Uses the embed_content API: client.models.embed_content(model, contents)
    """

    def __init__(
        self,
        model: str = DEFAULT_EMBEDDINGS_GEMINI_MODEL,
        api_key: str | None = None,
        vertexai_project_id: str | None = None,
        vertexai_region: str | None = None,
        vertexai_service_account_key: str | None = None,
        output_dimensionality: int | None = None,
        batch_size: int = 100,
    ):
        """
        Store configuration; no client is created until initialize().

        Args:
            model: Embedding model name
            api_key: Gemini API key (used when vertexai_project_id is not set)
            vertexai_project_id: Google Cloud project; enables Vertex AI mode
            vertexai_region: Vertex AI region (falls back to "us-central1")
            vertexai_service_account_key: Path to a service account JSON key.
                If None, Vertex AI mode uses ADC.
            output_dimensionality: Requested embedding size; None leaves the
                request without an EmbedContentConfig
            batch_size: Number of texts sent per embed_content call
        """
        self.model = model
        self.api_key = api_key
        self.vertexai_project_id = vertexai_project_id
        self.vertexai_region = vertexai_region or "us-central1"
        self.vertexai_service_account_key = vertexai_service_account_key
        self.output_dimensionality = output_dimensionality
        self.batch_size = batch_size
        self._client = None
        self._dimension: int | None = None
        self._is_vertexai = vertexai_project_id is not None
        self._embed_config = None  # EmbedContentConfig, built during initialize()

    @property
    def provider_name(self) -> str:
        return "google"

    @property
    def dimension(self) -> int:
        if self._dimension is None:
            raise RuntimeError("Embeddings not initialized. Call initialize() first.")
        return self._dimension

    async def initialize(self) -> None:
        """
        Create the Google genai client and detect the embedding dimension.

        Calling it again after a successful run is a no-op. If dimension
        detection fails, the client is discarded so a later call retries
        instead of leaving a half-initialized instance.

        Raises:
            ImportError: If the google-genai package is not installed.
            ValueError: If required credentials/configuration are missing.
            RuntimeError: If the probe request returns no embeddings.
        """
        if self._client is not None:
            return

        try:
            from google import genai
            from google.genai import types as genai_types
        except ImportError as err:
            raise ImportError(
                "Google embeddings provider requires the 'google-genai' package. "
                "Install with: pip install google-genai"
            ) from err

        if self._is_vertexai:
            self._init_vertexai(genai)
        else:
            self._init_gemini(genai)

        try:
            # Build EmbedContentConfig if output_dimensionality is set
            if self.output_dimensionality is not None:
                self._embed_config = genai_types.EmbedContentConfig(
                    output_dimensionality=self.output_dimensionality,
                )

            # Detect dimension via a test embedding (respects output_dimensionality)
            probe = self._embed_batch(["test"]).embeddings
            if not probe:
                raise RuntimeError(
                    f"Google embeddings provider returned no embeddings for model {self.model}; "
                    "cannot detect embedding dimension"
                )
            self._dimension = len(probe[0].values)
        except Exception:
            # Roll back so the `_client is not None` guard above does not turn
            # the next initialize() into a no-op with the dimension still unset.
            self._client = None
            raise

        auth_mode = "vertex_ai" if self._is_vertexai else "api_key"
        logger.info(
            f"Embeddings: google provider initialized (auth: {auth_mode}, model: {self.model}, dim: {self._dimension})"
        )

    def _embed_batch(self, contents: list[str]):
        """Call embed_content for one batch, passing config only when one was built."""
        embed_kwargs = {"model": self.model, "contents": contents}
        if self._embed_config is not None:
            embed_kwargs["config"] = self._embed_config
        return self._client.models.embed_content(**embed_kwargs)  # type: ignore[union-attr]

    def _init_gemini(self, genai) -> None:
        """Initialize Gemini API client with API key."""
        if not self.api_key:
            raise ValueError("Gemini embeddings provider requires an API key")

        self._client = genai.Client(api_key=self.api_key)
        logger.info(f"Embeddings: initializing Gemini provider with model {self.model}")

    def _init_vertexai(self, genai) -> None:
        """Initialize Vertex AI client with project, region, and credentials."""
        if not self.vertexai_project_id:
            raise ValueError(
                "HINDSIGHT_API_EMBEDDINGS_VERTEXAI_PROJECT_ID (or HINDSIGHT_API_LLM_VERTEXAI_PROJECT_ID) "
                "is required for Vertex AI embeddings provider."
            )

        auth_method = "ADC"
        credentials = None

        if self.vertexai_service_account_key:
            try:
                from google.oauth2 import service_account
            except ImportError as err:
                raise ImportError(
                    "Vertex AI service account auth requires 'google-auth' package. "
                    "Install with: pip install google-auth"
                ) from err
            credentials = service_account.Credentials.from_service_account_file(
                self.vertexai_service_account_key,
                scopes=["https://www.googleapis.com/auth/cloud-platform"],
            )
            auth_method = "service_account"
            logger.info(f"Embeddings: Vertex AI using service account key: {self.vertexai_service_account_key}")

        # Strip google/ prefix from model name — native SDK uses bare names
        self.model = self.model.removeprefix("google/")

        client_kwargs = {
            "vertexai": True,
            "project": self.vertexai_project_id,
            "location": self.vertexai_region,
        }
        # Omit the key entirely under ADC so the SDK resolves credentials itself.
        if credentials is not None:
            client_kwargs["credentials"] = credentials

        self._client = genai.Client(**client_kwargs)
        logger.info(
            f"Embeddings: initializing Vertex AI provider "
            f"(project={self.vertexai_project_id}, region={self.vertexai_region}, "
            f"model={self.model}, auth={auth_method})"
        )

    def encode(self, texts: list[str]) -> list[list[float]]:
        """
        Generate embeddings using the Google genai SDK.

        Texts are sent in chunks of `batch_size`; results are returned in
        input order.

        Args:
            texts: List of text strings to encode

        Returns:
            List of embedding vectors, one per input text

        Raises:
            RuntimeError: If initialize() has not been called, or if the API
                returns a different number of embeddings than texts sent.
        """
        if self._client is None:
            raise RuntimeError("Embeddings not initialized. Call initialize() first.")

        if not texts:
            return []

        all_embeddings = []

        # Process in batches
        for start in range(0, len(texts), self.batch_size):
            batch = texts[start : start + self.batch_size]
            embeddings = self._embed_batch(batch).embeddings or []

            # A short (or missing) result would silently shift every later
            # vector onto the wrong text, so fail loudly instead.
            if len(embeddings) != len(batch):
                raise RuntimeError(
                    f"Google embeddings provider returned {len(embeddings)} embeddings for {len(batch)} texts"
                )

            all_embeddings.extend(emb.values for emb in embeddings)

        # L2-normalize when output_dimensionality is set — Gemini only returns
        # normalized vectors at full 3072 dims; truncated dims need re-normalization
        # for accurate cosine similarity.
        if self.output_dimensionality is not None:
            import numpy as np

            arr = np.array(all_embeddings)
            norms = np.linalg.norm(arr, axis=1, keepdims=True)
            norms[norms == 0] = 1  # leave all-zero vectors untouched instead of dividing by zero
            all_embeddings = (arr / norms).tolist()

        return all_embeddings
@@ -947,8 +1122,27 @@ def create_embeddings_from_env() -> Embeddings: api_base=config.embeddings_litellm_sdk_api_base, output_dimensions=config.embeddings_litellm_sdk_output_dimensions, ) + elif provider == "google": + vertexai_project_id = config.embeddings_vertexai_project_id + if vertexai_project_id: + api_key = None # Vertex AI uses ADC or service account + else: + api_key = config.embeddings_gemini_api_key + if not api_key: + raise ValueError( + f"{ENV_EMBEDDINGS_GEMINI_API_KEY} or {ENV_LLM_API_KEY} is required " + f"when {ENV_EMBEDDINGS_PROVIDER} is 'google' (set VERTEXAI_PROJECT_ID for Vertex AI auth instead)" + ) + return GeminiEmbeddings( + model=config.embeddings_gemini_model, + api_key=api_key, + vertexai_project_id=vertexai_project_id, + vertexai_region=config.embeddings_vertexai_region, + vertexai_service_account_key=config.embeddings_vertexai_service_account_key, + output_dimensionality=config.embeddings_gemini_output_dimensionality, + ) else: raise ValueError( f"Unknown embeddings provider: {provider}. " - f"Supported: 'local', 'tei', 'openai', 'cohere', 'litellm', 'litellm-sdk'" + f"Supported: 'local', 'tei', 'openai', 'cohere', 'google', 'litellm', 'litellm-sdk'" ) diff --git a/hindsight-api-slim/tests/test_gemini_embeddings.py b/hindsight-api-slim/tests/test_gemini_embeddings.py new file mode 100644 index 000000000..79d75bf21 --- /dev/null +++ b/hindsight-api-slim/tests/test_gemini_embeddings.py @@ -0,0 +1,336 @@ +""" +Tests for Google embeddings implementation (Gemini API + Vertex AI). + +These tests cover: +1. Initialization (Gemini API key, Vertex AI with ADC/service account) +2. Dimension detection via test embedding +3. Output dimensionality configuration +4. Encode (single text, multiple texts, batching, empty list, uninitialized) +5. Provider name and model name normalization +6. 
from typing import Any
from unittest.mock import MagicMock, patch

import pytest

from hindsight_api.config import (
    ENV_EMBEDDINGS_GEMINI_API_KEY,
    ENV_EMBEDDINGS_PROVIDER,
    HindsightConfig,
)
from hindsight_api.engine.embeddings import GeminiEmbeddings, create_embeddings_from_env


def _make_mock_embedding(values: list[float]) -> MagicMock:
    """Return a stand-in for one SDK embedding object exposing ``.values``."""
    return MagicMock(values=values)


def _make_mock_embed_result(embeddings_data: list[list[float]]) -> MagicMock:
    """Return a stand-in for an embed_content response exposing ``.embeddings``."""
    return MagicMock(embeddings=[_make_mock_embedding(vector) for vector in embeddings_data])


def _make_mock_genai(embed_result: Any = None) -> MagicMock:
    """Return a fake ``genai`` module whose Client().models.embed_content yields ``embed_result``.

    When no result is given, a single 768-dimensional embedding is used.
    """
    if embed_result is None:
        embed_result = _make_mock_embed_result([[0.1] * 768])
    fake_client = MagicMock()
    fake_client.models.embed_content = MagicMock(return_value=embed_result)
    fake_genai = MagicMock()
    fake_genai.Client = MagicMock(return_value=fake_client)
    return fake_genai


def _make_mock_google_module(mock_genai: MagicMock) -> MagicMock:
    """Return a fake top-level ``google`` package that exposes ``mock_genai`` as ``.genai``."""
    package = MagicMock()
    package.genai = mock_genai
    # EmbedContentConfig(**kw) produces a mock carrying the same attributes.
    package.genai.types.EmbedContentConfig = MagicMock(side_effect=lambda **kw: MagicMock(**kw))
    return package


def _patch_google_import(mock_genai: MagicMock):
    """Return a patcher that redirects ``google`` / ``google.genai`` imports to the mocks.

    Every other import is delegated to the real ``__import__``.
    """
    real_import = __import__

    def fake_import(name, *args, **kwargs):
        if name == "google.genai":
            return mock_genai
        if name == "google":
            return _make_mock_google_module(mock_genai)
        return real_import(name, *args, **kwargs)

    return patch("builtins.__import__", side_effect=fake_import)


def _attach_client(embedder: GeminiEmbeddings, dimension: int, **embed_content_kwargs) -> MagicMock:
    """Install a mock client on ``embedder`` (bypassing initialize()) and return it."""
    client = MagicMock()
    client.models.embed_content = MagicMock(**embed_content_kwargs)
    embedder._client = client
    embedder._dimension = dimension
    return client


class TestGeminiEmbeddings:
    """Unit tests for GeminiEmbeddings with mocked google.genai."""

    async def test_initialization_api_key_success(self):
        """API-key mode initializes, probes once, and reports the probed dimension."""
        fake_genai = _make_mock_genai()
        embedder = GeminiEmbeddings(model="gemini-embedding-001", api_key="test-key")

        with _patch_google_import(fake_genai):
            await embedder.initialize()

        assert embedder._client is not None
        assert embedder.dimension == 768
        assert embedder.provider_name == "google"
        assert embedder._is_vertexai is False
        fake_genai.Client.return_value.models.embed_content.assert_called_once()

    async def test_initialization_vertexai_success(self):
        """Vertex AI mode builds the client from project and region only (ADC)."""
        fake_genai = _make_mock_genai()
        embedder = GeminiEmbeddings(
            model="gemini-embedding-001",
            vertexai_project_id="test-project",
            vertexai_region="us-central1",
        )

        with _patch_google_import(fake_genai):
            await embedder.initialize()

        assert embedder._client is not None
        assert embedder.dimension == 768
        assert embedder.provider_name == "google"
        assert embedder._is_vertexai is True
        fake_genai.Client.assert_called_once_with(
            vertexai=True,
            project="test-project",
            location="us-central1",
        )

    async def test_initialization_missing_api_key(self):
        """Without a project id or an API key, initialize() raises ValueError."""
        fake_genai = _make_mock_genai()
        embedder = GeminiEmbeddings(model="gemini-embedding-001", api_key=None)

        with _patch_google_import(fake_genai):
            with pytest.raises(ValueError, match="requires an API key"):
                await embedder.initialize()

    async def test_initialization_vertexai_missing_project_id(self):
        """Vertex AI mode rejects a project id that was cleared after construction."""
        fake_genai = _make_mock_genai()
        embedder = GeminiEmbeddings(model="gemini-embedding-001", vertexai_project_id="temp")
        embedder.vertexai_project_id = None  # Simulate misconfiguration

        with _patch_google_import(fake_genai):
            with pytest.raises(ValueError, match="is required for Vertex AI"):
                await embedder.initialize()

    async def test_initialization_idempotent(self):
        """A second initialize() keeps the client created by the first."""
        fake_genai = _make_mock_genai()
        embedder = GeminiEmbeddings(model="gemini-embedding-001", api_key="test-key")

        with _patch_google_import(fake_genai):
            await embedder.initialize()
            client_after_first_call = embedder._client
            await embedder.initialize()
            assert embedder._client is client_after_first_call

    async def test_dimension_detection_via_test_embedding(self):
        """The dimension comes from the length of the probe embedding."""
        probe_result = _make_mock_embed_result([[0.5] * 256])
        fake_genai = _make_mock_genai(embed_result=probe_result)
        embedder = GeminiEmbeddings(model="some-new-model", api_key="test-key")

        with _patch_google_import(fake_genai):
            await embedder.initialize()

        assert embedder.dimension == 256

    async def test_output_dimensionality(self):
        """output_dimensionality is forwarded to embed_content as ``config``."""
        probe_result = _make_mock_embed_result([[0.1] * 256])
        fake_genai = _make_mock_genai(embed_result=probe_result)
        embedder = GeminiEmbeddings(model="gemini-embedding-001", api_key="test-key", output_dimensionality=256)

        with _patch_google_import(fake_genai):
            await embedder.initialize()

        assert embedder.dimension == 256
        assert embedder._embed_config is not None
        probe_call = fake_genai.Client.return_value.models.embed_content.call_args
        assert "config" in probe_call.kwargs

    async def test_no_output_dimensionality(self):
        """With output_dimensionality=None no config is built or sent."""
        fake_genai = _make_mock_genai()
        embedder = GeminiEmbeddings(model="gemini-embedding-001", api_key="test-key", output_dimensionality=None)

        with _patch_google_import(fake_genai):
            await embedder.initialize()

        assert embedder._embed_config is None
        probe_call = fake_genai.Client.return_value.models.embed_content.call_args
        assert "config" not in probe_call.kwargs

    def test_auto_detect_vertexai(self):
        """_is_vertexai follows the presence of vertexai_project_id."""
        with_api_key = GeminiEmbeddings(model="m", api_key="k")
        with_project = GeminiEmbeddings(model="m", vertexai_project_id="p")
        assert with_api_key._is_vertexai is False
        assert with_project._is_vertexai is True

    def test_encode_single_text(self):
        """One text yields one vector, returned unchanged."""
        embedder = GeminiEmbeddings(model="gemini-embedding-001", api_key="test-key")
        _attach_client(embedder, 3, return_value=_make_mock_embed_result([[0.1, 0.2, 0.3]]))

        assert embedder.encode(["hello"]) == [[0.1, 0.2, 0.3]]

    def test_encode_multiple_texts(self):
        """Several texts in one batch keep their order."""
        embedder = GeminiEmbeddings(model="gemini-embedding-001", api_key="test-key")
        _attach_client(
            embedder,
            2,
            return_value=_make_mock_embed_result([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]),
        )

        vectors = embedder.encode(["a", "b", "c"])
        assert len(vectors) == 3
        assert vectors[1] == [0.3, 0.4]

    def test_encode_batching(self):
        """Three texts with batch_size=2 produce two API calls, concatenated in order."""
        embedder = GeminiEmbeddings(model="gemini-embedding-001", api_key="test-key", batch_size=2)
        client = _attach_client(
            embedder,
            1,
            side_effect=[_make_mock_embed_result([[0.1], [0.2]]), _make_mock_embed_result([[0.3]])],
        )

        assert embedder.encode(["a", "b", "c"]) == [[0.1], [0.2], [0.3]]
        assert client.models.embed_content.call_count == 2

    def test_encode_passes_config(self):
        """A pre-built embed config is passed through on encode()."""
        embedder = GeminiEmbeddings(model="gemini-embedding-001", api_key="test-key")
        client = _attach_client(embedder, 2, return_value=_make_mock_embed_result([[0.1, 0.2]]))
        embedder._embed_config = MagicMock()

        embedder.encode(["hello"])
        assert client.models.embed_content.call_args.kwargs["config"] is embedder._embed_config

    def test_encode_empty_list(self):
        """An empty input list returns an empty list."""
        embedder = GeminiEmbeddings(model="gemini-embedding-001", api_key="test-key")
        embedder._client = MagicMock()
        embedder._dimension = 768
        assert embedder.encode([]) == []

    def test_encode_before_initialization(self):
        """encode() without initialize() raises RuntimeError."""
        embedder = GeminiEmbeddings(model="gemini-embedding-001", api_key="test-key")
        with pytest.raises(RuntimeError, match="not initialized"):
            embedder.encode(["test"])

    def test_dimension_before_initialization(self):
        """Reading dimension without initialize() raises RuntimeError."""
        embedder = GeminiEmbeddings(model="gemini-embedding-001", api_key="test-key")
        with pytest.raises(RuntimeError, match="not initialized"):
            _ = embedder.dimension

    def test_provider_name_always_google(self):
        """provider_name is "google" in both auth modes."""
        assert GeminiEmbeddings(model="m", api_key="k").provider_name == "google"
        assert GeminiEmbeddings(model="m", vertexai_project_id="p").provider_name == "google"

    def test_vertexai_strips_google_prefix(self):
        """Vertex AI setup removes a leading "google/" from the model name."""
        fake_genai = _make_mock_genai()
        embedder = GeminiEmbeddings(model="google/gemini-embedding-001", vertexai_project_id="test-project")
        embedder._init_vertexai(fake_genai)
        assert embedder.model == "gemini-embedding-001"

    def test_default_region(self):
        """The region falls back to us-central1 when not supplied."""
        embedder = GeminiEmbeddings(model="m", vertexai_project_id="proj")
        assert embedder.vertexai_region == "us-central1"

    def test_custom_region(self):
        """An explicit region is kept."""
        embedder = GeminiEmbeddings(model="m", vertexai_project_id="proj", vertexai_region="europe-west1")
        assert embedder.vertexai_region == "europe-west1"


class TestGeminiEmbeddingsFactory:
    """Tests for create_embeddings_from_env() with 'google' provider."""

    def _make_config(self, **overrides) -> HindsightConfig:
        """Build a HindsightConfig with placeholder values plus Gemini defaults.

        Fields annotated exactly "str", "int", "float" or "bool" get that
        type's zero value; every other field gets None. ``overrides`` win last.
        """
        from dataclasses import fields

        zero_values = {"str": "", "int": 0, "float": 0.0, "bool": False}
        settings = {f.name: zero_values.get(f.type) for f in fields(HindsightConfig)}

        settings.update(
            embeddings_provider="google",
            embeddings_gemini_api_key="test-key",
            embeddings_gemini_model="gemini-embedding-001",
            embeddings_gemini_output_dimensionality=768,
            embeddings_vertexai_project_id=None,
            embeddings_vertexai_region=None,
            embeddings_vertexai_service_account_key=None,
        )
        settings.update(overrides)
        return HindsightConfig(**settings)

    def test_create_with_api_key(self):
        """An API key alone yields a GeminiEmbeddings in API-key mode."""
        config = self._make_config()
        with patch("hindsight_api.config.get_config", return_value=config):
            embedder = create_embeddings_from_env()
        assert isinstance(embedder, GeminiEmbeddings)
        assert embedder.provider_name == "google"
        assert embedder.api_key == "test-key"
        assert embedder._is_vertexai is False

    def test_create_with_vertexai(self):
        """A project id without an API key yields Vertex AI mode."""
        config = self._make_config(
            embeddings_gemini_api_key=None,
            embeddings_vertexai_project_id="my-project",
            embeddings_vertexai_region="us-east1",
        )
        with patch("hindsight_api.config.get_config", return_value=config):
            embedder = create_embeddings_from_env()
        assert isinstance(embedder, GeminiEmbeddings)
        assert embedder._is_vertexai is True
        assert embedder.api_key is None
        assert embedder.vertexai_project_id == "my-project"

    def test_create_missing_all_credentials(self):
        """Neither an API key nor a project id raises ValueError."""
        config = self._make_config(embeddings_gemini_api_key=None, embeddings_vertexai_project_id=None)
        with patch("hindsight_api.config.get_config", return_value=config):
            with pytest.raises(ValueError, match="is required"):
                create_embeddings_from_env()

    def test_vertexai_takes_priority(self):
        """When both are configured, Vertex AI wins and the API key is dropped."""
        config = self._make_config(embeddings_gemini_api_key="key", embeddings_vertexai_project_id="proj")
        with patch("hindsight_api.config.get_config", return_value=config):
            embedder = create_embeddings_from_env()
        assert embedder._is_vertexai is True
        assert embedder.api_key is None

    def test_create_with_custom_dimensionality(self):
        """The configured output dimensionality reaches the embedder."""
        config = self._make_config(embeddings_gemini_output_dimensionality=256)
        with patch("hindsight_api.config.get_config", return_value=config):
            embedder = create_embeddings_from_env()
        assert embedder.output_dimensionality == 256
+++ b/hindsight-api-slim/tests/test_google_cross_encoder.py @@ -0,0 +1,275 @@ +""" +Tests for Google Discovery Engine cross-encoder (Ranking REST API). + +These tests cover: +1. Initialization (service account, ADC, missing project_id) +2. Predict (single query, multiple queries, batching, empty pairs, uninitialized) +3. Provider name +4. Factory function (create from env, validation errors) +""" + +from unittest.mock import MagicMock, patch + +import httpx +import pytest + +from hindsight_api.config import ( + ENV_RERANKER_GOOGLE_PROJECT_ID, + ENV_RERANKER_PROVIDER, + HindsightConfig, +) +from hindsight_api.engine.cross_encoder import GoogleCrossEncoder, create_cross_encoder_from_env + + +def _make_rank_response(records: list[tuple[str, float]]) -> dict: + """Build a JSON response matching the Discovery Engine REST API format.""" + return {"records": [{"id": rid, "score": score} for rid, score in records]} + + +def _make_mock_httpx_client(responses: list[dict] | None = None) -> MagicMock: + """Create a mock httpx.Client that returns predefined responses.""" + mock_client = MagicMock(spec=httpx.Client) + if responses: + side_effects = [] + for resp_json in responses: + mock_resp = MagicMock(spec=httpx.Response) + mock_resp.json.return_value = resp_json + mock_resp.raise_for_status.return_value = None + side_effects.append(mock_resp) + mock_client.post.side_effect = side_effects + return mock_client + + +def _make_mock_credentials() -> MagicMock: + """Create mock credentials with a valid token.""" + creds = MagicMock() + creds.valid = True + creds.token = "mock-token" + return creds + + +class TestGoogleCrossEncoder: + """Unit tests for GoogleCrossEncoder with mocked httpx + google-auth.""" + + async def test_initialization_adc_success(self): + """Test successful initialization with ADC (no service account key).""" + mock_creds = _make_mock_credentials() + + encoder = GoogleCrossEncoder(project_id="test-project") + + with patch("google.auth.default", 
return_value=(mock_creds, "test-project")): + await encoder.initialize() + + assert encoder._client is not None + assert encoder._credentials is mock_creds + assert encoder.provider_name == "google" + assert "test-project" in encoder._rank_url + + async def test_initialization_service_account(self): + """Test initialization with service account key.""" + mock_creds = _make_mock_credentials() + + encoder = GoogleCrossEncoder( + project_id="test-project", + service_account_key="/path/to/key.json", + ) + + with patch( + "google.oauth2.service_account.Credentials.from_service_account_file", + return_value=mock_creds, + ): + await encoder.initialize() + + assert encoder._client is not None + assert encoder._credentials is mock_creds + + async def test_initialization_idempotent(self): + """Test that calling initialize() twice is a no-op.""" + mock_creds = _make_mock_credentials() + encoder = GoogleCrossEncoder(project_id="test-project") + + with patch("google.auth.default", return_value=(mock_creds, "test-project")): + await encoder.initialize() + first_client = encoder._client + await encoder.initialize() + assert encoder._client is first_client + + async def test_predict_single_query(self): + """Test prediction with a single query and multiple documents.""" + mock_creds = _make_mock_credentials() + mock_client = _make_mock_httpx_client([ + _make_rank_response([("1", 0.95), ("0", 0.30)]), + ]) + + encoder = GoogleCrossEncoder(project_id="test-project") + with patch("google.auth.default", return_value=(mock_creds, "p")): + await encoder.initialize() + encoder._client = mock_client + + scores = await encoder.predict([ + ("What is AI?", "AI is artificial intelligence"), + ("What is AI?", "The sky is blue"), + ]) + + assert len(scores) == 2 + assert scores[0] == 0.30 # id="0" -> index 0 + assert scores[1] == 0.95 # id="1" -> index 1 + mock_client.post.assert_called_once() + + async def test_predict_multiple_queries(self): + """Test prediction with multiple distinct 
queries.""" + mock_creds = _make_mock_credentials() + mock_client = _make_mock_httpx_client([ + _make_rank_response([("0", 0.9), ("1", 0.1)]), + _make_rank_response([("0", 0.8)]), + ]) + + encoder = GoogleCrossEncoder(project_id="test-project") + with patch("google.auth.default", return_value=(mock_creds, "p")): + await encoder.initialize() + encoder._client = mock_client + + scores = await encoder.predict([ + ("Query A", "Doc A1"), + ("Query A", "Doc A2"), + ("Query B", "Doc B1"), + ]) + + assert len(scores) == 3 + assert scores[0] == 0.9 + assert scores[1] == 0.1 + assert scores[2] == 0.8 + assert mock_client.post.call_count == 2 + + async def test_predict_empty_pairs(self): + """Test that empty pairs returns empty list.""" + mock_creds = _make_mock_credentials() + encoder = GoogleCrossEncoder(project_id="test-project") + + with patch("google.auth.default", return_value=(mock_creds, "p")): + await encoder.initialize() + + scores = await encoder.predict([]) + assert scores == [] + + async def test_predict_not_initialized(self): + """Test that predict raises if not initialized.""" + encoder = GoogleCrossEncoder(project_id="test-project") + with pytest.raises(RuntimeError, match="not initialized"): + await encoder.predict([("q", "d")]) + + async def test_predict_batching(self): + """Test that >200 records are split into batches.""" + mock_creds = _make_mock_credentials() + mock_client = _make_mock_httpx_client([ + _make_rank_response([(str(i), 0.5) for i in range(200)]), + _make_rank_response([(str(i), 0.3) for i in range(50)]), + ]) + + encoder = GoogleCrossEncoder(project_id="test-project") + with patch("google.auth.default", return_value=(mock_creds, "p")): + await encoder.initialize() + encoder._client = mock_client + + pairs = [("same query", f"doc {i}") for i in range(250)] + scores = await encoder.predict(pairs) + + assert len(scores) == 250 + assert mock_client.post.call_count == 2 + + async def test_auth_header_sent(self): + """Test that Authorization 
header is sent with requests.""" + mock_creds = _make_mock_credentials() + mock_creds.token = "test-bearer-token" + mock_client = _make_mock_httpx_client([ + _make_rank_response([("0", 0.9)]), + ]) + + encoder = GoogleCrossEncoder(project_id="test-project") + with patch("google.auth.default", return_value=(mock_creds, "p")): + await encoder.initialize() + encoder._client = mock_client + + await encoder.predict([("q", "d")]) + + call_kwargs = mock_client.post.call_args + assert call_kwargs.kwargs["headers"]["Authorization"] == "Bearer test-bearer-token" + + def test_provider_name(self): + assert GoogleCrossEncoder(project_id="p").provider_name == "google" + + def test_default_model(self): + encoder = GoogleCrossEncoder(project_id="p") + assert encoder.model == "semantic-ranker-default-004" + + def test_custom_model(self): + encoder = GoogleCrossEncoder(project_id="p", model="semantic-ranker-fast-004") + assert encoder.model == "semantic-ranker-fast-004" + + def test_default_location(self): + encoder = GoogleCrossEncoder(project_id="p") + assert encoder.location == "global" + + +class TestGoogleCrossEncoderFactory: + """Tests for create_cross_encoder_from_env() with 'google' provider.""" + + def _make_config(self, **overrides) -> HindsightConfig: + from dataclasses import fields + + defaults = {} + for f in fields(HindsightConfig): + if f.type == "str": + defaults[f.name] = "" + elif f.type == "str | None": + defaults[f.name] = None + elif f.type == "int": + defaults[f.name] = 0 + elif f.type == "int | None": + defaults[f.name] = None + elif f.type == "float": + defaults[f.name] = 0.0 + elif f.type == "float | None": + defaults[f.name] = None + elif f.type == "bool": + defaults[f.name] = False + elif f.type == "list | None": + defaults[f.name] = None + else: + defaults[f.name] = None + + defaults["reranker_provider"] = "google" + defaults["reranker_google_model"] = "semantic-ranker-default-004" + defaults["reranker_google_project_id"] = "test-project" + 
defaults["reranker_google_service_account_key"] = None + + defaults.update(overrides) + return HindsightConfig(**defaults) + + def test_create_with_project_id(self): + config = self._make_config() + with patch("hindsight_api.config.get_config", return_value=config): + encoder = create_cross_encoder_from_env() + assert isinstance(encoder, GoogleCrossEncoder) + assert encoder.provider_name == "google" + assert encoder.project_id == "test-project" + assert encoder.service_account_key is None + + def test_create_with_service_account(self): + config = self._make_config(reranker_google_service_account_key="/path/to/key.json") + with patch("hindsight_api.config.get_config", return_value=config): + encoder = create_cross_encoder_from_env() + assert isinstance(encoder, GoogleCrossEncoder) + assert encoder.service_account_key == "/path/to/key.json" + + def test_create_missing_project_id(self): + config = self._make_config(reranker_google_project_id=None) + with patch("hindsight_api.config.get_config", return_value=config): + with pytest.raises(ValueError, match="is required"): + create_cross_encoder_from_env() + + def test_create_with_custom_model(self): + config = self._make_config(reranker_google_model="semantic-ranker-fast-004") + with patch("hindsight_api.config.get_config", return_value=config): + encoder = create_cross_encoder_from_env() + assert encoder.model == "semantic-ranker-fast-004" diff --git a/hindsight-docs/docs/developer/configuration.md b/hindsight-docs/docs/developer/configuration.md index 1179310bc..310d07691 100644 --- a/hindsight-docs/docs/developer/configuration.md +++ b/hindsight-docs/docs/developer/configuration.md @@ -352,7 +352,7 @@ export HINDSIGHT_API_RETAIN_LLM_MAX_BACKOFF=120.0 # Cap at 2min instead of 1m | Variable | Description | Default | |----------|-------------|---------| -| `HINDSIGHT_API_EMBEDDINGS_PROVIDER` | Provider: `local`, `tei`, `openai`, `cohere`, `litellm`, or `litellm-sdk` | `local` | +| `HINDSIGHT_API_EMBEDDINGS_PROVIDER` | 
Provider: `local`, `tei`, `openai`, `cohere`, `google`, `litellm`, or `litellm-sdk` | `local` | | `HINDSIGHT_API_EMBEDDINGS_LOCAL_MODEL` | Model for local provider | `BAAI/bge-small-en-v1.5` | | `HINDSIGHT_API_EMBEDDINGS_LOCAL_TRUST_REMOTE_CODE` | Allow loading models with custom code (security risk, disabled by default) | `false` | | `HINDSIGHT_API_EMBEDDINGS_LOCAL_FORCE_CPU` | Force CPU mode for local embeddings (avoids MPS/XPC issues on macOS) | `false` | @@ -370,6 +370,12 @@ export HINDSIGHT_API_RETAIN_LLM_MAX_BACKOFF=120.0 # Cap at 2min instead of 1m | `HINDSIGHT_API_EMBEDDINGS_LITELLM_SDK_MODEL` | LiteLLM SDK embedding model (use provider prefix, e.g., `cohere/embed-english-v3.0`) | `cohere/embed-english-v3.0` | | `HINDSIGHT_API_EMBEDDINGS_LITELLM_SDK_API_BASE` | Custom base URL for LiteLLM SDK embeddings (optional) | - | | `HINDSIGHT_API_EMBEDDINGS_LITELLM_SDK_OUTPUT_DIMENSIONS` | Optional output embedding dimensions (provider-dependent, e.g., `768` for Gemini embedding models) | - | +| `HINDSIGHT_API_EMBEDDINGS_GEMINI_API_KEY` | Gemini API key for embeddings (falls back to `HINDSIGHT_API_LLM_API_KEY`) | - | +| `HINDSIGHT_API_EMBEDDINGS_GEMINI_MODEL` | Gemini embedding model | `gemini-embedding-001` | +| `HINDSIGHT_API_EMBEDDINGS_GEMINI_OUTPUT_DIMENSIONALITY` | Output embedding dimensions (Gemini supports configurable dimensionality) | `768` | +| `HINDSIGHT_API_EMBEDDINGS_VERTEXAI_PROJECT_ID` | Vertex AI project ID for embeddings (falls back to `HINDSIGHT_API_LLM_VERTEXAI_PROJECT_ID`) | - | +| `HINDSIGHT_API_EMBEDDINGS_VERTEXAI_REGION` | Vertex AI region for embeddings (falls back to `HINDSIGHT_API_LLM_VERTEXAI_REGION`) | - | +| `HINDSIGHT_API_EMBEDDINGS_VERTEXAI_SERVICE_ACCOUNT_KEY` | Service account key for Vertex AI embeddings (falls back to `HINDSIGHT_API_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY`) | - | ```bash # Local (default) - uses SentenceTransformers @@ -413,6 +419,19 @@ export HINDSIGHT_API_EMBEDDINGS_LITELLM_API_BASE=http://localhost:4000 export 
HINDSIGHT_API_EMBEDDINGS_LITELLM_API_KEY=your-litellm-key # optional export HINDSIGHT_API_EMBEDDINGS_LITELLM_MODEL=text-embedding-3-small # or cohere/embed-english-v3.0 +# Google - Gemini API (API key auth) +export HINDSIGHT_API_EMBEDDINGS_PROVIDER=google +export HINDSIGHT_API_EMBEDDINGS_GEMINI_API_KEY=xxxxxxxxxxxx # or reuses HINDSIGHT_API_LLM_API_KEY +export HINDSIGHT_API_EMBEDDINGS_GEMINI_MODEL=gemini-embedding-001 # 768 dimensions (default) +# export HINDSIGHT_API_EMBEDDINGS_GEMINI_OUTPUT_DIMENSIONALITY=768 # configurable: 256, 512, 768, 1024, etc. + +# Google - Vertex AI auth (auto-detected when project ID is set) +export HINDSIGHT_API_EMBEDDINGS_PROVIDER=google +export HINDSIGHT_API_EMBEDDINGS_GEMINI_MODEL=gemini-embedding-001 +export HINDSIGHT_API_EMBEDDINGS_VERTEXAI_PROJECT_ID=your-gcp-project-id # falls back to HINDSIGHT_API_LLM_VERTEXAI_PROJECT_ID +# export HINDSIGHT_API_EMBEDDINGS_VERTEXAI_REGION=us-central1 # falls back to HINDSIGHT_API_LLM_VERTEXAI_REGION +# export HINDSIGHT_API_EMBEDDINGS_VERTEXAI_SERVICE_ACCOUNT_KEY=/path/to/key.json # falls back to LLM config, or uses ADC + # LiteLLM SDK - direct API access without proxy server (recommended) export HINDSIGHT_API_EMBEDDINGS_PROVIDER=litellm-sdk export HINDSIGHT_API_EMBEDDINGS_LITELLM_SDK_API_KEY=your-provider-api-key @@ -444,13 +463,15 @@ Supported OpenAI embedding dimensions: - `text-embedding-3-small`: 1536 dimensions - `text-embedding-3-large`: 3072 dimensions - `text-embedding-ada-002`: 1536 dimensions (legacy) + +Google's `gemini-embedding-001` produces 3072 dimensions natively but supports configurable output dimensionality. Set `HINDSIGHT_API_EMBEDDINGS_GEMINI_OUTPUT_DIMENSIONALITY` to control the output size (default: 768). 
::: ### Reranker | Variable | Description | Default | |----------|-------------|---------| -| `HINDSIGHT_API_RERANKER_PROVIDER` | Provider: `local`, `tei`, `cohere`, `zeroentropy`, `flashrank`, `litellm`, `litellm-sdk`, `jina-mlx`, or `rrf` | `local` | +| `HINDSIGHT_API_RERANKER_PROVIDER` | Provider: `local`, `tei`, `cohere`, `zeroentropy`, `google`, `flashrank`, `litellm`, `litellm-sdk`, `jina-mlx`, or `rrf` | `local` | | `HINDSIGHT_API_RERANKER_LOCAL_MODEL` | Model for local provider | `cross-encoder/ms-marco-MiniLM-L-6-v2` | | `HINDSIGHT_API_RERANKER_LOCAL_MAX_CONCURRENT` | Max concurrent local reranking (prevents CPU thrashing under load) | `4` | | `HINDSIGHT_API_RERANKER_LOCAL_TRUST_REMOTE_CODE` | Allow loading models with custom code (security risk, disabled by default) | `false` | @@ -474,6 +495,9 @@ Supported OpenAI embedding dimensions: | `HINDSIGHT_API_RERANKER_ZEROENTROPY_API_KEY` | ZeroEntropy API key for reranking | - | | `HINDSIGHT_API_RERANKER_ZEROENTROPY_MODEL` | ZeroEntropy rerank model (`zerank-2`, `zerank-2-small`) | `zerank-2` | | `HINDSIGHT_API_RERANKER_ZEROENTROPY_BASE_URL` | Custom base URL for ZeroEntropy-compatible API (e.g., mock server, proxy, or self-hosted deployment) | `https://api.zeroentropy.dev` | +| `HINDSIGHT_API_RERANKER_GOOGLE_PROJECT_ID` | Google Cloud project ID for Discovery Engine reranking (falls back to `HINDSIGHT_API_LLM_VERTEXAI_PROJECT_ID`) | - | +| `HINDSIGHT_API_RERANKER_GOOGLE_MODEL` | Google Discovery Engine ranking model | `semantic-ranker-default-004` | +| `HINDSIGHT_API_RERANKER_GOOGLE_SERVICE_ACCOUNT_KEY` | Path to service account JSON key (falls back to `HINDSIGHT_API_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY`). If unset, uses ADC. 
| - | | `HINDSIGHT_API_RERANKER_FLASHRANK_MODEL` | FlashRank model for fast CPU-based reranking | `ms-marco-MiniLM-L-12-v2` | | `HINDSIGHT_API_RERANKER_FLASHRANK_CACHE_DIR` | Cache directory for FlashRank models | System default | | `HINDSIGHT_API_RERANKER_JINA_MLX_MODEL_PATH` | Local path to downloaded `jina-reranker-v3-mlx` model (auto-downloads from HuggingFace if unset) | - | @@ -521,6 +545,12 @@ export HINDSIGHT_API_RERANKER_PROVIDER=litellm-sdk export HINDSIGHT_API_RERANKER_LITELLM_SDK_API_KEY=your-deepinfra-api-key export HINDSIGHT_API_RERANKER_LITELLM_SDK_MODEL=deepinfra/Qwen3-reranker-8B # or cohere/rerank-english-v3.0, etc. +# Google Discovery Engine - cloud-based semantic reranking +export HINDSIGHT_API_RERANKER_PROVIDER=google +export HINDSIGHT_API_RERANKER_GOOGLE_PROJECT_ID=your-gcp-project-id +export HINDSIGHT_API_RERANKER_GOOGLE_SERVICE_ACCOUNT_KEY=/path/to/service-account.json # optional, uses ADC if unset +export HINDSIGHT_API_RERANKER_GOOGLE_MODEL=semantic-ranker-default-004 # or semantic-ranker-fast-004 + # Jina MLX - Apple Silicon native reranking (no GPU/cloud required) # Model (~1.2 GB) is downloaded automatically from HuggingFace Hub on first use. export HINDSIGHT_API_RERANKER_PROVIDER=jina-mlx diff --git a/hindsight-docs/docs/developer/models.mdx b/hindsight-docs/docs/developer/models.mdx index ab42c80b0..5764a6f2c 100644 --- a/hindsight-docs/docs/developer/models.mdx +++ b/hindsight-docs/docs/developer/models.mdx @@ -376,6 +376,7 @@ Converts text into dense vector representations for semantic similarity search. 
| `local` | SentenceTransformers (default) | Development, low latency | | `openai` | OpenAI embeddings API | Production, high quality | | `cohere` | Cohere embeddings API | Production, multilingual | +| `google` | Google embeddings (Gemini API or Vertex AI) | Production, multilingual, high quality | | `tei` | HuggingFace Text Embeddings Inference | Production, self-hosted | | `litellm` | LiteLLM proxy (unified gateway) | Multi-provider setups | @@ -394,6 +395,14 @@ Converts text into dense vector representations for semantic similarity search. | `text-embedding-3-large` | 3072 | Higher quality, more expensive | | `text-embedding-ada-002` | 1536 | Legacy model | +### Google Models + +| Model | Dimensions | Use Case | +|-------|------------|----------| +| `gemini-embedding-001` | 768 (configurable) | Default Google, general purpose | + +Google's `gemini-embedding-001` supports configurable output dimensionality via truncation. Google recommends 768, 1536, or 3072 dimensions; set the size with `HINDSIGHT_API_EMBEDDINGS_GEMINI_OUTPUT_DIMENSIONALITY` (default: 768).
+ ### Cohere Models | Model | Dimensions | Use Case | @@ -422,6 +431,16 @@ export HINDSIGHT_API_EMBEDDINGS_PROVIDER=cohere export HINDSIGHT_API_COHERE_API_KEY=your-api-key export HINDSIGHT_API_EMBEDDINGS_COHERE_MODEL=embed-english-v3.0 +# Google (API key auth) +export HINDSIGHT_API_EMBEDDINGS_PROVIDER=google +export HINDSIGHT_API_EMBEDDINGS_GEMINI_API_KEY=xxxxxxxxxxxx +export HINDSIGHT_API_EMBEDDINGS_GEMINI_MODEL=gemini-embedding-001 + +# Google (Vertex AI auth - auto-detected when project ID is set) +export HINDSIGHT_API_EMBEDDINGS_PROVIDER=google +export HINDSIGHT_API_EMBEDDINGS_GEMINI_MODEL=gemini-embedding-001 +export HINDSIGHT_API_EMBEDDINGS_VERTEXAI_PROJECT_ID=your-gcp-project-id + # TEI (self-hosted) export HINDSIGHT_API_EMBEDDINGS_PROVIDER=tei export HINDSIGHT_API_EMBEDDINGS_TEI_URL=http://localhost:8080 diff --git a/skills/hindsight-docs/references/developer/configuration.md b/skills/hindsight-docs/references/developer/configuration.md index 66c4696a2..aceca4ac9 100644 --- a/skills/hindsight-docs/references/developer/configuration.md +++ b/skills/hindsight-docs/references/developer/configuration.md @@ -352,7 +352,7 @@ export HINDSIGHT_API_RETAIN_LLM_MAX_BACKOFF=120.0 # Cap at 2min instead of 1m | Variable | Description | Default | |----------|-------------|---------| -| `HINDSIGHT_API_EMBEDDINGS_PROVIDER` | Provider: `local`, `tei`, `openai`, `cohere`, `litellm`, or `litellm-sdk` | `local` | +| `HINDSIGHT_API_EMBEDDINGS_PROVIDER` | Provider: `local`, `tei`, `openai`, `cohere`, `google`, `litellm`, or `litellm-sdk` | `local` | | `HINDSIGHT_API_EMBEDDINGS_LOCAL_MODEL` | Model for local provider | `BAAI/bge-small-en-v1.5` | | `HINDSIGHT_API_EMBEDDINGS_LOCAL_TRUST_REMOTE_CODE` | Allow loading models with custom code (security risk, disabled by default) | `false` | | `HINDSIGHT_API_EMBEDDINGS_LOCAL_FORCE_CPU` | Force CPU mode for local embeddings (avoids MPS/XPC issues on macOS) | `false` | @@ -370,6 +370,12 @@ export HINDSIGHT_API_RETAIN_LLM_MAX_BACKOFF=120.0 
# Cap at 2min instead of 1m | `HINDSIGHT_API_EMBEDDINGS_LITELLM_SDK_MODEL` | LiteLLM SDK embedding model (use provider prefix, e.g., `cohere/embed-english-v3.0`) | `cohere/embed-english-v3.0` | | `HINDSIGHT_API_EMBEDDINGS_LITELLM_SDK_API_BASE` | Custom base URL for LiteLLM SDK embeddings (optional) | - | | `HINDSIGHT_API_EMBEDDINGS_LITELLM_SDK_OUTPUT_DIMENSIONS` | Optional output embedding dimensions (provider-dependent, e.g., `768` for Gemini embedding models) | - | +| `HINDSIGHT_API_EMBEDDINGS_GEMINI_API_KEY` | Gemini API key for embeddings (falls back to `HINDSIGHT_API_LLM_API_KEY`) | - | +| `HINDSIGHT_API_EMBEDDINGS_GEMINI_MODEL` | Gemini embedding model | `gemini-embedding-001` | +| `HINDSIGHT_API_EMBEDDINGS_GEMINI_OUTPUT_DIMENSIONALITY` | Output embedding dimensions (Gemini supports configurable dimensionality) | `768` | +| `HINDSIGHT_API_EMBEDDINGS_VERTEXAI_PROJECT_ID` | Vertex AI project ID for embeddings (falls back to `HINDSIGHT_API_LLM_VERTEXAI_PROJECT_ID`) | - | +| `HINDSIGHT_API_EMBEDDINGS_VERTEXAI_REGION` | Vertex AI region for embeddings (falls back to `HINDSIGHT_API_LLM_VERTEXAI_REGION`) | - | +| `HINDSIGHT_API_EMBEDDINGS_VERTEXAI_SERVICE_ACCOUNT_KEY` | Service account key for Vertex AI embeddings (falls back to `HINDSIGHT_API_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY`) | - | ```bash # Local (default) - uses SentenceTransformers @@ -413,6 +419,19 @@ export HINDSIGHT_API_EMBEDDINGS_LITELLM_API_BASE=http://localhost:4000 export HINDSIGHT_API_EMBEDDINGS_LITELLM_API_KEY=your-litellm-key # optional export HINDSIGHT_API_EMBEDDINGS_LITELLM_MODEL=text-embedding-3-small # or cohere/embed-english-v3.0 +# Google - Gemini API (API key auth) +export HINDSIGHT_API_EMBEDDINGS_PROVIDER=google +export HINDSIGHT_API_EMBEDDINGS_GEMINI_API_KEY=xxxxxxxxxxxx # or reuses HINDSIGHT_API_LLM_API_KEY +export HINDSIGHT_API_EMBEDDINGS_GEMINI_MODEL=gemini-embedding-001 # 768 dimensions (default) +# export HINDSIGHT_API_EMBEDDINGS_GEMINI_OUTPUT_DIMENSIONALITY=768 # configurable: 256, 512, 
768, 1024, etc. + +# Google - Vertex AI auth (auto-detected when project ID is set) +export HINDSIGHT_API_EMBEDDINGS_PROVIDER=google +export HINDSIGHT_API_EMBEDDINGS_GEMINI_MODEL=gemini-embedding-001 +export HINDSIGHT_API_EMBEDDINGS_VERTEXAI_PROJECT_ID=your-gcp-project-id # falls back to HINDSIGHT_API_LLM_VERTEXAI_PROJECT_ID +# export HINDSIGHT_API_EMBEDDINGS_VERTEXAI_REGION=us-central1 # falls back to HINDSIGHT_API_LLM_VERTEXAI_REGION +# export HINDSIGHT_API_EMBEDDINGS_VERTEXAI_SERVICE_ACCOUNT_KEY=/path/to/key.json # falls back to LLM config, or uses ADC + # LiteLLM SDK - direct API access without proxy server (recommended) export HINDSIGHT_API_EMBEDDINGS_PROVIDER=litellm-sdk export HINDSIGHT_API_EMBEDDINGS_LITELLM_SDK_API_KEY=your-provider-api-key @@ -444,13 +463,15 @@ Supported OpenAI embedding dimensions: - `text-embedding-3-small`: 1536 dimensions - `text-embedding-3-large`: 3072 dimensions - `text-embedding-ada-002`: 1536 dimensions (legacy) + +Google's `gemini-embedding-001` produces 3072 dimensions natively but supports configurable output dimensionality. Set `HINDSIGHT_API_EMBEDDINGS_GEMINI_OUTPUT_DIMENSIONALITY` to control the output size (default: 768). 
::: ### Reranker | Variable | Description | Default | |----------|-------------|---------| -| `HINDSIGHT_API_RERANKER_PROVIDER` | Provider: `local`, `tei`, `cohere`, `zeroentropy`, `flashrank`, `litellm`, `litellm-sdk`, `jina-mlx`, or `rrf` | `local` | +| `HINDSIGHT_API_RERANKER_PROVIDER` | Provider: `local`, `tei`, `cohere`, `zeroentropy`, `google`, `flashrank`, `litellm`, `litellm-sdk`, `jina-mlx`, or `rrf` | `local` | | `HINDSIGHT_API_RERANKER_LOCAL_MODEL` | Model for local provider | `cross-encoder/ms-marco-MiniLM-L-6-v2` | | `HINDSIGHT_API_RERANKER_LOCAL_MAX_CONCURRENT` | Max concurrent local reranking (prevents CPU thrashing under load) | `4` | | `HINDSIGHT_API_RERANKER_LOCAL_TRUST_REMOTE_CODE` | Allow loading models with custom code (security risk, disabled by default) | `false` | @@ -474,6 +495,9 @@ Supported OpenAI embedding dimensions: | `HINDSIGHT_API_RERANKER_ZEROENTROPY_API_KEY` | ZeroEntropy API key for reranking | - | | `HINDSIGHT_API_RERANKER_ZEROENTROPY_MODEL` | ZeroEntropy rerank model (`zerank-2`, `zerank-2-small`) | `zerank-2` | | `HINDSIGHT_API_RERANKER_ZEROENTROPY_BASE_URL` | Custom base URL for ZeroEntropy-compatible API (e.g., mock server, proxy, or self-hosted deployment) | `https://api.zeroentropy.dev` | +| `HINDSIGHT_API_RERANKER_GOOGLE_PROJECT_ID` | Google Cloud project ID for Discovery Engine reranking (falls back to `HINDSIGHT_API_LLM_VERTEXAI_PROJECT_ID`) | - | +| `HINDSIGHT_API_RERANKER_GOOGLE_MODEL` | Google Discovery Engine ranking model | `semantic-ranker-default-004` | +| `HINDSIGHT_API_RERANKER_GOOGLE_SERVICE_ACCOUNT_KEY` | Path to service account JSON key (falls back to `HINDSIGHT_API_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY`). If unset, uses ADC. 
| - | | `HINDSIGHT_API_RERANKER_FLASHRANK_MODEL` | FlashRank model for fast CPU-based reranking | `ms-marco-MiniLM-L-12-v2` | | `HINDSIGHT_API_RERANKER_FLASHRANK_CACHE_DIR` | Cache directory for FlashRank models | System default | | `HINDSIGHT_API_RERANKER_JINA_MLX_MODEL_PATH` | Local path to downloaded `jina-reranker-v3-mlx` model (auto-downloads from HuggingFace if unset) | - | @@ -521,6 +545,12 @@ export HINDSIGHT_API_RERANKER_PROVIDER=litellm-sdk export HINDSIGHT_API_RERANKER_LITELLM_SDK_API_KEY=your-deepinfra-api-key export HINDSIGHT_API_RERANKER_LITELLM_SDK_MODEL=deepinfra/Qwen3-reranker-8B # or cohere/rerank-english-v3.0, etc. +# Google Discovery Engine - cloud-based semantic reranking +export HINDSIGHT_API_RERANKER_PROVIDER=google +export HINDSIGHT_API_RERANKER_GOOGLE_PROJECT_ID=your-gcp-project-id +export HINDSIGHT_API_RERANKER_GOOGLE_SERVICE_ACCOUNT_KEY=/path/to/service-account.json # optional, uses ADC if unset +export HINDSIGHT_API_RERANKER_GOOGLE_MODEL=semantic-ranker-default-004 # or semantic-ranker-fast-004 + # Jina MLX - Apple Silicon native reranking (no GPU/cloud required) # Model (~1.2 GB) is downloaded automatically from HuggingFace Hub on first use. export HINDSIGHT_API_RERANKER_PROVIDER=jina-mlx diff --git a/skills/hindsight-docs/references/developer/models.md b/skills/hindsight-docs/references/developer/models.md index f82bdf93e..1bc5b01f3 100644 --- a/skills/hindsight-docs/references/developer/models.md +++ b/skills/hindsight-docs/references/developer/models.md @@ -367,6 +367,7 @@ Converts text into dense vector representations for semantic similarity search. 
| `local` | SentenceTransformers (default) | Development, low latency | | `openai` | OpenAI embeddings API | Production, high quality | | `cohere` | Cohere embeddings API | Production, multilingual | +| `google` | Google embeddings (Gemini API or Vertex AI) | Production, multilingual, high quality | | `tei` | HuggingFace Text Embeddings Inference | Production, self-hosted | | `litellm` | LiteLLM proxy (unified gateway) | Multi-provider setups | @@ -385,6 +386,14 @@ Converts text into dense vector representations for semantic similarity search. | `text-embedding-3-large` | 3072 | Higher quality, more expensive | | `text-embedding-ada-002` | 1536 | Legacy model | +### Google Models + +| Model | Dimensions | Use Case | +|-------|------------|----------| +| `gemini-embedding-001` | 768 (configurable) | Default Google, general purpose | + +Google's `gemini-embedding-001` supports configurable output dimensionality via truncation. Google recommends 768, 1536, or 3072 dimensions; set the size with `HINDSIGHT_API_EMBEDDINGS_GEMINI_OUTPUT_DIMENSIONALITY` (default: 768). + ### Cohere Models | Model | Dimensions | Use Case | @@ -412,6 +421,16 @@ export HINDSIGHT_API_EMBEDDINGS_PROVIDER=cohere export HINDSIGHT_API_COHERE_API_KEY=your-api-key export HINDSIGHT_API_EMBEDDINGS_COHERE_MODEL=embed-english-v3.0 +# Google (API key auth) +export HINDSIGHT_API_EMBEDDINGS_PROVIDER=google +export HINDSIGHT_API_EMBEDDINGS_GEMINI_API_KEY=xxxxxxxxxxxx +export HINDSIGHT_API_EMBEDDINGS_GEMINI_MODEL=gemini-embedding-001 + +# Google (Vertex AI auth - auto-detected when project ID is set) +export HINDSIGHT_API_EMBEDDINGS_PROVIDER=google +export HINDSIGHT_API_EMBEDDINGS_GEMINI_MODEL=gemini-embedding-001 +export HINDSIGHT_API_EMBEDDINGS_VERTEXAI_PROJECT_ID=your-gcp-project-id + # TEI (self-hosted) export HINDSIGHT_API_EMBEDDINGS_PROVIDER=tei export HINDSIGHT_API_EMBEDDINGS_TEI_URL=http://localhost:8080