-
Notifications
You must be signed in to change notification settings - Fork 29
fix for indexing issue with knowledge bases #274
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
e5dbd45
82ec696
da2062b
988f23f
37acaf2
762e812
d5dd2ee
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,84 @@ | ||
| """add hnsw index on knowledge_base_embeddings | ||
|
|
||
| Revision ID: e8f2a1c3b5d9 | ||
| Revises: c7a9e2f4b1d0 | ||
| Create Date: 2026-04-10 10:00:00.000000 | ||
|
|
||
| """ | ||
|
|
||
| from typing import Sequence, Union | ||
|
|
||
| from alembic import op | ||
| from sqlalchemy import text | ||
|
|
||
|
|
||
# revision identifiers, used by Alembic.
# `revision` is this migration's own ID and `down_revision` is the migration it
# revises; both match the "Revision ID" / "Revises" lines in the module docstring.
revision: str = 'e8f2a1c3b5d9'
down_revision: Union[str, None] = 'c7a9e2f4b1d0'
# No branch labels and no cross-migration dependencies are declared here.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
|
github-code-quality[bot] marked this conversation as resolved.
Fixed
|
||
|
|
||
|
|
||
def upgrade() -> None:
    """Create the HNSW and GIN indexes on ``knowledge_base_embeddings``.

    Three indexes are built, each with ``CREATE INDEX CONCURRENTLY``:

    * ``ix_kbe_embedding_vector_hnsw_cosine``   - HNSW, cosine ops, on
      ``embedding_vector::vector(512)`` (CLIP image / text embeddings).
    * ``ix_kbe_embedding_vector_1_hnsw_cosine`` - HNSW, cosine ops, on
      ``embedding_vector_1::vector(1024)`` (DINO image embeddings).
    * ``ix_kbe_token_gin``                      - GIN on ``token``.

    HNSW requires a dimensioned vector expression; the columns are stored
    without dimensions, so the index expressions cast inline.

    A failed or interrupted concurrent build leaves an INVALID index behind,
    and ``IF NOT EXISTS`` would then skip it silently on a re-run. Any invalid
    leftover with one of the names above is therefore dropped before the index
    is (re)built, so re-running the migration actually repairs it.
    """
    # (index name, everything that follows the name in CREATE INDEX).
    index_definitions = (
        (
            'ix_kbe_embedding_vector_hnsw_cosine',
            """
            ON knowledge_base_embeddings
            USING hnsw ((embedding_vector::vector(512)) vector_cosine_ops)
            WITH (m = 16, ef_construction = 64)
            """,
        ),
        (
            'ix_kbe_embedding_vector_1_hnsw_cosine',
            """
            ON knowledge_base_embeddings
            USING hnsw ((embedding_vector_1::vector(1024)) vector_cosine_ops)
            WITH (m = 16, ef_construction = 64)
            """,
        ),
        (
            'ix_kbe_token_gin',
            """
            ON knowledge_base_embeddings
            USING gin (token)
            """,
        ),
    )

    # CREATE INDEX CONCURRENTLY cannot run inside a transaction block.
    # SQLAlchemy 2.x autobegins a transaction on op.get_bind(), and
    # execution_options(isolation_level=AUTOCOMMIT) is rejected while a
    # Transaction object is active. We get a fresh AUTOCOMMIT connection
    # directly from the underlying sync engine instead.
    bind = op.get_bind()
    sync_engine = getattr(bind.engine, 'sync_engine', bind.engine)

    # Resolves the name through the search_path and matches only an index that
    # exists but is flagged invalid.
    invalid_index_query = text(
        'SELECT 1 FROM pg_catalog.pg_index '
        'WHERE indexrelid = to_regclass(CAST(:index_name AS text)) '
        'AND NOT indisvalid'
    )

    with sync_engine.execution_options(isolation_level='AUTOCOMMIT').connect() as conn:
        # Session-level setting; it only affects this dedicated connection.
        conn.execute(text("SET maintenance_work_mem = '2GB'"))

        for index_name, index_definition in index_definitions:
            leftover = conn.execute(
                invalid_index_query, {'index_name': index_name}
            ).first()
            if leftover is not None:
                # NOTE(review): an index still being built concurrently by
                # another session is also flagged invalid - this assumes
                # migrations are not run in parallel.
                conn.execute(
                    text(f'DROP INDEX CONCURRENTLY IF EXISTS {index_name}')
                )

            conn.execute(
                text(
                    f'CREATE INDEX CONCURRENTLY IF NOT EXISTS {index_name}'
                    f'{index_definition}'
                )
            )
|
|
||
|
|
||
def downgrade() -> None:
    """Drop the three indexes created by ``upgrade``.

    ``DROP INDEX CONCURRENTLY`` is issued for each index on a dedicated
    AUTOCOMMIT connection taken from the underlying sync engine, in the same
    order the indexes were created.
    """
    drop_statements = (
        'DROP INDEX CONCURRENTLY IF EXISTS ix_kbe_embedding_vector_hnsw_cosine',
        'DROP INDEX CONCURRENTLY IF EXISTS ix_kbe_embedding_vector_1_hnsw_cosine',
        'DROP INDEX CONCURRENTLY IF EXISTS ix_kbe_token_gin',
    )

    migration_bind = op.get_bind()
    # Use the wrapped sync engine when the bind's engine exposes one.
    engine = getattr(migration_bind.engine, 'sync_engine', migration_bind.engine)
    autocommit_engine = engine.execution_options(isolation_level='AUTOCOMMIT')

    with autocommit_engine.connect() as conn:
        for statement in drop_statements:
            conn.execute(text(statement))
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -89,24 +89,37 @@ def get_combined_search_query( | |
| ) | ||
| query_params.update(filter_params) | ||
| sql_query = f""" | ||
| WITH vector_results AS ( | ||
| WITH hnsw_candidates AS ( | ||
| SELECT | ||
| e.id as embedding_id, | ||
| e.chunk_text, | ||
| e.chunk_index, | ||
| id, | ||
| document_id, | ||
| chunk_text, | ||
| chunk_index, | ||
| (embedding_vector::vector(512)) <=> :query_embed ::vector(512) AS distance | ||
| FROM | ||
| {KnowledgeBaseEmbeddings.__tablename__} | ||
| ORDER BY | ||
| (embedding_vector::vector(512)) <=> :query_embed ::vector(512) | ||
| LIMIT :limit * 20 | ||
| ), | ||
| vector_results AS ( | ||
| SELECT | ||
| hc.id as embedding_id, | ||
| hc.chunk_text, | ||
| hc.chunk_index, | ||
| d.id as document_id, | ||
| d.file_path, | ||
| d.knowledge_base_id, | ||
| d.metadata_value, | ||
| 1 - (e.embedding_vector <=> :query_embed ::vector) as vector_score | ||
| 1 - hc.distance as vector_score | ||
| FROM | ||
| {KnowledgeBaseEmbeddings.__tablename__} e | ||
| hnsw_candidates hc | ||
| JOIN | ||
| {KnowledgeBaseDocuments.__tablename__} d ON e.document_id = d.id | ||
| {KnowledgeBaseDocuments.__tablename__} d ON hc.document_id = d.id | ||
| WHERE | ||
| d.knowledge_base_id = :kb_id {'AND (' + metadata_filter_clause_inner + ')' if metadata_filter_clause_inner else ''} | ||
| ORDER BY | ||
| vector_score DESC | ||
| hc.distance ASC | ||
| LIMIT :limit | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| ), | ||
| keyword_results AS ( | ||
|
|
@@ -169,40 +182,45 @@ def get_image_embedding( | |
| 'kb_id': kb_id, | ||
| 'top_k': top_k, | ||
| } | ||
| metadata_filter_clause_final = '' | ||
| metadata_filter_clause = '' | ||
| if filter: | ||
| where_clause, filter_params = self.odata_parser.prepare_odata_filter(filter) | ||
| if where_clause and filter_params: | ||
| metadata_filter_clause_final = self.build_metadata_clause( | ||
| metadata_filter_clause = self.build_metadata_clause( | ||
| where_clause, | ||
| filter_params, | ||
| lambda field: f"(d.metadata_value ->> '{field}')", | ||
| ) | ||
| params.update(filter_params) | ||
| sql_query = f""" | ||
| WITH ranked_embeddings AS ( | ||
| WITH hnsw_candidates AS ( | ||
| SELECT | ||
| e.id AS embedding_id, | ||
| e.chunk_text, | ||
| e.chunk_index, | ||
| d.id AS document_id, | ||
| d.file_path, | ||
| d.file_name, | ||
| d.knowledge_base_id, | ||
| d.metadata_value, | ||
| e.embedding_vector <-> :query_embedding ::vector AS distance | ||
| id, | ||
| document_id, | ||
| chunk_text, | ||
| chunk_index, | ||
| (embedding_vector::vector(512)) <=> :query_embedding ::vector(512) AS distance | ||
| FROM | ||
| {KnowledgeBaseEmbeddings.__tablename__} e | ||
| JOIN | ||
| {KnowledgeBaseDocuments.__tablename__} d ON e.document_id = d.id | ||
| WHERE | ||
| d.knowledge_base_id = :kb_id {'AND (' + metadata_filter_clause_final + ')' if metadata_filter_clause_final else ''} | ||
| ORDER BY distance ASC | ||
| {KnowledgeBaseEmbeddings.__tablename__} | ||
| ORDER BY | ||
| (embedding_vector::vector(512)) <=> :query_embedding ::vector(512) | ||
| LIMIT :top_k * 20 | ||
| ) | ||
| SELECT | ||
| * | ||
| FROM | ||
| ranked_embeddings | ||
| hc.id AS embedding_id, | ||
| hc.chunk_text, | ||
| hc.chunk_index, | ||
| d.id AS document_id, | ||
| d.file_path, | ||
| d.file_name, | ||
| d.knowledge_base_id, | ||
| d.metadata_value, | ||
| hc.distance | ||
| FROM hnsw_candidates hc | ||
| JOIN {KnowledgeBaseDocuments.__tablename__} d ON hc.document_id = d.id | ||
| WHERE d.knowledge_base_id = :kb_id | ||
| {'AND (' + metadata_filter_clause + ')' if metadata_filter_clause else ''} | ||
| ORDER BY hc.distance ASC | ||
| LIMIT :top_k | ||
|
Comment on lines
+196
to
224
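There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Scope the CLIP ANN candidate set to the requested knowledge base before applying `LIMIT :top_k * 20`; otherwise the knowledge-base and metadata filters run only on the global nearest neighbours, and matching rows can be dropped.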
Possible fix WITH hnsw_candidates AS (
SELECT
id,
document_id,
chunk_text,
chunk_index,
(embedding_vector::vector(512)) <=> :query_embedding ::vector(512) AS distance
FROM
{KnowledgeBaseEmbeddings.__tablename__}
+ WHERE
+ document_id IN (
+ SELECT d.id
+ FROM {KnowledgeBaseDocuments.__tablename__} d
+ WHERE d.knowledge_base_id = :kb_id
+ {'AND (' + metadata_filter_clause + ')' if metadata_filter_clause else ''}
+ )
ORDER BY
(embedding_vector::vector(512)) <=> :query_embedding ::vector(512)
LIMIT :top_k * 20
)🤖 Prompt for AI Agents |
||
| """ | ||
|
|
||
|
|
@@ -232,49 +250,52 @@ def get_image_embedding_dino( | |
| params = { | ||
| 'query_embedding': query_embeddings, | ||
| 'kb_id': kb_id, | ||
| 'top_k': effective_limit, | ||
| 'reference_ids': processed_reference_ids, | ||
| 'offset': effective_offset, | ||
| 'limit': effective_limit, | ||
| } | ||
|
|
||
| metadata_filter_clause_final = '' | ||
| metadata_filter_clause = '' | ||
| if filter: | ||
| where_clause, filter_params = self.odata_parser.prepare_odata_filter(filter) | ||
| if where_clause and filter_params: | ||
| metadata_filter_clause_final = self.build_metadata_clause( | ||
| metadata_filter_clause = self.build_metadata_clause( | ||
| where_clause, | ||
| filter_params, | ||
| lambda field: f"(d.metadata_value ->> '{field}')", | ||
| ) | ||
| params.update(filter_params) | ||
| # Use ANY operator for PostgreSQL array matching | ||
| reference_filter = ( | ||
| 'AND e.document_id = ANY(:reference_ids)' if processed_reference_ids else '' | ||
| ) | ||
|
|
||
| sql_query = f""" | ||
| WITH ranked_embeddings AS ( | ||
| WITH hnsw_candidates AS ( | ||
| SELECT | ||
| e.id AS embedding_id, | ||
| e.chunk_text, | ||
| e.chunk_index, | ||
| d.id AS document_id, | ||
| d.file_path, | ||
| d.file_name, | ||
| d.knowledge_base_id, | ||
| d.metadata_value, | ||
| (1 - (e.embedding_vector_1 <=> :query_embedding ::vector)) AS similarity | ||
| FROM {KnowledgeBaseEmbeddings.__tablename__} e | ||
| JOIN {KnowledgeBaseDocuments.__tablename__} d ON e.document_id = d.id | ||
| WHERE | ||
| d.knowledge_base_id = :kb_id {reference_filter} {'AND (' + metadata_filter_clause_final + ')' if metadata_filter_clause_final else ''} | ||
| ORDER BY similarity DESC | ||
| id, | ||
| document_id, | ||
| chunk_text, | ||
| chunk_index, | ||
| (embedding_vector_1::vector(1024)) <=> :query_embedding ::vector(1024) AS distance | ||
| FROM | ||
| {KnowledgeBaseEmbeddings.__tablename__} | ||
| ORDER BY | ||
| (embedding_vector_1::vector(1024)) <=> :query_embedding ::vector(1024) | ||
| LIMIT :limit * 20 | ||
| ) | ||
| SELECT | ||
| * | ||
| FROM | ||
| ranked_embeddings | ||
| hc.id AS embedding_id, | ||
| hc.chunk_text, | ||
| hc.chunk_index, | ||
| d.id AS document_id, | ||
| d.file_path, | ||
| d.file_name, | ||
| d.knowledge_base_id, | ||
| d.metadata_value, | ||
| 1 - hc.distance AS similarity | ||
| FROM hnsw_candidates hc | ||
| JOIN {KnowledgeBaseDocuments.__tablename__} d ON hc.document_id = d.id | ||
| WHERE d.knowledge_base_id = :kb_id | ||
| {('AND d.id = ANY(:reference_ids)' if processed_reference_ids else '')} | ||
| {'AND (' + metadata_filter_clause + ')' if metadata_filter_clause else ''} | ||
| ORDER BY similarity DESC | ||
| LIMIT :limit OFFSET :offset | ||
|
Comment on lines
+270
to
299
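There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Filter the DINO ANN candidate set by knowledge base, reference IDs, and metadata before applying `LIMIT :limit * 20`; otherwise those filters run only on the global nearest neighbours, and matching rows can be dropped.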
Possible fix WITH hnsw_candidates AS (
SELECT
id,
document_id,
chunk_text,
chunk_index,
(embedding_vector_1::vector(1024)) <=> :query_embedding ::vector(1024) AS distance
FROM
{KnowledgeBaseEmbeddings.__tablename__}
+ WHERE
+ document_id IN (
+ SELECT d.id
+ FROM {KnowledgeBaseDocuments.__tablename__} d
+ WHERE d.knowledge_base_id = :kb_id
+ {('AND d.id = ANY(:reference_ids)' if processed_reference_ids else '')}
+ {'AND (' + metadata_filter_clause + ')' if metadata_filter_clause else ''}
+ )
ORDER BY
(embedding_vector_1::vector(1024)) <=> :query_embedding ::vector(1024)
LIMIT :limit * 20
)🤖 Prompt for AI Agents |
||
| """ | ||
|
|
||
|
|
@@ -346,4 +367,4 @@ def get_update_tokens_query() -> str: | |
| Returns: | ||
| SQL query string | ||
| """ | ||
| return "UPDATE knowledge_base_embeddings SET token = to_tsvector('english', chunk_text)" | ||
| return "UPDATE knowledge_base_embeddings SET token = to_tsvector('english', chunk_text) WHERE token IS NULL" | ||
Uh oh!
There was an error while loading. Please reload this page.