From f3716ab9d3d71c7ceef8a0cb70cf8190884c94ea Mon Sep 17 00:00:00 2001
From: yc111233
Date: Fri, 10 Apr 2026 02:19:43 +0800
Subject: [PATCH] fix: backfill abstract from file content in vectorize_file

When index_resource calls vectorize_file without a summary in
summary_dict, the abstract field on the Context is set to an empty
string. This means leaf (L2) records in the vector database end up with
an empty abstract.

Downstream, hierarchical_retriever passes these empty abstracts as
documents to the rerank API, which causes rerank providers (e.g.
DashScope qwen3-rerank) to return HTTP 400 because they reject empty
document strings.

Fix: when vectorize_file reads raw file content for embedding and the
abstract is still empty, backfill it with the first 200 characters of
the file content. This ensures every L2 record has a non-empty abstract
for reranking.

Co-Authored-By: Claude Opus 4.6
---
 openviking/utils/embedding_utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/openviking/utils/embedding_utils.py b/openviking/utils/embedding_utils.py
index cf442cc0a..54a7ba985 100644
--- a/openviking/utils/embedding_utils.py
+++ b/openviking/utils/embedding_utils.py
@@ -282,6 +282,8 @@ def _truncate_text(value: str) -> str:
                 if isinstance(content, bytes):
                     content = content.decode("utf-8", errors="replace")
                 content = _truncate_text(content)
+                if not context.abstract and content:
+                    context.abstract = content[:200]
                 context.set_vectorize(Vectorize(text=content))
             except Exception as e:
                 logger.warning(