⚡️ Speed up method VectorIndexAutoRetriever.agenerate_retrieval_spec by 684%
#136
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
📄 684% (6.84x) speedup for
VectorIndexAutoRetriever.agenerate_retrieval_spec in llama-index-core/llama_index/core/indices/vector_store/retrievers/auto_retriever/auto_retriever.py ⏱️ Runtime:
39.4 milliseconds → 5.03 milliseconds (best of 38 runs) 📝 Explanation and details
The optimization achieves a 683% speedup by caching expensive JSON serialization operations that were being redundantly computed on every method call.
Key Optimization:
__init__: The calls to VectorStoreQuerySpec.schema_json(indent=4) and self._vector_store_info.json(indent=4) are now cached as the instance attributes _schema_str and _info_str during initialization, rather than being recomputed every time agenerate_retrieval_spec is called. Why This Works:
schema_json() alone consumed 85.8% of execution time in the original version (212ms out of 247ms total). These values are static - the schema doesn't change between calls, and vector store info is typically immutable. Performance Impact:
Test Case Benefits:
The optimization excels in scenarios with repeated calls (throughput tests, concurrent execution tests, and large-scale operations), where the same retriever instance handles multiple queries - exactly the common usage pattern for vector search applications.
✅ Correctness verification report:
🌀 Generated Regression Tests and Runtime
import asyncio # used to run async functions
import logging
from typing import Any, List, Optional
import pytest # used for our unit tests
from llama_index.core.indices.vector_store.retrievers.auto_retriever.auto_retriever import
VectorIndexAutoRetriever
# --- Minimal stubs for dependencies ---
class VectorStoreQuerySpec:
    """Minimal stub for VectorStoreQuerySpec.

    Stores the three constructor arguments unchanged as attributes.
    """

    # The constructor must be named ``__init__``; a method called ``init``
    # is never invoked by ``VectorStoreQuerySpec(...)``.
    def __init__(self, query: str, filters: List[Any], top_k: Optional[int]):
        self.query = query
        self.filters = filters
        self.top_k = top_k
class OutputParserException(Exception):
    """Stub exception type; adds nothing to the built-in ``Exception``."""

    pass
class VectorStoreInfo:
    """Stub holding a description and optional metadata filters."""

    # Named ``__init__`` so keyword construction such as
    # ``VectorStoreInfo(description=...)`` works.
    def __init__(self, description="desc", metadata_filters=None):
        self.description = description
        self.metadata_filters = metadata_filters
class QueryBundle:
    """Stub query wrapper that exposes the query text as ``query_str``."""

    # Named ``__init__`` so ``QueryBundle(query_str=...)`` accepts the argument.
    def __init__(self, query_str: str):
        self.query_str = query_str
class MetadataFilters:
    """Stub container for a list of filters and an optional condition."""

    # Named ``__init__`` so the keyword arguments are actually consumed.
    def __init__(self, filters=None, condition=None):
        # A falsy ``filters`` (None or empty) is replaced by a fresh list.
        self.filters = filters or []
        self.condition = condition
class FilterCondition:
    """Stub namespace for the two filter-combination constants."""

    OR = "OR"
    AND = "AND"
class ServiceContext:
    """Stub that records an LLM and a callback manager."""

    # Named ``__init__`` so the keyword arguments are actually consumed.
    def __init__(self, llm=None, callback_manager=None):
        self.llm = llm
        self.callback_manager = callback_manager
class VectorStoreIndex:
    """Stub index holding a service context and an object map."""

    # Named ``__init__`` so the keyword arguments are actually consumed.
    def __init__(self, service_context=None, object_map=None):
        # Falsy arguments fall back to a new ServiceContext / an empty dict.
        self.service_context = service_context or ServiceContext()
        self._object_map = object_map or {}
# --- Minimal stub for LLM and its async predict ---
class DummyLLM:
    """Stub LLM object carrying canned configuration for the tests.

    NOTE(review): the surrounding comment mentions an async predict method,
    but none is present in this copy of the class — confirm against the
    generated test file.
    """

    # Named ``__init__`` so ``DummyLLM(output_map=...)`` accepts its arguments.
    def __init__(self, output_map=None, raise_on=None):
        # Falsy arguments fall back to an empty dict / empty set.
        self.output_map = output_map or {}
        self.raise_on = raise_on or set()
        # Ad-hoc object whose only attribute is ``is_chat_model = False``.
        self.metadata = type("Meta", (), {"is_chat_model": False})()
        self.output_parser = None
        self.messages_to_prompt = None
        self.completion_to_prompt = None
        self.system_prompt = None
        self.query_wrapper_prompt = None
# --- Unit tests for VectorIndexAutoRetriever.agenerate_retrieval_spec ---
# 1. Basic Test Cases
@pytest.mark.asyncio
async def test_agenerate_retrieval_spec_basic():
    """Test basic async retrieval spec generation with a normal query."""
    llm = DummyLLM()
    retriever = VectorIndexAutoRetriever(
        index=VectorStoreIndex(),
        vector_store_info=VectorStoreInfo(description="A test vector store"),
        llm=llm,
    )
    query_bundle = QueryBundle(query_str="search for cats")
    # NOTE(review): the result is never checked; assertions appear to be
    # missing from this copy — confirm against the generated test file.
    result = await retriever.agenerate_retrieval_spec(query_bundle)
@pytest.mark.asyncio
async def test_agenerate_retrieval_spec_empty_query():
    """Test async retrieval spec generation with an empty query string."""
    llm = DummyLLM()
    retriever = VectorIndexAutoRetriever(
        index=VectorStoreIndex(),
        vector_store_info=VectorStoreInfo(description="Empty query test"),
        llm=llm,
    )
    query_bundle = QueryBundle(query_str="")
    # NOTE(review): the result is never checked; assertions appear to be
    # missing from this copy — confirm against the generated test file.
    result = await retriever.agenerate_retrieval_spec(query_bundle)
@pytest.mark.asyncio
async def test_agenerate_retrieval_spec_extra_filters_and_condition_and():
    """Test that extra_filters with AND condition is accepted."""
    llm = DummyLLM()
    extra_filters = MetadataFilters(
        filters=[{"field": "color", "value": "blue"}],
        condition=FilterCondition.AND,
    )
    retriever = VectorIndexAutoRetriever(
        index=VectorStoreIndex(),
        vector_store_info=VectorStoreInfo(description="AND filters"),
        llm=llm,
        extra_filters=extra_filters,
    )
    query_bundle = QueryBundle(query_str="blue things")
    # NOTE(review): the result is never checked; assertions appear to be
    # missing from this copy — confirm against the generated test file.
    result = await retriever.agenerate_retrieval_spec(query_bundle)
# Marker added for consistency: every other async test here carries it.
@pytest.mark.asyncio
async def test_agenerate_retrieval_spec_concurrent_execution():
    """Test concurrent execution of agenerate_retrieval_spec with different queries."""
    llm = DummyLLM(output_map={
        "q1": '{"query": "q1", "filters": [], "top_k": 1}',
        "q2": '{"query": "q2", "filters": [], "top_k": 2}',
        "q3": '{"query": "q3", "filters": [], "top_k": 3}',
    })
    retriever = VectorIndexAutoRetriever(
        index=VectorStoreIndex(),
        vector_store_info=VectorStoreInfo(description="Concurrent test"),
        llm=llm,
    )
    bundles = [QueryBundle("q1"), QueryBundle("q2"), QueryBundle("q3")]
    # NOTE(review): the results are never checked; assertions appear to be
    # missing from this copy — confirm against the generated test file.
    results = await asyncio.gather(*(retriever.agenerate_retrieval_spec(qb) for qb in bundles))
@pytest.mark.asyncio
async def test_agenerate_retrieval_spec_large_scale_concurrent():
    """Test large scale concurrent execution of agenerate_retrieval_spec."""
    llm = DummyLLM()
    retriever = VectorIndexAutoRetriever(
        index=VectorStoreIndex(),
        vector_store_info=VectorStoreInfo(description="Large scale test"),
        llm=llm,
    )
    # Create 100 concurrent queries
    bundles = [QueryBundle(f"query_{i}") for i in range(100)]
    results = await asyncio.gather(*(retriever.agenerate_retrieval_spec(qb) for qb in bundles))
    # Check that all results are correct
    # NOTE(review): the loop body is empty in this copy; the per-result
    # assertions appear to be missing — confirm against the generated file.
    for i, result in enumerate(results):
        pass
@pytest.mark.asyncio
async def test_agenerate_retrieval_spec_large_scale_custom_outputs():
    """Test large scale concurrent execution with custom LLM outputs."""
    # One JSON payload per query name; top_k cycles through 0-9.
    output_map = {
        f"custom_{i}": f'{{"query": "custom_{i}", "filters": [], "top_k": {i % 10}}}'
        for i in range(50)
    }
    llm = DummyLLM(output_map=output_map)
    retriever = VectorIndexAutoRetriever(
        index=VectorStoreIndex(),
        vector_store_info=VectorStoreInfo(description="Large scale custom"),
        llm=llm,
    )
    bundles = [QueryBundle(f"custom_{i}") for i in range(50)]
    results = await asyncio.gather(*(retriever.agenerate_retrieval_spec(qb) for qb in bundles))
    # NOTE(review): the loop body is empty in this copy; the per-result
    # assertions appear to be missing — confirm against the generated file.
    for i, result in enumerate(results):
        pass
# 4. Throughput Test Cases
@pytest.mark.asyncio
async def test_agenerate_retrieval_spec_throughput_small_load():
    """Throughput test: small load (10 concurrent requests)."""
    llm = DummyLLM()
    retriever = VectorIndexAutoRetriever(
        index=VectorStoreIndex(),
        vector_store_info=VectorStoreInfo(description="Throughput small"),
        llm=llm,
    )
    bundles = [QueryBundle(f"small_{i}") for i in range(10)]
    results = await asyncio.gather(*(retriever.agenerate_retrieval_spec(qb) for qb in bundles))
    # NOTE(review): the loop body is empty in this copy; the per-result
    # assertions appear to be missing — confirm against the generated file.
    for i, result in enumerate(results):
        pass
@pytest.mark.asyncio
async def test_agenerate_retrieval_spec_throughput_medium_load():
    """Throughput test: medium load (50 concurrent requests)."""
    llm = DummyLLM()
    retriever = VectorIndexAutoRetriever(
        index=VectorStoreIndex(),
        vector_store_info=VectorStoreInfo(description="Throughput medium"),
        llm=llm,
    )
    bundles = [QueryBundle(f"medium_{i}") for i in range(50)]
    results = await asyncio.gather(*(retriever.agenerate_retrieval_spec(qb) for qb in bundles))
    # NOTE(review): the loop body is empty in this copy; the per-result
    # assertions appear to be missing — confirm against the generated file.
    for i, result in enumerate(results):
        pass
@pytest.mark.asyncio
async def test_agenerate_retrieval_spec_throughput_high_volume():
    """Throughput test: high volume (200 concurrent requests)."""
    llm = DummyLLM()
    retriever = VectorIndexAutoRetriever(
        index=VectorStoreIndex(),
        vector_store_info=VectorStoreInfo(description="Throughput high"),
        llm=llm,
    )
    bundles = [QueryBundle(f"high_{i}") for i in range(200)]
    results = await asyncio.gather(*(retriever.agenerate_retrieval_spec(qb) for qb in bundles))
    # NOTE(review): the loop body is empty in this copy; the per-result
    # assertions appear to be missing — confirm against the generated file.
    for i, result in enumerate(results):
        pass
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
#------------------------------------------------
import asyncio
import logging
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import pytest
from llama_index.core.indices.vector_store.retrievers.auto_retriever.auto_retriever import
VectorIndexAutoRetriever
# --- Minimal stubs and helpers to support the test environment ---
# Simulate QueryBundle object
# The decorator is ``dataclass`` (lowercase), matching the module's
# ``from dataclasses import dataclass`` import.
@dataclass
class QueryBundle:
    """Dataclass stand-in for a query bundle with a single text field."""

    query_str: str
# Simulate VectorStoreQuerySpec (Pydantic-like)
class VectorStoreQuerySpec:
    """Plain-attribute stand-in for the query spec produced by the retriever."""

    # Named ``__init__`` so ``VectorStoreQuerySpec(...)`` accepts arguments.
    def __init__(self, query: str, filters: Optional[List[dict]] = None, top_k: Optional[int] = None):
        self.query = query
        # A falsy ``filters`` (None or empty) is replaced by a fresh list.
        self.filters = filters or []
        self.top_k = top_k
# Simulate VectorStoreInfo (Pydantic-like)
class VectorStoreInfo:
    """Stand-in holding a description and a list of supported metadata names."""

    # Named ``__init__`` so ``VectorStoreInfo(...)`` accepts arguments.
    def __init__(self, description: str = "Test vector store", supported_metadata: Optional[List[str]] = None):
        self.description = description
        # A falsy ``supported_metadata`` is replaced by a fresh list.
        self.supported_metadata = supported_metadata or []
# Simulate LLM (async)
class DummyLLM:
    """Stand-in LLM configured with canned outputs and an error switch.

    NOTE(review): no prediction method is present in this copy of the class,
    so how ``outputs``, ``call_count`` and ``raise_exc`` are consumed cannot
    be seen here — confirm against the generated test file.
    """

    # Named ``__init__`` so ``DummyLLM(outputs=...)`` accepts its arguments.
    def __init__(self, outputs: Optional[List[str]] = None, raise_exc: bool = False):
        # A falsy ``outputs`` is replaced by a fresh list.
        self.outputs = outputs or []
        self.call_count = 0
        self.raise_exc = raise_exc
# Simulate MetadataFilters and FilterCondition
class MetadataFilters:
    """Stand-in container for filters plus an optional combining condition."""

    # Named ``__init__`` so ``MetadataFilters(...)`` accepts arguments.
    def __init__(self, filters, condition=None):
        self.filters = filters
        self.condition = condition
class FilterCondition:
    """Stand-in namespace for the two filter-combination constants."""

    OR = "OR"
    AND = "AND"
# Simulate VectorStoreIndex
class VectorStoreIndex:
    """Stand-in index holding a service context and an empty object map."""

    # Named ``__init__`` so ``VectorStoreIndex(...)`` accepts arguments.
    def __init__(self, service_context=None):
        self.service_context = service_context
        self._object_map = {}
# --- Tests ---
# 1. Basic Test Cases
@pytest.mark.asyncio
async def test_agenerate_retrieval_spec_basic_empty_filters():
    """Test: function handles empty filters and top_k=None."""
    llm = DummyLLM(outputs=[
        '{"query": "bar", "filters": [], "top_k": null}'
    ])
    retriever = VectorIndexAutoRetriever(
        index=VectorStoreIndex(),
        vector_store_info=VectorStoreInfo(),
        llm=llm,
    )
    bundle = QueryBundle(query_str="bar")
    # NOTE(review): the result is never checked; assertions appear to be
    # missing from this copy — confirm against the generated test file.
    result = await retriever.agenerate_retrieval_spec(bundle)
@pytest.mark.asyncio
async def test_agenerate_retrieval_spec_llm_raises_exception():
    """Test: function raises if LLM apredict raises an exception."""
    llm = DummyLLM(raise_exc=True)
    retriever = VectorIndexAutoRetriever(
        index=VectorStoreIndex(),
        vector_store_info=VectorStoreInfo(),
        llm=llm,
    )
    bundle = QueryBundle(query_str="fail")
    # Only the awaited call sits inside the ``raises`` context.
    with pytest.raises(RuntimeError):
        await retriever.agenerate_retrieval_spec(bundle)
@pytest.mark.asyncio
async def test_agenerate_retrieval_spec_concurrent_same_query():
    """Test: concurrent calls with same query string are isolated."""
    llm = DummyLLM(outputs=[
        '{"query": "same", "filters": [], "top_k": 1}',
        '{"query": "same", "filters": [], "top_k": 2}',
    ])
    retriever = VectorIndexAutoRetriever(
        index=VectorStoreIndex(),
        vector_store_info=VectorStoreInfo(),
        llm=llm,
    )
    bundle = QueryBundle(query_str="same")
    # NOTE(review): the results are never checked; assertions appear to be
    # missing from this copy — confirm against the generated test file.
    results = await asyncio.gather(
        retriever.agenerate_retrieval_spec(bundle),
        retriever.agenerate_retrieval_spec(bundle),
    )
# 3. Large Scale Test Cases
@pytest.mark.asyncio
async def test_agenerate_retrieval_spec_many_concurrent_requests():
    """Test: function can handle many concurrent requests."""
    # 20 concurrent requests with unique outputs
    outputs = [
        f'{{"query": "q{i}", "filters": [], "top_k": {i}}}' for i in range(20)
    ]
    llm = DummyLLM(outputs=outputs)
    retriever = VectorIndexAutoRetriever(
        index=VectorStoreIndex(),
        vector_store_info=VectorStoreInfo(),
        llm=llm,
    )
    bundles = [QueryBundle(query_str=f"q{i}") for i in range(20)]
    results = await asyncio.gather(*(retriever.agenerate_retrieval_spec(b) for b in bundles))
    # NOTE(review): the loop body is empty in this copy; the per-result
    # assertions appear to be missing — confirm against the generated file.
    for i, res in enumerate(results):
        pass
@pytest.mark.asyncio
async def test_agenerate_retrieval_spec_throughput_small_load():
    """Throughput: function handles a small burst of requests quickly."""
    llm = DummyLLM(outputs=[
        '{"query": "t1", "filters": [], "top_k": 1}',
        '{"query": "t2", "filters": [], "top_k": 2}',
        '{"query": "t3", "filters": [], "top_k": 3}',
        '{"query": "t4", "filters": [], "top_k": 4}',
        '{"query": "t5", "filters": [], "top_k": 5}',
    ])
    retriever = VectorIndexAutoRetriever(
        index=VectorStoreIndex(),
        vector_store_info=VectorStoreInfo(),
        llm=llm,
    )
    bundles = [QueryBundle(query_str=f"t{i}") for i in range(1, 6)]
    results = await asyncio.gather(*(retriever.agenerate_retrieval_spec(b) for b in bundles))
    # NOTE(review): the loop body is empty in this copy; the per-result
    # assertions appear to be missing — confirm against the generated file.
    for i, res in enumerate(results, 1):
        pass
@pytest.mark.asyncio
async def test_agenerate_retrieval_spec_throughput_medium_load():
    """Throughput: function handles a moderate burst of requests."""
    n = 50
    outputs = [
        f'{{"query": "m{i}", "filters": [], "top_k": {i}}}' for i in range(n)
    ]
    llm = DummyLLM(outputs=outputs)
    retriever = VectorIndexAutoRetriever(
        index=VectorStoreIndex(),
        vector_store_info=VectorStoreInfo(),
        llm=llm,
    )
    bundles = [QueryBundle(query_str=f"m{i}") for i in range(n)]
    results = await asyncio.gather(*(retriever.agenerate_retrieval_spec(b) for b in bundles))
    # NOTE(review): the loop body is empty in this copy; the per-result
    # assertions appear to be missing — confirm against the generated file.
    for i, res in enumerate(results):
        pass
@pytest.mark.asyncio
async def test_agenerate_retrieval_spec_throughput_large_load():
    """Throughput: function handles a large number of requests in parallel."""
    n = 100
    outputs = [
        f'{{"query": "l{i}", "filters": [], "top_k": {i}}}' for i in range(n)
    ]
    llm = DummyLLM(outputs=outputs)
    retriever = VectorIndexAutoRetriever(
        index=VectorStoreIndex(),
        vector_store_info=VectorStoreInfo(),
        llm=llm,
    )
    bundles = [QueryBundle(query_str=f"l{i}") for i in range(n)]
    results = await asyncio.gather(*(retriever.agenerate_retrieval_spec(b) for b in bundles))
    # NOTE(review): the loop body is empty in this copy; the per-result
    # assertions appear to be missing — confirm against the generated file.
    for i, res in enumerate(results):
        pass
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
To edit these changes
git checkout codeflash/optimize-VectorIndexAutoRetriever.agenerate_retrieval_spec-mhvf3adj and push.