diff --git a/pinecone/db_data/dataclasses/search_query.py b/pinecone/db_data/dataclasses/search_query.py
index 3adb80829..6ebd55ac9 100644
--- a/pinecone/db_data/dataclasses/search_query.py
+++ b/pinecone/db_data/dataclasses/search_query.py
@@ -38,6 +38,19 @@ class SearchQuery:
     The unique ID of the vector to be used as a query vector.
     """
 
+    match_terms: Optional[Dict[str, Any]] = None
+    """
+    Specifies which terms must be present in the text of each search hit based on the specified strategy.
+    The match is performed against the text field specified in the integrated index field_map configuration.
+    Terms are normalized and tokenized into single tokens before matching, and order does not matter.
+    Expected format: {"strategy": "all", "terms": ["term1", "term2", ...]}
+    Currently only "all" strategy is supported, which means all specified terms must be present.
+
+    **Limitations:** match_terms is only supported for sparse indexes with integrated embedding
+    configured to use the pinecone-sparse-english-v0 model.
+    Optional.
+    """
+
     def __post_init__(self):
         """
         Converts `vector` to a `SearchQueryVectorTypedDict` instance if an enum is provided.
@@ -55,5 +68,6 @@ def as_dict(self) -> Dict[str, Any]:
             "filter": self.filter,
             "vector": self.vector,
             "id": self.id,
+            "match_terms": self.match_terms,
         }
         return {k: v for k, v in d.items() if v is not None}
diff --git a/pinecone/db_data/index_asyncio_interface.py b/pinecone/db_data/index_asyncio_interface.py
index 50e4d1f65..889ce215a 100644
--- a/pinecone/db_data/index_asyncio_interface.py
+++ b/pinecone/db_data/index_asyncio_interface.py
@@ -773,7 +773,13 @@ async def search(
         """
         :param namespace: The namespace in the index to search.
         :type namespace: str, required
-        :param query: The SearchQuery to use for the search.
+        :param query: The SearchQuery to use for the search. The query can include a ``match_terms`` field
+            to specify which terms must be present in the text of each search hit. The match_terms
+            should be a dict with ``strategy`` (str) and ``terms`` (List[str]) keys, e.g.
+            ``{"strategy": "all", "terms": ["term1", "term2"]}``. Currently only "all" strategy
+            is supported, which means all specified terms must be present.
+            **Note:** match_terms is only supported for sparse indexes with integrated embedding
+            configured to use the pinecone-sparse-english-v0 model.
         :type query: Union[Dict, SearchQuery], required
         :param rerank: The SearchRerank to use with the search request.
         :type rerank: Union[Dict, SearchRerank], optional
diff --git a/pinecone/db_data/interfaces.py b/pinecone/db_data/interfaces.py
index 263de553a..f486a77bb 100644
--- a/pinecone/db_data/interfaces.py
+++ b/pinecone/db_data/interfaces.py
@@ -352,7 +352,13 @@ def search(
         """
         :param namespace: The namespace in the index to search.
         :type namespace: str, required
-        :param query: The SearchQuery to use for the search.
+        :param query: The SearchQuery to use for the search. The query can include a ``match_terms`` field
+            to specify which terms must be present in the text of each search hit. The match_terms
+            should be a dict with ``strategy`` (str) and ``terms`` (List[str]) keys, e.g.
+            ``{"strategy": "all", "terms": ["term1", "term2"]}``. Currently only "all" strategy
+            is supported, which means all specified terms must be present.
+            **Note:** match_terms is only supported for sparse indexes with integrated embedding
+            configured to use the pinecone-sparse-english-v0 model.
         :type query: Union[Dict, SearchQuery], required
         :param rerank: The SearchRerank to use with the search request.
         :type rerank: Union[Dict, SearchRerank], optional
diff --git a/pinecone/db_data/request_factory.py b/pinecone/db_data/request_factory.py
index 780a3fa0b..64bb65d9c 100644
--- a/pinecone/db_data/request_factory.py
+++ b/pinecone/db_data/request_factory.py
@@ -11,6 +11,7 @@
     SearchRecordsRequest,
     SearchRecordsRequestQuery,
     SearchRecordsRequestRerank,
+    SearchMatchTerms,
     VectorValues,
     SearchRecordsVector,
     UpsertRecord,
@@ -218,11 +219,18 @@ def _parse_search_query(
         if isinstance(query_dict.get("vector", None), SearchQueryVector):
             query_dict["vector"] = query_dict["vector"].as_dict()
 
+        # Read match_terms without pop(): a caller-supplied dict may be reused for later calls
+        match_terms = query_dict.get("match_terms", None)
+        if isinstance(match_terms, dict):
+            match_terms = SearchMatchTerms(**match_terms)
+
         srrq = SearchRecordsRequestQuery(
-            **{k: v for k, v in query_dict.items() if k not in {"vector"}}
+            **{k: v for k, v in query_dict.items() if k not in {"vector", "match_terms"}}
         )
         if query_dict.get("vector", None) is not None:
             srrq.vector = IndexRequestFactory._parse_search_vector(query_dict["vector"])
+        if match_terms is not None:
+            srrq.match_terms = match_terms
         return srrq
 
     @staticmethod
diff --git a/pinecone/db_data/types/search_query_typed_dict.py b/pinecone/db_data/types/search_query_typed_dict.py
index c21ba1202..5887203f7 100644
--- a/pinecone/db_data/types/search_query_typed_dict.py
+++ b/pinecone/db_data/types/search_query_typed_dict.py
@@ -34,3 +34,16 @@ class SearchQueryTypedDict(TypedDict):
     """
     The unique ID of the vector to be used as a query vector.
     """
+
+    match_terms: Optional[Dict[str, Any]]
+    """
+    Specifies which terms must be present in the text of each search hit based on the specified strategy.
+    The match is performed against the text field specified in the integrated index field_map configuration.
+    Terms are normalized and tokenized into single tokens before matching, and order does not matter.
+    Expected format: {"strategy": "all", "terms": ["term1", "term2", ...]}
+    Currently only "all" strategy is supported, which means all specified terms must be present.
+
+    **Limitations:** match_terms is only supported for sparse indexes with integrated embedding
+    configured to use the pinecone-sparse-english-v0 model.
+    Optional.
+    """
diff --git a/tests/integration/data/test_search_and_upsert_records.py b/tests/integration/data/test_search_and_upsert_records.py
index 0a269a49a..7b60934e8 100644
--- a/tests/integration/data/test_search_and_upsert_records.py
+++ b/tests/integration/data/test_search_and_upsert_records.py
@@ -185,6 +185,66 @@ def test_search_with_rerank_query(self, model_idx, records_to_upsert):
         assert len(response.result.hits) == 3
         assert response.usage is not None
 
+    def test_search_with_match_terms_dict(self, model_idx, records_to_upsert):
+        """Test that match_terms can be passed via dict query."""
+        from pinecone import PineconeApiException
+
+        target_namespace = random_string(10)
+        model_idx.upsert_records(namespace=target_namespace, records=records_to_upsert)
+
+        poll_until_fetchable(
+            model_idx, target_namespace, [r["id"] for r in records_to_upsert], timeout=180
+        )
+
+        # Search with match_terms using dict
+        query_dict = {
+            "inputs": {"text": "Apple corporation"},
+            "top_k": 3,
+            "match_terms": {"strategy": "all", "terms": ["Apple", "corporation"]},
+        }
+        # match_terms is only supported for pinecone-sparse-english-v0 model
+        # If the API rejects it due to model incompatibility, that's expected
+        # and shows our code is correctly passing the parameter
+        try:
+            response = model_idx.search_records(namespace=target_namespace, query=query_dict)
+            assert response.usage is not None
+            # Test search alias
+            response2 = model_idx.search(namespace=target_namespace, query=query_dict)
+            assert response == response2
+        except PineconeApiException as e:
+            # Verify the error is about model compatibility, not parameter format
+            assert "match_terms" in str(e) or "pinecone-sparse-english-v0" in str(e)
+
+    def test_search_with_match_terms_searchquery(self, model_idx, records_to_upsert):
+        """Test that match_terms can be passed via SearchQuery dataclass."""
+        from pinecone import SearchQuery, PineconeApiException
+
+        target_namespace = random_string(10)
+        model_idx.upsert_records(namespace=target_namespace, records=records_to_upsert)
+
+        poll_until_fetchable(
+            model_idx, target_namespace, [r["id"] for r in records_to_upsert], timeout=180
+        )
+
+        # Search with match_terms using SearchQuery dataclass
+        query = SearchQuery(
+            inputs={"text": "Apple corporation"},
+            top_k=3,
+            match_terms={"strategy": "all", "terms": ["Apple", "corporation"]},
+        )
+        # match_terms is only supported for pinecone-sparse-english-v0 model
+        # If the API rejects it due to model incompatibility, that's expected
+        # and shows our code is correctly passing the parameter
+        try:
+            response = model_idx.search_records(namespace=target_namespace, query=query)
+            assert response.usage is not None
+            # Test search alias
+            response2 = model_idx.search(namespace=target_namespace, query=query)
+            assert response == response2
+        except PineconeApiException as e:
+            # Verify the error is about model compatibility, not parameter format
+            assert "match_terms" in str(e) or "pinecone-sparse-english-v0" in str(e)
+
     @pytest.mark.skipif(
         os.getenv("USE_GRPC") != "false",
         reason="These actions are not supported in gRPC"
diff --git a/tests/integration/data_asyncio/test_search_and_upsert_records.py b/tests/integration/data_asyncio/test_search_and_upsert_records.py
index 2e43a9c2d..09e2242cb 100644
--- a/tests/integration/data_asyncio/test_search_and_upsert_records.py
+++ b/tests/integration/data_asyncio/test_search_and_upsert_records.py
@@ -161,6 +161,66 @@ async def test_search_with_rerank_query(self, model_index_host, records_to_upser
         assert response.usage is not None
         await model_idx.close()
 
+    async def test_search_with_match_terms_dict(self, model_index_host, records_to_upsert):
+        """Test that match_terms can be passed via dict query."""
+        from pinecone import PineconeApiException
+
+        model_idx = build_asyncioindex_client(model_index_host)
+        target_namespace = random_string(10)
+        await model_idx.upsert_records(namespace=target_namespace, records=records_to_upsert)
+
+        await poll_for_freshness(model_idx, target_namespace, len(records_to_upsert))
+
+        # Search with match_terms using dict
+        query_dict = {
+            "inputs": {"text": "Apple corporation"},
+            "top_k": 3,
+            "match_terms": {"strategy": "all", "terms": ["Apple", "corporation"]},
+        }
+        # match_terms is only supported for pinecone-sparse-english-v0 model
+        # If the API rejects it due to model incompatibility, that's expected
+        # and shows our code is correctly passing the parameter
+        try:
+            response = await model_idx.search_records(namespace=target_namespace, query=query_dict)
+            assert response.usage is not None
+            # Test search alias
+            response2 = await model_idx.search(namespace=target_namespace, query=query_dict)
+            assert response == response2
+        except PineconeApiException as e:
+            # Verify the error is about model compatibility, not parameter format
+            assert "match_terms" in str(e) or "pinecone-sparse-english-v0" in str(e)
+        await model_idx.close()
+
+    async def test_search_with_match_terms_searchquery(self, model_index_host, records_to_upsert):
+        """Test that match_terms can be passed via SearchQuery dataclass."""
+        from pinecone import SearchQuery, PineconeApiException
+
+        model_idx = build_asyncioindex_client(model_index_host)
+        target_namespace = random_string(10)
+        await model_idx.upsert_records(namespace=target_namespace, records=records_to_upsert)
+
+        await poll_for_freshness(model_idx, target_namespace, len(records_to_upsert))
+
+        # Search with match_terms using SearchQuery dataclass
+        query = SearchQuery(
+            inputs={"text": "Apple corporation"},
+            top_k=3,
+            match_terms={"strategy": "all", "terms": ["Apple", "corporation"]},
+        )
+        # match_terms is only supported for pinecone-sparse-english-v0 model
+        # If the API rejects it due to model incompatibility, that's expected
+        # and shows our code is correctly passing the parameter
+        try:
+            response = await model_idx.search_records(namespace=target_namespace, query=query)
+            assert response.usage is not None
+            # Test search alias
+            response2 = await model_idx.search(namespace=target_namespace, query=query)
+            assert response == response2
+        except PineconeApiException as e:
+            # Verify the error is about model compatibility, not parameter format
+            assert "match_terms" in str(e) or "pinecone-sparse-english-v0" in str(e)
+        await model_idx.close()
+
 
 @pytest.mark.asyncio
 class TestUpsertAndSearchRecordsErrorCases: