From 9fcb8dbff6440c1ffe4d2d6af5c78813b5a24ea9 Mon Sep 17 00:00:00 2001 From: Jen Hamon Date: Thu, 30 Jan 2025 08:43:47 -0500 Subject: [PATCH 1/5] Implement search, search_records, and upsert_records --- codegen/build-oas.sh | 39 ++ pinecone/config/__init__.py | 2 +- pinecone/control/pinecone.py | 30 +- pinecone/core/openapi/db_data/model/hit.py | 28 +- .../openapi/db_data/model/upsert_record.py | 16 +- pinecone/data/dataclasses/__init__.py | 3 + pinecone/data/dataclasses/search_query.py | 59 +++ .../data/dataclasses/search_query_vector.py | 38 ++ pinecone/data/dataclasses/search_rerank.py | 61 +++ .../inference/inference_request_builder.py | 4 +- pinecone/data/index.py | 49 ++- pinecone/data/interfaces.py | 52 +++ pinecone/data/request_factory.py | 100 ++++- pinecone/data/types/__init__.py | 3 + .../data/types/search_query_typed_dict.py | 36 ++ .../types/search_query_vector_typed_dict.py | 25 ++ .../data/types/search_rerank_typed_dict.py | 41 ++ pinecone/openapi_support/api_client.py | 3 +- .../openapi_support/asyncio_api_client.py | 3 +- pinecone/openapi_support/configuration.py | 8 +- pinecone/openapi_support/exceptions.py | 2 +- pinecone/openapi_support/model_utils.py | 6 +- pinecone/openapi_support/rest_urllib3.py | 24 +- pinecone/openapi_support/rest_utils.py | 5 - pinecone/utils/__init__.py | 2 + pinecone/utils/convert_enum_to_string.py | 8 + poetry.lock | 29 +- pyproject.toml | 1 + tests/__init__.py | 5 + tests/integration/data/conftest.py | 49 ++- .../data/test_search_and_upsert_records.py | 221 ++++++++++ tests/integration/helpers/__init__.py | 1 + tests/integration/helpers/helpers.py | 5 + tests/perf/test_re_vs_in.py | 46 ++ tests/unit/data/test_request_factory.py | 401 ++++++++++++++++++ 35 files changed, 1326 insertions(+), 79 deletions(-) create mode 100644 pinecone/data/dataclasses/search_query.py create mode 100644 pinecone/data/dataclasses/search_query_vector.py create mode 100644 pinecone/data/dataclasses/search_rerank.py create mode 100644 
pinecone/data/types/search_query_typed_dict.py create mode 100644 pinecone/data/types/search_query_vector_typed_dict.py create mode 100644 pinecone/data/types/search_rerank_typed_dict.py create mode 100644 pinecone/utils/convert_enum_to_string.py create mode 100644 tests/integration/data/test_search_and_upsert_records.py create mode 100644 tests/perf/test_re_vs_in.py create mode 100644 tests/unit/data/test_request_factory.py diff --git a/codegen/build-oas.sh b/codegen/build-oas.sh index 738b12703..98321d6aa 100755 --- a/codegen/build-oas.sh +++ b/codegen/build-oas.sh @@ -134,6 +134,44 @@ remove_shared_classes() { done } +# Generated Python code attempts to internally map OpenAPI fields that begin +# with "_" to a non-underscored alternative. Along with a polymorphic object, +# this causes collisions and headaches. We massage the generated models to +# maintain the original field names from the OpenAPI spec and circumvent +# the remapping behavior as this is simpler for now than creating a fully +# custom java generator class. 
+clean_oas_underscore_manipulation() { + temp_file="$(mktemp)" + + db_data_destination="${destination}/db_data" + + # echo "Cleaning up upsert_record.py" + sed -i '' \ + -e "s/'id'/'_id'/g" \ + -e 's/self.id/self._id/g' \ + -e 's/id (/_id (/g' \ + -e 's/= id/= _id/g' \ + -e 's/id,/_id,/g' \ + -e "s/'vector\'/'_vector'/g" \ + -e "s/'embed\'/'_embed'/g" \ + -e 's/vector (/_vector (/g' \ + -e 's/embed (/_embed (/g' \ + "${db_data_destination}/model/upsert_record.py" + + # echo "Cleaning up hit.py" + sed -i '' \ + -e "s/'id'/'_id'/g" \ + -e "s/'score'/'_score'/g" \ + -e 's/ id, score,/ _id, _score,/g' \ + -e 's/id (/_id (/g' \ + -e 's/score (/_score (/g' \ + -e 's/self.id/self._id/g' \ + -e 's/self.score/self._score/g' \ + -e 's/= id/= _id/g' \ + -e 's/= score/= _score/g' \ + "${db_data_destination}/model/hit.py" +} + update_apis_repo update_templates_repo verify_spec_version $version @@ -144,6 +182,7 @@ mkdir -p "${destination}" for module in "${modules[@]}"; do generate_client $module done +clean_oas_underscore_manipulation # This also exists in the generated module code, but we need to reference it # in the pinecone.openapi_support package as well without creating a circular diff --git a/pinecone/config/__init__.py b/pinecone/config/__init__.py index 236260820..7abb7278e 100644 --- a/pinecone/config/__init__.py +++ b/pinecone/config/__init__.py @@ -5,4 +5,4 @@ from .pinecone_config import PineconeConfig if os.getenv("PINECONE_DEBUG") is not None: - logging.basicConfig(level=logging.DEBUG) + logging.getLogger("pinecone").setLevel(level=logging.DEBUG) diff --git a/pinecone/control/pinecone.py b/pinecone/control/pinecone.py index 569625685..5bf594e01 100644 --- a/pinecone/control/pinecone.py +++ b/pinecone/control/pinecone.py @@ -1,7 +1,6 @@ import time import logging from typing import Optional, Dict, Any, Union -from enum import Enum from .index_host_store import IndexHostStore from .pinecone_interface import PineconeDBControlInterface @@ -12,7 +11,12 @@ from 
pinecone.openapi_support.api_client import ApiClient -from pinecone.utils import normalize_host, setup_openapi_client, build_plugin_setup_client +from pinecone.utils import ( + normalize_host, + setup_openapi_client, + build_plugin_setup_client, + convert_enum_to_string, +) from pinecone.core.openapi.db_control.models import ( CreateCollectionRequest, CreateIndexForModelRequest, @@ -156,17 +160,12 @@ def __parse_tags(self, tags: Optional[Dict[str, str]]) -> IndexTags: def __parse_deletion_protection( self, deletion_protection: Union[DeletionProtection, str] ) -> DeletionProtectionModel: - deletion_protection = self.__parse_enum_to_string(deletion_protection) + deletion_protection = convert_enum_to_string(deletion_protection) if deletion_protection in ["enabled", "disabled"]: return DeletionProtectionModel(deletion_protection) else: raise ValueError("deletion_protection must be either 'enabled' or 'disabled'") - def __parse_enum_to_string(self, value: Union[Enum, str]) -> str: - if isinstance(value, Enum): - return value.value - return value - def __parse_index_spec(self, spec: Union[Dict, ServerlessSpec, PodSpec]) -> IndexSpec: if isinstance(spec, dict): if "serverless" in spec: @@ -227,9 +226,9 @@ def create_index( tags: Optional[Dict[str, str]] = None, ) -> IndexModel: if metric is not None: - metric = self.__parse_enum_to_string(metric) + metric = convert_enum_to_string(metric) if vector_type is not None: - vector_type = self.__parse_enum_to_string(vector_type) + vector_type = convert_enum_to_string(vector_type) if deletion_protection is not None: dp = self.__parse_deletion_protection(deletion_protection) else: @@ -270,8 +269,8 @@ def create_index_for_model( deletion_protection: Optional[Union[DeletionProtection, str]] = DeletionProtection.DISABLED, timeout: Optional[int] = None, ) -> IndexModel: - cloud = self.__parse_enum_to_string(cloud) - region = self.__parse_enum_to_string(region) + cloud = convert_enum_to_string(cloud) + region = 
convert_enum_to_string(region) if deletion_protection is not None: dp = self.__parse_deletion_protection(deletion_protection) else: @@ -289,10 +288,7 @@ def create_index_for_model( raise ValueError(f"{field} is required in embed") parsed_embed = {} for key, value in embed.items(): - if isinstance(value, Enum): - parsed_embed[key] = value.value - else: - parsed_embed[key] = value + parsed_embed[key] = convert_enum_to_string(value) args = parse_non_empty_args( [ @@ -325,7 +321,7 @@ def is_ready(): # Wait indefinitely while not is_ready(): logger.debug( - f"Waiting for index {name} to be ready. Total wait time: {total_wait_time}" + f"Waiting for index {name} to be ready. Total wait time {total_wait_time} seconds." ) total_wait_time += 5 time.sleep(5) diff --git a/pinecone/core/openapi/db_data/model/hit.py b/pinecone/core/openapi/db_data/model/hit.py index 93583fdba..448ff2b12 100644 --- a/pinecone/core/openapi/db_data/model/hit.py +++ b/pinecone/core/openapi/db_data/model/hit.py @@ -78,8 +78,8 @@ def openapi_types(): and the value is attribute type. """ return { - "id": (str,), # noqa: E501 - "score": (float,), # noqa: E501 + "_id": (str,), # noqa: E501 + "_score": (float,), # noqa: E501 "fields": ({str: (bool, dict, float, int, list, str, none_type)},), # noqa: E501 } @@ -88,8 +88,8 @@ def discriminator(): return None attribute_map = { - "id": "_id", # noqa: E501 - "score": "_score", # noqa: E501 + "_id": "_id", # noqa: E501 + "_score": "_score", # noqa: E501 "fields": "fields", # noqa: E501 } @@ -99,12 +99,12 @@ def discriminator(): @classmethod @convert_js_args_to_python_args - def _from_openapi_data(cls, id, score, fields, *args, **kwargs): # noqa: E501 + def _from_openapi_data(cls, _id, _score, fields, *args, **kwargs): # noqa: E501 """Hit - a model defined in OpenAPI Args: - id (str): The record id of the search hit. - score (float): The similarity score of the returned record. + _id (str): The record id of the search hit. 
+ _score (float): The similarity score of the returned record. fields ({str: (bool, dict, float, int, list, str, none_type)}): The selected record fields associated with the search hit. Keyword Args: @@ -163,8 +163,8 @@ def _from_openapi_data(cls, id, score, fields, *args, **kwargs): # noqa: E501 self._configuration = _configuration self._visited_composed_classes = _visited_composed_classes + (self.__class__,) - self.id = id - self.score = score + self._id = _id + self._score = _score self.fields = fields for var_name, var_value in kwargs.items(): if ( @@ -190,12 +190,12 @@ def _from_openapi_data(cls, id, score, fields, *args, **kwargs): # noqa: E501 ) @convert_js_args_to_python_args - def __init__(self, id, score, fields, *args, **kwargs): # noqa: E501 + def __init__(self, _id, _score, fields, *args, **kwargs): # noqa: E501 """Hit - a model defined in OpenAPI Args: - id (str): The record id of the search hit. - score (float): The similarity score of the returned record. + _id (str): The record id of the search hit. + _score (float): The similarity score of the returned record. fields ({str: (bool, dict, float, int, list, str, none_type)}): The selected record fields associated with the search hit. Keyword Args: @@ -252,8 +252,8 @@ def __init__(self, id, score, fields, *args, **kwargs): # noqa: E501 self._configuration = _configuration self._visited_composed_classes = _visited_composed_classes + (self.__class__,) - self.id = id - self.score = score + self._id = _id + self._score = _score self.fields = fields for var_name, var_value in kwargs.items(): if ( diff --git a/pinecone/core/openapi/db_data/model/upsert_record.py b/pinecone/core/openapi/db_data/model/upsert_record.py index e76b9f65a..5fe322b27 100644 --- a/pinecone/core/openapi/db_data/model/upsert_record.py +++ b/pinecone/core/openapi/db_data/model/upsert_record.py @@ -78,7 +78,7 @@ def openapi_types(): and the value is attribute type. 
""" return { - "id": (str,) # noqa: E501 + "_id": (str,) # noqa: E501 } @cached_property @@ -86,7 +86,7 @@ def discriminator(): return None attribute_map = { - "id": "_id" # noqa: E501 + "_id": "_id" # noqa: E501 } read_only_vars = {} @@ -95,11 +95,11 @@ def discriminator(): @classmethod @convert_js_args_to_python_args - def _from_openapi_data(cls, id, *args, **kwargs): # noqa: E501 + def _from_openapi_data(cls, _id, *args, **kwargs): # noqa: E501 """UpsertRecord - a model defined in OpenAPI Args: - id (str): The unique ID of the record to upsert. Note that `id` can be used as an alias for `_id`. + _id (str): The unique ID of the record to upsert. Note that `id` can be used as an alias for `_id`. Keyword Args: _check_type (bool): if True, values for parameters in openapi_types @@ -157,7 +157,7 @@ def _from_openapi_data(cls, id, *args, **kwargs): # noqa: E501 self._configuration = _configuration self._visited_composed_classes = _visited_composed_classes + (self.__class__,) - self.id = id + self._id = _id for var_name, var_value in kwargs.items(): if ( var_name not in self.attribute_map @@ -182,11 +182,11 @@ def _from_openapi_data(cls, id, *args, **kwargs): # noqa: E501 ) @convert_js_args_to_python_args - def __init__(self, id, *args, **kwargs): # noqa: E501 + def __init__(self, _id, *args, **kwargs): # noqa: E501 """UpsertRecord - a model defined in OpenAPI Args: - id (str): The unique ID of the record to upsert. Note that `id` can be used as an alias for `_id`. + _id (str): The unique ID of the record to upsert. Note that `id` can be used as an alias for `_id`. 
Keyword Args: _check_type (bool): if True, values for parameters in openapi_types @@ -242,7 +242,7 @@ def __init__(self, id, *args, **kwargs): # noqa: E501 self._configuration = _configuration self._visited_composed_classes = _visited_composed_classes + (self.__class__,) - self.id = id + self._id = _id for var_name, var_value in kwargs.items(): if ( var_name not in self.attribute_map diff --git a/pinecone/data/dataclasses/__init__.py b/pinecone/data/dataclasses/__init__.py index c481d1ab4..0183eb74e 100644 --- a/pinecone/data/dataclasses/__init__.py +++ b/pinecone/data/dataclasses/__init__.py @@ -1,3 +1,6 @@ from .sparse_values import SparseValues from .vector import Vector from .fetch_response import FetchResponse +from .search_query import SearchQuery +from .search_query_vector import SearchQueryVector +from .search_rerank import SearchRerank diff --git a/pinecone/data/dataclasses/search_query.py b/pinecone/data/dataclasses/search_query.py new file mode 100644 index 000000000..3adb80829 --- /dev/null +++ b/pinecone/data/dataclasses/search_query.py @@ -0,0 +1,59 @@ +from dataclasses import dataclass +from typing import Optional, Any, Dict, Union +from .search_query_vector import SearchQueryVector +from ..types.search_query_vector_typed_dict import SearchQueryVectorTypedDict + + +@dataclass +class SearchQuery: + """ + SearchQuery represents the query when searching within a specific namespace. + """ + + inputs: Dict[str, Any] + """ + The input data to search with. + Required. + """ + + top_k: int + """ + The number of results to return with each search. + Required. + """ + + filter: Optional[Dict[str, Any]] = None + """ + The filter to apply to the search. + Optional. + """ + + vector: Optional[Union[SearchQueryVectorTypedDict, SearchQueryVector]] = None + """ + The vector values to search with. If provided, it overwrites the inputs. + """ + + id: Optional[str] = None + """ + The unique ID of the vector to be used as a query vector. 
+ """ + + def __post_init__(self): + """ + Converts `vector` to a `SearchQueryVectorTypedDict` instance if an enum is provided. + """ + if isinstance(self.vector, SearchQueryVector): + self.vector = self.vector.as_dict() + + def as_dict(self) -> Dict[str, Any]: + """ + Returns the SearchQuery as a dictionary. + """ + d = { + "inputs": self.inputs, + "top_k": self.top_k, + "filter": self.filter, + "vector": self.vector, + "id": self.id, + } + return {k: v for k, v in d.items() if v is not None} diff --git a/pinecone/data/dataclasses/search_query_vector.py b/pinecone/data/dataclasses/search_query_vector.py new file mode 100644 index 000000000..d829102f6 --- /dev/null +++ b/pinecone/data/dataclasses/search_query_vector.py @@ -0,0 +1,38 @@ +from dataclasses import dataclass +from typing import Optional, List + + +@dataclass +class SearchQueryVector: + """ + SearchQueryVector represents the vector values used to query. + """ + + values: Optional[List[float]] = None + """ + The vector data included in the search request. + Optional. + """ + + sparse_values: Optional[List[float]] = None + """ + The sparse embedding values to search with. + Optional. + """ + + sparse_indices: Optional[List[int]] = None + """ + The sparse embedding indices to search with. + Optional. + """ + + def as_dict(self) -> dict: + """ + Returns the SearchQueryVector as a dictionary. 
+ """ + d = { + "values": self.values, + "sparse_values": self.sparse_values, + "sparse_indices": self.sparse_indices, + } + return {k: v for k, v in d.items() if v is not None} diff --git a/pinecone/data/dataclasses/search_rerank.py b/pinecone/data/dataclasses/search_rerank.py new file mode 100644 index 000000000..1b9534ba3 --- /dev/null +++ b/pinecone/data/dataclasses/search_rerank.py @@ -0,0 +1,61 @@ +from dataclasses import dataclass +from typing import Optional, Dict, Any, List +from ..features.inference import RerankModel + + +@dataclass +class SearchRerank: + """ + SearchRerank represents a rerank request when searching within a specific namespace. + """ + + model: str + """ + The name of the [reranking model](https://docs.pinecone.io/guides/inference/understanding-inference#reranking-models) to use. + Required. + """ + + rank_fields: List[str] + """ + The fields to use for reranking. + Required. + """ + + top_n: Optional[int] = None + """ + The number of top results to return after reranking. Defaults to top_k. + Optional. + """ + + parameters: Optional[Dict[str, Any]] = None + """ + Additional model-specific parameters. Refer to the [model guide](https://docs.pinecone.io/guides/inference/understanding-inference#models) + for available model parameters. + Optional. + """ + + query: Optional[str] = None + """ + The query to rerank documents against. If a specific rerank query is specified, it overwrites + the query input that was provided at the top level. + """ + + def __post_init__(self): + """ + Converts `model` to a string if an instance of `RerankEnum` is provided. + """ + if isinstance(self.model, RerankModel): + self.model = self.model.value # Convert Enum to string + + def as_dict(self) -> Dict[str, Any]: + """ + Returns the SearchRerank as a dictionary. 
+ """ + d = { + "model": self.model, + "rank_fields": self.rank_fields, + "top_n": self.top_n, + "parameters": self.parameters, + "query": self.query, + } + return {k: v for k, v in d.items() if v is not None} diff --git a/pinecone/data/features/inference/inference_request_builder.py b/pinecone/data/features/inference/inference_request_builder.py index 1c4e7e12a..0a8611390 100644 --- a/pinecone/data/features/inference/inference_request_builder.py +++ b/pinecone/data/features/inference/inference_request_builder.py @@ -7,6 +7,7 @@ Document, RerankRequest, ) +from pinecone.utils import convert_enum_to_string class EmbedModel(Enum): @@ -27,8 +28,7 @@ def embed_request( inputs: Union[str, List[Dict], List[str]], parameters: Optional[Dict[str, Any]] = None, ) -> EmbedRequest: - if isinstance(model, EmbedModel): - model = model.value + model = convert_enum_to_string(model) embeddings_inputs: List[EmbedRequestInputs] = [] if isinstance(inputs, str): embeddings_inputs = [EmbedRequestInputs(text=inputs)] diff --git a/pinecone/data/index.py b/pinecone/data/index.py index 7958e808f..bb57df262 100644 --- a/pinecone/data/index.py +++ b/pinecone/data/index.py @@ -14,8 +14,10 @@ IndexDescription as DescribeIndexStatsResponse, UpsertResponse, ListResponse, + UpsertRecord, + SearchRecordsResponse, ) -from .dataclasses import Vector, SparseValues, FetchResponse +from .dataclasses import Vector, SparseValues, FetchResponse, SearchQuery, SearchRerank from .interfaces import IndexInterface from .request_factory import IndexRequestFactory from .features.bulk_import import ImportFeatureMixin @@ -26,6 +28,8 @@ VectorTuple, VectorTupleWithMetadata, FilterTypedDict, + SearchRerankTypedDict, + SearchQueryTypedDict, ) from ..utils import ( setup_openapi_client, @@ -201,6 +205,49 @@ def upsert_from_dataframe( return UpsertResponse(upserted_count=upserted_count) + def upsert_records(self, namespace: str, records: List[Dict]): + if namespace is None: + raise ValueError("namespace is required when 
upserting records") + if not records or len(records) == 0: + raise ValueError("No records provided") + + records_to_upsert = [] + for record in records: + if not record.get("_id") and not record.get("id"): + raise ValueError("Each record must have an '_id' or 'id' value") + + records_to_upsert.append( + UpsertRecord( + record.get("_id", record.get("id")), + **{k: v for k, v in record.items() if k not in {"_id", "id"}}, + ) + ) + + self._vector_api.upsert_records_namespace(namespace, records_to_upsert) + + def search( + self, + namespace: str, + query: Union[SearchQueryTypedDict, SearchQuery], + rerank: Optional[Union[SearchRerankTypedDict, SearchRerank]] = None, + fields: Optional[List[str]] = ["*"], # Default to returning all fields + ) -> SearchRecordsResponse: + if not namespace: + raise Exception("Namespace is required when searching records") + + request = IndexRequestFactory.search_request(query=query, rerank=rerank, fields=fields) + + return self._vector_api.search_records_namespace(namespace, request) + + def search_records( + self, + namespace: str, + query: Union[SearchQueryTypedDict, SearchQuery], + rerank: Optional[Union[SearchRerankTypedDict, SearchRerank]] = None, + fields: Optional[List[str]] = ["*"], # Default to returning all fields + ) -> SearchRecordsResponse: + return self.search(namespace, query=query, rerank=rerank, fields=fields) + @validate_and_convert_errors def delete( self, diff --git a/pinecone/data/interfaces.py b/pinecone/data/interfaces.py index 5dd2bd968..2930e2754 100644 --- a/pinecone/data/interfaces.py +++ b/pinecone/data/interfaces.py @@ -9,6 +9,7 @@ Vector, ListResponse, SparseValues, + SearchRecordsResponse, ) from .query_results_aggregator import QueryNamespacesResults from multiprocessing.pool import ApplyResult @@ -20,6 +21,7 @@ VectorTuple, VectorTupleWithMetadata, ) +from .dataclasses import SearchQuery, SearchRerank class IndexInterface(ABC): @@ -102,6 +104,56 @@ def upsert_from_dataframe( """ pass + @abstractmethod + def 
upsert_records(self, namespace: str, records: List[Dict]): + """ + Upsert records to a namespace. + + Converts records into embeddings and upserts them into a namespacce in the index. + + :param namespace: The namespace of the index to upsert records to. + :type namespace: str, required + :param records: The records to upsert into the index. + :type records: List[Dict], required + """ + pass + + @abstractmethod + def search( + self, + namespace: str, + query: Union[Dict, SearchQuery], + rerank: Optional[Union[Dict, SearchRerank]] = None, + fields: Optional[List[str]] = ["*"], # Default to returning all fields + ) -> SearchRecordsResponse: + """ + Search for records. + + This operation converts a query to a vector embedding and then searches a namespace. You + can optionally provide a reranking operation as part of the search. + + :param namespace: The namespace in the index to search. + :type namespace: str, required + :param query: The SearchQuery to use for the search. + :type query: Union[Dict, SearchQuery], required + :param rerank: The SearchRerank to use with the search request. + :type rerank: Union[Dict, SearchRerank], optional + :return: The records that match the search. 
+ :rtype: RecordModel + """ + pass + + @abstractmethod + def search_records( + self, + namespace: str, + query: Union[Dict, SearchQuery], + rerank: Optional[Union[Dict, SearchRerank]] = None, + fields: Optional[List[str]] = ["*"], # Default to returning all fields + ) -> SearchRecordsResponse: + """Alias of the search() method.""" + pass + @abstractmethod def delete( self, diff --git a/pinecone/data/request_factory.py b/pinecone/data/request_factory.py index a17965fa9..270963928 100644 --- a/pinecone/data/request_factory.py +++ b/pinecone/data/request_factory.py @@ -7,8 +7,13 @@ DeleteRequest, UpdateRequest, DescribeIndexStatsRequest, + SearchRecordsRequest, + SearchRecordsRequestQuery, + SearchRecordsRequestRerank, + VectorValues, + SearchRecordsVector, ) -from ..utils import parse_non_empty_args +from ..utils import parse_non_empty_args, convert_enum_to_string from .vector_factory import VectorFactory from .sparse_values_factory import SparseValuesFactory from pinecone.openapi_support import OPENAPI_ENDPOINT_PARAMS @@ -19,8 +24,12 @@ VectorTuple, VectorTupleWithMetadata, FilterTypedDict, + SearchQueryTypedDict, + SearchRerankTypedDict, + SearchQueryVectorTypedDict, ) -from .dataclasses import Vector, SparseValues + +from .dataclasses import Vector, SparseValues, SearchQuery, SearchRerank, SearchQueryVector logger = logging.getLogger(__name__) @@ -150,3 +159,90 @@ def list_paginated_args( ("pagination_token", pagination_token), ] ) + + @staticmethod + def search_request( + query: Union[SearchQueryTypedDict, SearchQuery], + rerank: Optional[Union[SearchRerankTypedDict, SearchRerank]] = None, + fields: Optional[List[str]] = ["*"], # Default to returning all fields + ) -> SearchRecordsRequest: + request_args = parse_non_empty_args( + [ + ("query", IndexRequestFactory._parse_search_query(query)), + ("fields", fields), + ("rerank", IndexRequestFactory._parse_search_rerank(rerank)), + ] + ) + + return SearchRecordsRequest(**request_args) + + @staticmethod + def 
_parse_search_query( + query: Union[SearchQueryTypedDict, SearchQuery], + ) -> SearchRecordsRequestQuery: + if isinstance(query, SearchQuery): + query_dict = query.as_dict() + else: + query_dict = query + + required_fields = {"top_k"} + for key in required_fields: + if query_dict.get(key, None) is None: + raise ValueError(f"Missing required field '{key}' in search query.") + + # User-provided dict could contain object that need conversion + if isinstance(query_dict.get("vector", None), SearchQueryVector): + query_dict["vector"] = query_dict["vector"].as_dict() + + srrq = SearchRecordsRequestQuery( + **{k: v for k, v in query_dict.items() if k not in {"vector"}} + ) + if query_dict.get("vector", None) is not None: + srrq.vector = IndexRequestFactory._parse_search_vector(query_dict["vector"]) + return srrq + + @staticmethod + def _parse_search_vector( + vector: Optional[Union[SearchQueryVectorTypedDict, SearchQueryVector]], + ): + if vector is None: + return None + + if isinstance(vector, SearchQueryVector): + if vector.values is None and vector.sparse_values is None: + return None + vector_dict = vector.as_dict() + else: + vector_dict = vector + if ( + vector_dict.get("values", None) is None + and vector_dict.get("sparse_values", None) is None + ): + return None + + srv = SearchRecordsVector(**{k: v for k, v in vector_dict.items() if k not in {"values"}}) + + values = vector_dict.get("values", None) + if values is not None: + srv.values = VectorValues(value=values) + + return srv + + @staticmethod + def _parse_search_rerank(rerank: Optional[Union[SearchRerankTypedDict, SearchRerank]] = None): + if rerank is None: + return None + + if isinstance(rerank, SearchRerank): + rerank_dict = rerank.as_dict() + else: + rerank_dict = rerank + + required_fields = {"model", "rank_fields"} + for key in required_fields: + if rerank_dict.get(key, None) is None: + raise ValueError(f"Missing required field '{key}' in rerank.") + + rerank_dict["model"] = 
convert_enum_to_string(rerank_dict["model"]) + + return SearchRecordsRequestRerank(**rerank_dict) diff --git a/pinecone/data/types/__init__.py b/pinecone/data/types/__init__.py index 38f9c0879..ccc9fb50d 100644 --- a/pinecone/data/types/__init__.py +++ b/pinecone/data/types/__init__.py @@ -3,3 +3,6 @@ from .vector_metadata_dict import VectorMetadataTypedDict from .vector_tuple import VectorTuple, VectorTupleWithMetadata from .query_filter import FilterTypedDict +from .search_rerank_typed_dict import SearchRerankTypedDict +from .search_query_typed_dict import SearchQueryTypedDict +from .search_query_vector_typed_dict import SearchQueryVectorTypedDict diff --git a/pinecone/data/types/search_query_typed_dict.py b/pinecone/data/types/search_query_typed_dict.py new file mode 100644 index 000000000..118607ae0 --- /dev/null +++ b/pinecone/data/types/search_query_typed_dict.py @@ -0,0 +1,36 @@ +from typing import TypedDict, Optional, Union, Dict, Any +from .search_query_vector_typed_dict import SearchQueryVectorTypedDict + + +class SearchQueryTypedDict(TypedDict): + """ + SearchQuery represents the query when searching within a specific namespace. + """ + + inputs: Dict[str, Any] + """ + The input data to search with. + Required. + """ + + top_k: int + """ + The number of results to return with each search. + Required. + """ + + filter: Optional[Dict[str, Any]] = None + """ + The filter to apply to the search. + Optional. + """ + + vector: Optional[Union[SearchQueryVectorTypedDict]] + """ + The vector values to search with. If provided, it overwrites the inputs. + """ + + id: Optional[str] + """ + The unique ID of the vector to be used as a query vector. 
+ """ diff --git a/pinecone/data/types/search_query_vector_typed_dict.py b/pinecone/data/types/search_query_vector_typed_dict.py new file mode 100644 index 000000000..4269b904a --- /dev/null +++ b/pinecone/data/types/search_query_vector_typed_dict.py @@ -0,0 +1,25 @@ +from typing import TypedDict, Optional, List + + +class SearchQueryVectorTypedDict(TypedDict): + """ + SearchQueryVector represents the vector values used to query. + """ + + values: Optional[List[float]] + """ + The vector data included in the search request. + Optional. + """ + + sparse_values: Optional[List[float]] + """ + The sparse embedding values to search with. + Optional. + """ + + sparse_indices: Optional[List[int]] + """ + The sparse embedding indices to search with. + Optional. + """ diff --git a/pinecone/data/types/search_rerank_typed_dict.py b/pinecone/data/types/search_rerank_typed_dict.py new file mode 100644 index 000000000..89c4f8d8a --- /dev/null +++ b/pinecone/data/types/search_rerank_typed_dict.py @@ -0,0 +1,41 @@ +from typing import TypedDict, Optional, Union, Dict, Any +from ..features.inference import RerankModel + + +class SearchRerankTypedDict(TypedDict): + # """ + # SearchRerank represents a rerank request when searching within a specific namespace. + # """ + + model: Union[str, RerankModel] + # model: str + # """ + # The name of the [reranking model](https://docs.pinecone.io/guides/inference/understanding-inference#reranking-models) to use. + # Required. + # """ + + rank_fields: list[str] + # rank_fields: List[str] + # """ + # The fields to use for reranking. + # Required. + # """ + + top_n: Optional[int] + # """ + # The number of top results to return after reranking. Defaults to top_k. + # Optional. + # """ + + parameters: Optional[Dict[str, Any]] + # """ + # Additional model-specific parameters. Refer to the [model guide](https://docs.pinecone.io/guides/inference/understanding-inference#models) + # for available model parameters. + # Optional. 
+ # """ + + query: Optional[str] + # """ + # The query to rerank documents against. If a specific rerank query is specified, it overwrites + # the query input that was provided at the top level. + # """ diff --git a/pinecone/openapi_support/api_client.py b/pinecone/openapi_support/api_client.py index e9766b19e..19767a848 100644 --- a/pinecone/openapi_support/api_client.py +++ b/pinecone/openapi_support/api_client.py @@ -557,8 +557,7 @@ def request( ) else: raise PineconeApiValueError( - "http method must be `GET`, `HEAD`, `OPTIONS`," - " `POST`, `PATCH`, `PUT` or `DELETE`." + "http method must be `GET`, `HEAD`, `OPTIONS`, `POST`, `PATCH`, `PUT` or `DELETE`." ) @classmethod diff --git a/pinecone/openapi_support/asyncio_api_client.py b/pinecone/openapi_support/asyncio_api_client.py index d21b0a1cb..068259d59 100644 --- a/pinecone/openapi_support/asyncio_api_client.py +++ b/pinecone/openapi_support/asyncio_api_client.py @@ -552,8 +552,7 @@ async def request( ) else: raise PineconeApiValueError( - "http method must be `GET`, `HEAD`, `OPTIONS`," - " `POST`, `PATCH`, `PUT` or `DELETE`." + "http method must be `GET`, `HEAD`, `OPTIONS`, `POST`, `PATCH`, `PUT` or `DELETE`." ) @classmethod diff --git a/pinecone/openapi_support/configuration.py b/pinecone/openapi_support/configuration.py index fa7dfa8b6..1fdee2ea2 100644 --- a/pinecone/openapi_support/configuration.py +++ b/pinecone/openapi_support/configuration.py @@ -418,8 +418,9 @@ def get_host_from_settings(self, index, variables=None, servers=None): server = servers[index] except IndexError: raise ValueError( - "Invalid index {0} when selecting the host settings. " - "Must be less than {1}".format(index, len(servers)) + "Invalid index {0} when selecting the host settings. 
Must be less than {1}".format( + index, len(servers) + ) ) url = server["url"] @@ -430,8 +431,7 @@ def get_host_from_settings(self, index, variables=None, servers=None): if "enum_values" in variable and used_value not in variable["enum_values"]: raise ValueError( - "The variable `{0}` in the host URL has invalid value " - "{1}. Must be {2}.".format( + "The variable `{0}` in the host URL has invalid value {1}. Must be {2}.".format( variable_name, variables[variable_name], variable["enum_values"] ) ) diff --git a/pinecone/openapi_support/exceptions.py b/pinecone/openapi_support/exceptions.py index cd6045cd3..4107c4e75 100644 --- a/pinecone/openapi_support/exceptions.py +++ b/pinecone/openapi_support/exceptions.py @@ -99,7 +99,7 @@ def __init__(self, status=None, reason=None, http_resp=None): def __str__(self): """Custom error messages for exception""" - error_message = "({0})\n" "Reason: {1}\n".format(self.status, self.reason) + error_message = "({0})\nReason: {1}\n".format(self.status, self.reason) if self.headers: error_message += "HTTP response headers: {0}\n".format(self.headers) diff --git a/pinecone/openapi_support/model_utils.py b/pinecone/openapi_support/model_utils.py index 9db3c72b7..a96e59b8e 100644 --- a/pinecone/openapi_support/model_utils.py +++ b/pinecone/openapi_support/model_utils.py @@ -1653,10 +1653,8 @@ def type_error_message(var_value=None, var_name=None, valid_classes=None, key_ty if key_type: key_or_value = "key" valid_classes_phrase = get_valid_classes_phrase(valid_classes) - msg = ( - "Invalid type for variable '{0}'. Required {1} type {2} and " "passed type was {3}".format( - var_name, key_or_value, valid_classes_phrase, type(var_value).__name__ - ) + msg = "Invalid type for variable '{0}'. 
Required {1} type {2} and passed type was {3}".format( + var_name, key_or_value, valid_classes_phrase, type(var_value).__name__ ) return msg diff --git a/pinecone/openapi_support/rest_urllib3.py b/pinecone/openapi_support/rest_urllib3.py index fef886296..59d3457e5 100644 --- a/pinecone/openapi_support/rest_urllib3.py +++ b/pinecone/openapi_support/rest_urllib3.py @@ -1,6 +1,5 @@ import json import logging -import re import ssl import os from urllib.parse import urlencode, quote @@ -162,12 +161,18 @@ def request( headers["Content-Type"] = "application/json" if query_params: url += "?" + urlencode(query_params, quote_via=quote) - if ("Content-Type" not in headers) or ( - re.search("json", headers["Content-Type"], re.IGNORECASE) - ): - request_body = None - if body is not None: - request_body = json.dumps(body) + + content_type = headers.get("Content-Type", "").lower() + if content_type == "" or ("json" in content_type): + if body is None: + request_body = None + else: + if content_type == "application/x-ndjson": + # for x-ndjson requests, we are expecting an array of elements + # that need to be converted to a newline separated string + request_body = "\n".join(json.dumps(element) for element in body) + else: # content_type == "application/json": + request_body = json.dumps(body) r = self.pool_manager.request( method, url, @@ -176,7 +181,8 @@ def request( timeout=timeout, headers=headers, ) - elif headers["Content-Type"] == "application/x-www-form-urlencoded": # noqa: E501 + + elif content_type == "application/x-www-form-urlencoded": # noqa: E501 r = self.pool_manager.request( method, url, @@ -186,7 +192,7 @@ def request( timeout=timeout, headers=headers, ) - elif headers["Content-Type"] == "multipart/form-data": + elif content_type == "multipart/form-data": # must del headers['Content-Type'], or the correct # Content-Type which generated by urllib3 will be # overwritten. 
diff --git a/pinecone/openapi_support/rest_utils.py b/pinecone/openapi_support/rest_utils.py index b002ef579..85a957ebd 100644 --- a/pinecone/openapi_support/rest_utils.py +++ b/pinecone/openapi_support/rest_utils.py @@ -11,11 +11,6 @@ ) logger = logging.getLogger(__name__) -logging.basicConfig( - format="%(levelname)s [%(asctime)s] %(name)s - %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - level=logging.DEBUG, -) class RESTResponse(io.IOBase): diff --git a/pinecone/utils/__init__.py b/pinecone/utils/__init__.py index 34e1d5aa5..5cd4d36c2 100644 --- a/pinecone/utils/__init__.py +++ b/pinecone/utils/__init__.py @@ -4,6 +4,7 @@ from .deprecation_notice import warn_deprecated from .fix_tuple_length import fix_tuple_length from .convert_to_list import convert_to_list +from .convert_enum_to_string import convert_enum_to_string from .normalize_host import normalize_host from .setup_openapi_client import setup_openapi_client, build_plugin_setup_client from .parse_args import parse_non_empty_args @@ -25,4 +26,5 @@ "docslinks", "install_json_repr_override", "validate_and_convert_errors", + "convert_enum_to_string", ] diff --git a/pinecone/utils/convert_enum_to_string.py b/pinecone/utils/convert_enum_to_string.py new file mode 100644 index 000000000..ba6558c2d --- /dev/null +++ b/pinecone/utils/convert_enum_to_string.py @@ -0,0 +1,8 @@ +from typing import Union +from enum import Enum + + +def convert_enum_to_string(value: Union[Enum, str]) -> str: + if isinstance(value, Enum): + return value.value + return value diff --git a/poetry.lock b/poetry.lock index b2ac36c89..2e5b50ed2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1627,6 +1627,33 @@ urllib3 = ">=1.25.10,<3.0" [package.extras] tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "tomli", "tomli-w", "types-PyYAML", "types-requests"] +[[package]] +name = "ruff" +version = "0.9.3" +description = "An extremely fast Python linter and code formatter, 
written in Rust." +optional = false +python-versions = ">=3.7" +files = [ + {file = "ruff-0.9.3-py3-none-linux_armv6l.whl", hash = "sha256:7f39b879064c7d9670197d91124a75d118d00b0990586549949aae80cdc16624"}, + {file = "ruff-0.9.3-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:a187171e7c09efa4b4cc30ee5d0d55a8d6c5311b3e1b74ac5cb96cc89bafc43c"}, + {file = "ruff-0.9.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:c59ab92f8e92d6725b7ded9d4a31be3ef42688a115c6d3da9457a5bda140e2b4"}, + {file = "ruff-0.9.3-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2dc153c25e715be41bb228bc651c1e9b1a88d5c6e5ed0194fa0dfea02b026439"}, + {file = "ruff-0.9.3-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:646909a1e25e0dc28fbc529eab8eb7bb583079628e8cbe738192853dbbe43af5"}, + {file = "ruff-0.9.3-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5a5a46e09355695fbdbb30ed9889d6cf1c61b77b700a9fafc21b41f097bfbba4"}, + {file = "ruff-0.9.3-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:c4bb09d2bbb394e3730d0918c00276e79b2de70ec2a5231cd4ebb51a57df9ba1"}, + {file = "ruff-0.9.3-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:96a87ec31dc1044d8c2da2ebbed1c456d9b561e7d087734336518181b26b3aa5"}, + {file = "ruff-0.9.3-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9bb7554aca6f842645022fe2d301c264e6925baa708b392867b7a62645304df4"}, + {file = "ruff-0.9.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cabc332b7075a914ecea912cd1f3d4370489c8018f2c945a30bcc934e3bc06a6"}, + {file = "ruff-0.9.3-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:33866c3cc2a575cbd546f2cd02bdd466fed65118e4365ee538a3deffd6fcb730"}, + {file = "ruff-0.9.3-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:006e5de2621304c8810bcd2ee101587712fa93b4f955ed0985907a36c427e0c2"}, + {file = "ruff-0.9.3-py3-none-musllinux_1_2_i686.whl", hash = 
"sha256:ba6eea4459dbd6b1be4e6bfc766079fb9b8dd2e5a35aff6baee4d9b1514ea519"}, + {file = "ruff-0.9.3-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:90230a6b8055ad47d3325e9ee8f8a9ae7e273078a66401ac66df68943ced029b"}, + {file = "ruff-0.9.3-py3-none-win32.whl", hash = "sha256:eabe5eb2c19a42f4808c03b82bd313fc84d4e395133fb3fc1b1516170a31213c"}, + {file = "ruff-0.9.3-py3-none-win_amd64.whl", hash = "sha256:040ceb7f20791dfa0e78b4230ee9dce23da3b64dd5848e40e3bf3ab76468dcf4"}, + {file = "ruff-0.9.3-py3-none-win_arm64.whl", hash = "sha256:800d773f6d4d33b0a3c60e2c6ae8f4c202ea2de056365acfa519aa48acf28e0b"}, + {file = "ruff-0.9.3.tar.gz", hash = "sha256:8293f89985a090ebc3ed1064df31f3b4b56320cdfcec8b60d3295bddb955c22a"}, +] + [[package]] name = "six" version = "1.16.0" @@ -1910,4 +1937,4 @@ grpc = ["googleapis-common-protos", "grpcio", "grpcio", "lz4", "protobuf", "prot [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "ed686662d6a59d70a89367d84a17857aef9143eeff20610190c95dcb5be7c71b" +content-hash = "b1bd9c6ecec94c1615c1b09c43808fca8344b16b5c15c61ec11775ad70b82448" diff --git a/pyproject.toml b/pyproject.toml index 911cb5ef9..c6f8665c7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,6 +88,7 @@ pytest-benchmark = [ ] urllib3_mock = "0.3.3" responses = ">=0.8.1" +ruff = "^0.9.3" [tool.poetry.extras] diff --git a/tests/__init__.py b/tests/__init__.py index e69de29bb..f2dab92a1 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1,5 @@ +import logging + +logging.basicConfig( + format="%(levelname)s [%(asctime)s] %(name)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S" +) diff --git a/tests/integration/data/conftest.py b/tests/integration/data/conftest.py index 473bf809e..c7498cb81 100644 --- a/tests/integration/data/conftest.py +++ b/tests/integration/data/conftest.py @@ -3,6 +3,7 @@ import json from ..helpers import get_environment_var, generate_index_name import logging +from pinecone import EmbedModel, CloudProvider, AwsRegion, IndexEmbed 
logger = logging.getLogger(__name__) @@ -61,6 +62,11 @@ def sparse_index_name(): return generate_index_name("sparse") +@pytest.fixture(scope="session") +def model_index_name(): + return generate_index_name("embed") + + def build_index_client(client, index_name, index_host): if use_grpc(): return client.Index(name=index_name, host=index_host) @@ -78,20 +84,51 @@ def sparse_idx(client, sparse_index_name, sparse_index_host): return build_index_client(client, sparse_index_name, sparse_index_host) +@pytest.fixture(scope="session") +def model_idx(client, model_index_name, model_index_host): + return build_index_client(client, model_index_name, model_index_host) + + +@pytest.fixture(scope="session") +def model_index_host(model_index_name): + pc = build_client() + + if model_index_name not in pc.list_indexes().names(): + logger.info(f"Creating index {model_index_name}") + pc.create_index_for_model( + name=model_index_name, + cloud=CloudProvider.AWS, + region=AwsRegion.US_WEST_2, + embed=IndexEmbed( + model=EmbedModel.Multilingual_E5_Large, + field_map={"text": "my_text_field"}, + metric="cosine", + ), + ) + else: + logger.info(f"Index {model_index_name} already exists") + + description = pc.describe_index(name=model_index_name) + yield description.host + + logger.info(f"Deleting index {model_index_name}") + pc.delete_index(model_index_name, -1) + + @pytest.fixture(scope="session") def index_host(index_name, metric, spec): pc = build_client() if index_name not in pc.list_indexes().names(): - logger.info("Creating index with name: " + index_name) + logger.info(f"Creating index {index_name}") pc.create_index(name=index_name, dimension=2, metric=metric, spec=spec) else: - logger.info("Index with name " + index_name + " already exists") + logger.info(f"Index {index_name} already exists") description = pc.describe_index(name=index_name) yield description.host - logger.info("Deleting index with name: " + index_name) + logger.info(f"Deleting index {index_name}") 
pc.delete_index(index_name, -1) @@ -100,15 +137,15 @@ def sparse_index_host(sparse_index_name, spec): pc = build_client() if sparse_index_name not in pc.list_indexes().names(): - logger.info("Creating sparse index with name: " + sparse_index_name) + logger.info(f"Creating index {sparse_index_name}") pc.create_index( name=sparse_index_name, metric="dotproduct", spec=spec, vector_type="sparse" ) else: - logger.info("Sparse index with name " + sparse_index_name + " already exists") + logger.info(f"Index {sparse_index_name} already exists") description = pc.describe_index(name=sparse_index_name) yield description.host - logger.info("Deleting sparse index with name: " + sparse_index_name) + logger.info(f"Deleting index {sparse_index_name}") pc.delete_index(sparse_index_name, -1) diff --git a/tests/integration/data/test_search_and_upsert_records.py b/tests/integration/data/test_search_and_upsert_records.py new file mode 100644 index 000000000..0b0f6444c --- /dev/null +++ b/tests/integration/data/test_search_and_upsert_records.py @@ -0,0 +1,221 @@ +import time +import pytest +from typing import List +from ..helpers import random_string, embedding_values +import logging + +from pinecone import RerankModel, PineconeApiException +from pinecone.data import _Index + +logger = logging.getLogger(__name__) + +model_index_dimension = 1024 # Currently controlled by "multilingual-e5-large" + + +def poll_until_fetchable(idx: _Index, namespace: str, ids: List[str], timeout: int): + found = False + total_wait = 0 + interval = 5 + + while not found: + if total_wait > timeout: + logger.debug(f"Failed to fetch records within {timeout} seconds.") + raise TimeoutError(f"Failed to fetch records within {timeout} seconds.") + time.sleep(interval) + total_wait += interval + + response = idx.fetch(ids=ids, namespace=namespace) + logger.debug( + f"Polling {total_wait} seconds for fetch response with ids {ids} in namespace {namespace}" + ) + + if len(response.vectors) == len(ids): + found = True + 
+ +@pytest.fixture +def records_to_upsert(): + return [ + { + "id": "test1", + "my_text_field": "Apple is a popular fruit known for its sweetness and crisp texture.", + "more_stuff": "more stuff!", + }, + { + "id": "test2", + "my_text_field": "The tech company Apple is known for its innovative products like the iPhone.", + "more_stuff": "more stuff!", + }, + { + "id": "test3", + "my_text_field": "Many people enjoy eating apples as a healthy snack.", + "more_stuff": "more stuff!", + }, + { + "id": "test4", + "my_text_field": "Apple Inc. has revolutionized the tech industry with its sleek designs and user-friendly interfaces.", + "more_stuff": "more stuff!", + }, + { + "id": "test5", + "my_text_field": "An apple a day keeps the doctor away, as the saying goes.", + "more_stuff": "more stuff!", + }, + { + "id": "test6", + "my_text_field": "Apple Computer Company was founded on April 1, 1976, by Steve Jobs, Steve Wozniak, and Ronald Wayne as a partnership.", + "more_stuff": "extra more stuff!", + }, + ] + + +class TestUpsertAndSearchRecords: + def test_search_records(self, model_idx, records_to_upsert): + target_namespace = random_string(10) + model_idx.upsert_records(namespace=target_namespace, records=records_to_upsert) + + poll_until_fetchable( + model_idx, target_namespace, [r["id"] for r in records_to_upsert], timeout=180 + ) + + response = model_idx.search_records( + namespace=target_namespace, query={"inputs": {"text": "Apple corporation"}, "top_k": 3} + ) + assert len(response.result.hits) == 3 + assert response.usage is not None + + # Test search alias + response2 = model_idx.search( + namespace=target_namespace, query={"inputs": {"text": "Apple corporation"}, "top_k": 3} + ) + assert response == response2 + + # validate similar records and contents + similar_record_ids = ["test2", "test4", "test6"] + + for hit in response.result.hits: + assert hit._id in similar_record_ids + similar_record_ids.remove(hit._id) + + assert hit._score > 0 + assert 
hit.fields.get("my_text_field") is not None + assert hit.fields.get("more_stuff") is not None + + # search for records while filtering fields + response_filtered = model_idx.search_records( + namespace="test-namespace", + query={"inputs": {"text": "Apple corporation"}, "top_k": 3}, + fields=["more_stuff"], + ) + + for hit in response_filtered.result.hits: + assert hit.fields.get("my_text_field") is None + assert hit.fields.get("more_stuff") is not None + + def test_search_records_with_vector(self, model_idx, records_to_upsert): + target_namespace = random_string(10) + model_idx.upsert_records(namespace=target_namespace, records=records_to_upsert) + + poll_until_fetchable( + model_idx, target_namespace, [r["id"] for r in records_to_upsert], timeout=180 + ) + + # Search for similar records + search_query = {"top_k": 3, "vector": {"values": embedding_values(model_index_dimension)}} + response = model_idx.search_records(namespace=target_namespace, query=search_query) + assert len(response.result.hits) == 3 + assert response.usage is not None + + # Test search alias + response2 = model_idx.search(namespace=target_namespace, query=search_query) + assert response == response2 + + @pytest.mark.parametrize("rerank_model", ["bge-reranker-v2-m3", RerankModel.Bge_Reranker_V2_M3]) + def test_search_with_rerank(self, model_idx, records_to_upsert, rerank_model): + target_namespace = random_string(10) + model_idx.upsert_records(namespace=target_namespace, records=records_to_upsert) + + poll_until_fetchable( + model_idx, target_namespace, [r["id"] for r in records_to_upsert], timeout=180 + ) + + # Search for similar records + response = model_idx.search_records( + namespace=target_namespace, + query={"inputs": {"text": "Apple corporation"}, "top_k": 3}, + rerank={"model": rerank_model, "rank_fields": ["my_text_field"], "top_n": 3}, + ) + assert len(response.result.hits) == 3 + assert response.usage is not None + + # validate similar records and contents + similar_record_ids = 
["test6", "test4", "test2"] + for hit in response.result.hits: + assert hit._id in similar_record_ids + similar_record_ids.remove(hit._id) + + assert hit._score > 0 + assert hit.fields.get("my_text_field") is not None + assert hit.fields.get("more_stuff") is not None + + def test_search_with_rerank_query(self, model_idx, records_to_upsert): + target_namespace = random_string(10) + model_idx.upsert_records(namespace=target_namespace, records=records_to_upsert) + + # Sleep for freshness + poll_until_fetchable( + model_idx, target_namespace, [r["id"] for r in records_to_upsert], timeout=180 + ) + + # Search for similar records + response = model_idx.search_records( + namespace=target_namespace, + query={"inputs": {"text": "Apple corporation"}, "top_k": 3}, + rerank={ + "model": "bge-reranker-v2-m3", + "rank_fields": ["my_text_field"], + "top_n": 3, + "query": "Apple corporation", + }, + ) + assert len(response.result.hits) == 3 + assert response.usage is not None + + +class TestUpsertAndSearchRecordsErrorCases: + def test_search_with_rerank_nonexistent_model_error(self, model_idx, records_to_upsert): + target_namespace = random_string(10) + model_idx.upsert_records(namespace=target_namespace, records=records_to_upsert) + + poll_until_fetchable( + model_idx, target_namespace, [r["id"] for r in records_to_upsert], timeout=180 + ) + + with pytest.raises(PineconeApiException, match=r"Model 'non-existent-model' not found"): + model_idx.search_records( + namespace=target_namespace, + query={"inputs": {"text": "Apple corporation"}, "top_k": 3}, + rerank={ + "model": "non-existent-model", + "rank_fields": ["my_text_field"], + "top_n": 3, + }, + ) + + @pytest.mark.skip(reason="Possible bug in the API") + def test_search_with_rerank_empty_rank_fields_error(self, model_idx, records_to_upsert): + target_namespace = random_string(10) + model_idx.upsert_records(namespace=target_namespace, records=records_to_upsert) + + poll_until_fetchable( + model_idx, target_namespace, [r["id"] 
for r in records_to_upsert], timeout=180 + ) + + with pytest.raises( + PineconeApiException, match=r"Only one rank field is supported for model" + ): + model_idx.search_records( + namespace="test-namespace", + query={"inputs": {"text": "Apple corporation"}, "top_k": 3}, + rerank={"model": "bge-reranker-v2-m3", "rank_fields": [], "top_n": 3}, + ) diff --git a/tests/integration/helpers/__init__.py b/tests/integration/helpers/__init__.py index 91f99733f..f233d0895 100644 --- a/tests/integration/helpers/__init__.py +++ b/tests/integration/helpers/__init__.py @@ -7,4 +7,5 @@ poll_stats_for_namespace, poll_fetch_for_ids_in_namespace, embedding_values, + jsonprint, ) diff --git a/tests/integration/helpers/helpers.py b/tests/integration/helpers/helpers.py index a2e4d7b92..795f1a5ed 100644 --- a/tests/integration/helpers/helpers.py +++ b/tests/integration/helpers/helpers.py @@ -6,6 +6,7 @@ import logging from typing import Any from datetime import datetime +import json logger = logging.getLogger(__name__) @@ -113,3 +114,7 @@ def poll_fetch_for_ids_in_namespace(idx, ids, namespace): def fake_api_key(): return "-".join([random_string(x) for x in [8, 4, 4, 4, 12]]) + + +def jsonprint(obj): + print(json.dumps(obj.to_dict(), indent=2)) diff --git a/tests/perf/test_re_vs_in.py b/tests/perf/test_re_vs_in.py new file mode 100644 index 000000000..8d32a94c1 --- /dev/null +++ b/tests/perf/test_re_vs_in.py @@ -0,0 +1,46 @@ +import pytest +import re + + +def use_in_match(headers): + ct = headers.get("Content-Type", "") + return "json" in ct + + +def use_re_match(headers): + ct = headers.get("Content-Type", "") + return re.search("json", ct, re.IGNORECASE) + + +class TestStringMatchPerformance: + @pytest.mark.parametrize( + "headers", + [ + {"Content-Type": "application/json"} + # {"Content-Type": "application/JSON"}, + # {"Content-Type": "application/xml"}, + # {"Content-Type": "text/html"}, + # {"Content-Type": "text/plain"}, + # {"Content-Type": "application/xml"}, + # 
{"Content-Type": "text/html"}, + # {"Content-Type": "text/plain"}, + ], + ) + def test_using_in(self, benchmark, headers): + benchmark.pedantic(use_in_match, (headers,), rounds=100, warmup_rounds=1, iterations=50) + + @pytest.mark.parametrize( + "headers", + [ + {"Content-Type": "application/json"} + # {"Content-Type": "application/JSON"}, + # {"Content-Type": "application/xml"}, + # {"Content-Type": "text/html"}, + # {"Content-Type": "text/plain"}, + # {"Content-Type": "application/xml"}, + # {"Content-Type": "text/html"}, + # {"Content-Type": "text/plain"}, + ], + ) + def test_using_re(self, benchmark, headers): + benchmark.pedantic(use_re_match, (headers,), rounds=100, warmup_rounds=1, iterations=50) diff --git a/tests/unit/data/test_request_factory.py b/tests/unit/data/test_request_factory.py new file mode 100644 index 000000000..087436c91 --- /dev/null +++ b/tests/unit/data/test_request_factory.py @@ -0,0 +1,401 @@ +import pytest +from pinecone.data.request_factory import ( + IndexRequestFactory, + SearchQuery, + SearchQueryVector, + SearchRerank, +) + +from pinecone.core.openapi.db_data.models import ( + SearchRecordsRequestQuery, + SearchRecordsRequestRerank, + SearchRecordsVector, + VectorValues, + SearchRecordsRequest, +) + +from pinecone import RerankModel + + +class TestIndexRequestFactory: + def test_parsing_search_query_vector_dict(self): + vector1 = IndexRequestFactory._parse_search_vector(None) + assert vector1 is None + + vector2 = IndexRequestFactory._parse_search_vector({"values": [0.1, 0.2, 0.3]}) + assert vector2 == SearchRecordsVector(values=VectorValues([0.1, 0.2, 0.3])) + + vector3 = IndexRequestFactory._parse_search_vector( + { + "values": [0.1, 0.2, 0.3], + "sparse_values": [0.4, 0.5, 0.6], + "sparse_indices": [1, 2, 3], + } + ) + assert vector3 == SearchRecordsVector( + values=VectorValues([0.1, 0.2, 0.3]), + sparse_indices=[1, 2, 3], + sparse_values=[0.4, 0.5, 0.6], + ) + + vector4 = IndexRequestFactory._parse_search_vector( + {"values": 
[], "sparse_values": [0.4, 0.5, 0.6], "sparse_indices": [1, 2, 3]} + ) + assert vector4 == SearchRecordsVector( + values=VectorValues([]), sparse_indices=[1, 2, 3], sparse_values=[0.4, 0.5, 0.6] + ) + + vector5 = IndexRequestFactory._parse_search_vector( + {"values": [0.1, 0.2, 0.3], "sparse_values": [], "sparse_indices": []} + ) + assert vector5 == SearchRecordsVector( + values=VectorValues([0.1, 0.2, 0.3]), sparse_indices=[], sparse_values=[] + ) + + vector6 = IndexRequestFactory._parse_search_vector( + {"values": [], "sparse_values": [], "sparse_indices": []} + ) + assert vector6 == SearchRecordsVector( + values=VectorValues([]), sparse_indices=[], sparse_values=[] + ) + + vector7 = IndexRequestFactory._parse_search_vector({}) + assert vector7 is None + + vector8 = IndexRequestFactory._parse_search_vector({"values": []}) + assert vector8 == SearchRecordsVector(values=VectorValues([])) + + vector9 = IndexRequestFactory._parse_search_vector( + {"sparse_values": [0.8], "sparse_indices": [100]} + ) + assert vector9 == SearchRecordsVector(sparse_indices=[100], sparse_values=[0.8]) + + def test_parsing_search_query_vector_object(self): + vector2 = IndexRequestFactory._parse_search_vector( + SearchQueryVector(values=[0.1, 0.2, 0.3]) + ) + assert vector2 == SearchRecordsVector(values=VectorValues([0.1, 0.2, 0.3])) + + vector3 = IndexRequestFactory._parse_search_vector( + SearchQueryVector( + values=[0.1, 0.2, 0.3], sparse_values=[0.4, 0.5, 0.6], sparse_indices=[1, 2, 3] + ) + ) + assert vector3 == SearchRecordsVector( + values=VectorValues([0.1, 0.2, 0.3]), + sparse_indices=[1, 2, 3], + sparse_values=[0.4, 0.5, 0.6], + ) + + vector4 = IndexRequestFactory._parse_search_vector( + SearchQueryVector(values=[], sparse_values=[0.4, 0.5, 0.6], sparse_indices=[1, 2, 3]) + ) + assert vector4 == SearchRecordsVector( + values=VectorValues([]), sparse_indices=[1, 2, 3], sparse_values=[0.4, 0.5, 0.6] + ) + + vector5 = IndexRequestFactory._parse_search_vector( + 
SearchQueryVector(values=[0.1, 0.2, 0.3], sparse_values=[], sparse_indices=[]) + ) + assert vector5 == SearchRecordsVector( + values=VectorValues([0.1, 0.2, 0.3]), sparse_indices=[], sparse_values=[] + ) + + vector6 = IndexRequestFactory._parse_search_vector( + SearchQueryVector(values=[], sparse_values=[], sparse_indices=[]) + ) + assert vector6 == SearchRecordsVector( + values=VectorValues([]), sparse_indices=[], sparse_values=[] + ) + + vector7 = IndexRequestFactory._parse_search_vector(SearchQueryVector(values=[])) + assert vector7 == SearchRecordsVector(values=VectorValues([])) + + def test_parse_search_query_dict(self): + query1 = IndexRequestFactory._parse_search_query( + { + "inputs": {"text": "Apple corporation"}, + "top_k": 3, + "vector": {"values": [0.1, 0.2, 0.3]}, + } + ) + assert query1 == SearchRecordsRequestQuery( + inputs={"text": "Apple corporation"}, + top_k=3, + vector=SearchRecordsVector(values=VectorValues([0.1, 0.2, 0.3])), + ) + + query2 = IndexRequestFactory._parse_search_query( + { + "inputs": {"text": "Apple corporation"}, + "top_k": 3, + "vector": { + "values": [0.1, 0.2, 0.3], + "sparse_values": [0.4, 0.5, 0.6], + "sparse_indices": [1, 2, 3], + }, + } + ) + assert query2 == SearchRecordsRequestQuery( + inputs={"text": "Apple corporation"}, + top_k=3, + vector=SearchRecordsVector( + values=VectorValues([0.1, 0.2, 0.3]), + sparse_indices=[1, 2, 3], + sparse_values=[0.4, 0.5, 0.6], + ), + ) + + query3 = IndexRequestFactory._parse_search_query( + { + "inputs": {"text": "Apple corporation"}, + "top_k": 3, + "id": "test_id", + "filter": {"genre": {"$in": ["action"]}}, + } + ) + assert query3 == SearchRecordsRequestQuery( + inputs={"text": "Apple corporation"}, + top_k=3, + id="test_id", + filter={"genre": {"$in": ["action"]}}, + ) + + def test_parse_search_query_missing_required_fields(self): + with pytest.raises(ValueError) as e: + IndexRequestFactory._parse_search_query({"inputs": {"text": "Apple corporation"}}) + assert str(e.value) == 
"Missing required field 'top_k' in search query." + + def test_parse_search_query_with_objects(self): + query = IndexRequestFactory._parse_search_query( + SearchQuery( + inputs={"text": "Apple corporation"}, + top_k=3, + filter={"genre": {"$in": ["action"]}}, + id="test_id", + vector=SearchQueryVector( + values=[0.1, 0.2, 0.3], sparse_indices=[1, 2, 3], sparse_values=[0.4, 0.5, 0.6] + ), + ) + ) + assert query == SearchRecordsRequestQuery( + inputs={"text": "Apple corporation"}, + top_k=3, + id="test_id", + filter={"genre": {"$in": ["action"]}}, + vector=SearchRecordsVector( + values=VectorValues([0.1, 0.2, 0.3]), + sparse_indices=[1, 2, 3], + sparse_values=[0.4, 0.5, 0.6], + ), + ) + + def test_parse_search_query_with_mixed_types(self): + query = IndexRequestFactory._parse_search_query( + SearchQuery( + inputs={"text": "Apple corporation"}, + top_k=3, + filter={"genre": {"$in": ["action"]}}, + id="test_id", + vector={ + "values": [0.1, 0.2, 0.3], + "sparse_indices": [1, 2, 3], + "sparse_values": [0.4, 0.5, 0.6], + }, + ) + ) + assert query == SearchRecordsRequestQuery( + inputs={"text": "Apple corporation"}, + top_k=3, + id="test_id", + filter={"genre": {"$in": ["action"]}}, + vector=SearchRecordsVector( + values=VectorValues([0.1, 0.2, 0.3]), + sparse_indices=[1, 2, 3], + sparse_values=[0.4, 0.5, 0.6], + ), + ) + + def test_parse_search_rerank_dict(self): + rerank = IndexRequestFactory._parse_search_rerank( + {"model": "bge-reranker-v2-m3", "rank_fields": ["my_text_field"], "top_n": 3} + ) + assert rerank == SearchRecordsRequestRerank( + model="bge-reranker-v2-m3", rank_fields=["my_text_field"], top_n=3 + ) + + # Enum used in dict + rerank2 = IndexRequestFactory._parse_search_rerank( + {"model": RerankModel.Bge_Reranker_V2_M3, "rank_fields": ["my_text_field"], "top_n": 3} + ) + assert rerank2 == SearchRecordsRequestRerank( + model="bge-reranker-v2-m3", rank_fields=["my_text_field"], top_n=3 + ) + + rerank3 = IndexRequestFactory._parse_search_rerank( + { + 
"model": "bge-reranker-v2-m3", + "rank_fields": ["my_text_field"], + "top_n": 3, + "parameters": {"key": "value"}, + "query": "foo", + } + ) + assert rerank3 == SearchRecordsRequestRerank( + model="bge-reranker-v2-m3", + rank_fields=["my_text_field"], + top_n=3, + parameters={"key": "value"}, + query="foo", + ) + + def test_parse_search_rerank_missing_required_fields(self): + with pytest.raises(ValueError) as e: + IndexRequestFactory._parse_search_rerank({"rank_fields": ["my_text_field"], "top_n": 3}) + assert str(e.value) == "Missing required field 'model' in rerank." + + with pytest.raises(ValueError) as e: + IndexRequestFactory._parse_search_rerank( + {"rank_fields": ["my_text_field"], "model": None} + ) + assert str(e.value) == "Missing required field 'model' in rerank." + + with pytest.raises(ValueError) as e: + IndexRequestFactory._parse_search_rerank({"rank_fields": None, "model": "foo"}) + assert str(e.value) == "Missing required field 'rank_fields' in rerank." + + with pytest.raises(ValueError) as e: + IndexRequestFactory._parse_search_rerank({"model": "bge-reranker-v2-m3", "top_n": 3}) + assert str(e.value) == "Missing required field 'rank_fields' in rerank." + + with pytest.raises(ValueError) as e: + IndexRequestFactory._parse_search_rerank( + {"model": RerankModel.Bge_Reranker_V2_M3, "top_n": 3} + ) + assert str(e.value) == "Missing required field 'rank_fields' in rerank." 
+ + def test_parse_search_rerank_object(self): + rerank = IndexRequestFactory._parse_search_rerank( + SearchRerank( + model=RerankModel.Bge_Reranker_V2_M3, rank_fields=["my_text_field"], top_n=3 + ) + ) + assert rerank == SearchRecordsRequestRerank( + model="bge-reranker-v2-m3", rank_fields=["my_text_field"], top_n=3 + ) + + rerank2 = IndexRequestFactory._parse_search_rerank( + SearchRerank( + model=RerankModel.Bge_Reranker_V2_M3, + rank_fields=["my_text_field"], + top_n=3, + parameters={"key": "value"}, + query="foo", + ) + ) + assert rerank2 == SearchRecordsRequestRerank( + model="bge-reranker-v2-m3", + rank_fields=["my_text_field"], + top_n=3, + parameters={"key": "value"}, + query="foo", + ) + + rerank3 = IndexRequestFactory._parse_search_rerank( + SearchRerank(model="unknown-model", rank_fields=["my_text_field"]) + ) + assert rerank3 == SearchRecordsRequestRerank( + model="unknown-model", rank_fields=["my_text_field"] + ) + + def test_search_request_with_dicts(self): + request = IndexRequestFactory.search_request( + query={ + "inputs": {"text": "Apple corporation"}, + "top_k": 3, + "vector": {"values": [0.1, 0.2, 0.3]}, + }, + fields=["more_stuff"], + rerank={"model": "bge-reranker-v2-m3", "rank_fields": ["my_text_field"], "top_n": 3}, + ) + + assert request is not None + assert request.query.vector == SearchRecordsVector(values=VectorValues([0.1, 0.2, 0.3])) + + def test_search_request_with_objects_and_enums(self): + factory = IndexRequestFactory() + request = factory.search_request( + query=SearchQuery( + inputs={"text": "Apple corporation"}, + top_k=3, + vector=SearchQueryVector(values=[0.1, 0.2, 0.3]), + filter={"genre": {"$in": ["action"]}}, + id="test_id", + ), + fields=["more_stuff"], + rerank=SearchRerank( + model=RerankModel.Bge_Reranker_V2_M3, + rank_fields=["my_text_field"], + top_n=3, + parameters={"key": "value"}, + query="foo", + ), + ) + + assert request == SearchRecordsRequest( + query=SearchRecordsRequestQuery( + inputs={"text": "Apple 
corporation"}, + top_k=3, + vector=SearchRecordsVector(values=VectorValues([0.1, 0.2, 0.3])), + filter={"genre": {"$in": ["action"]}}, + id="test_id", + ), + fields=["more_stuff"], + rerank=SearchRecordsRequestRerank( + model="bge-reranker-v2-m3", + rank_fields=["my_text_field"], + top_n=3, + parameters={"key": "value"}, + query="foo", + ), + ) + + def test_search_request_with_no_rerank(self): + factory = IndexRequestFactory() + request = factory.search_request( + query=SearchQuery( + inputs={"text": "Apple corporation"}, + top_k=3, + vector=SearchQueryVector(values=[0.1, 0.2, 0.3]), + filter={"genre": {"$in": ["action"]}}, + id="test_id", + ), + fields=["more_stuff"], + ) + + assert request == SearchRecordsRequest( + query=SearchRecordsRequestQuery( + inputs={"text": "Apple corporation"}, + top_k=3, + vector=SearchRecordsVector(values=VectorValues([0.1, 0.2, 0.3])), + filter={"genre": {"$in": ["action"]}}, + id="test_id", + ), + fields=["more_stuff"], + ) + + request2 = IndexRequestFactory.search_request( + query={ + "inputs": {"text": "Apple corporation"}, + "top_k": 3, + "vector": {"values": [0.1, 0.2, 0.3]}, + } + ) + assert request2 == SearchRecordsRequest( + query=SearchRecordsRequestQuery( + inputs={"text": "Apple corporation"}, + top_k=3, + vector=SearchRecordsVector(values=VectorValues([0.1, 0.2, 0.3])), + ), + fields=["*"], + ) From 60a1a60ccf3fdde5bf55fb38f81f3262e578dec8 Mon Sep 17 00:00:00 2001 From: Jen Hamon Date: Thu, 30 Jan 2025 08:50:29 -0500 Subject: [PATCH 2/5] Remove unneeded test --- tests/perf/test_re_vs_in.py | 46 ------------------------------------- 1 file changed, 46 deletions(-) delete mode 100644 tests/perf/test_re_vs_in.py diff --git a/tests/perf/test_re_vs_in.py b/tests/perf/test_re_vs_in.py deleted file mode 100644 index 8d32a94c1..000000000 --- a/tests/perf/test_re_vs_in.py +++ /dev/null @@ -1,46 +0,0 @@ -import pytest -import re - - -def use_in_match(headers): - ct = headers.get("Content-Type", "") - return "json" in ct - - -def 
use_re_match(headers): - ct = headers.get("Content-Type", "") - return re.search("json", ct, re.IGNORECASE) - - -class TestStringMatchPerformance: - @pytest.mark.parametrize( - "headers", - [ - {"Content-Type": "application/json"} - # {"Content-Type": "application/JSON"}, - # {"Content-Type": "application/xml"}, - # {"Content-Type": "text/html"}, - # {"Content-Type": "text/plain"}, - # {"Content-Type": "application/xml"}, - # {"Content-Type": "text/html"}, - # {"Content-Type": "text/plain"}, - ], - ) - def test_using_in(self, benchmark, headers): - benchmark.pedantic(use_in_match, (headers,), rounds=100, warmup_rounds=1, iterations=50) - - @pytest.mark.parametrize( - "headers", - [ - {"Content-Type": "application/json"} - # {"Content-Type": "application/JSON"}, - # {"Content-Type": "application/xml"}, - # {"Content-Type": "text/html"}, - # {"Content-Type": "text/plain"}, - # {"Content-Type": "application/xml"}, - # {"Content-Type": "text/html"}, - # {"Content-Type": "text/plain"}, - ], - ) - def test_using_re(self, benchmark, headers): - benchmark.pedantic(use_re_match, (headers,), rounds=100, warmup_rounds=1, iterations=50) From c447bd4ee8c4292adce8ee38094a1ab556b636c1 Mon Sep 17 00:00:00 2001 From: Jen Hamon Date: Thu, 30 Jan 2025 09:13:47 -0500 Subject: [PATCH 3/5] mypy fixes --- pinecone/control/pinecone.py | 6 +++- pinecone/data/index.py | 21 ++----------- pinecone/data/interfaces.py | 10 ++++--- pinecone/data/request_factory.py | 30 ++++++++++++++++--- .../data/types/search_query_typed_dict.py | 2 +- 5 files changed, 40 insertions(+), 29 deletions(-) diff --git a/pinecone/control/pinecone.py b/pinecone/control/pinecone.py index 5bf594e01..6290b4f11 100644 --- a/pinecone/control/pinecone.py +++ b/pinecone/control/pinecone.py @@ -1,6 +1,7 @@ import time import logging from typing import Optional, Dict, Any, Union +from enum import Enum from .index_host_store import IndexHostStore from .pinecone_interface import PineconeDBControlInterface @@ -288,7 +289,10 @@ 
def create_index_for_model( raise ValueError(f"{field} is required in embed") parsed_embed = {} for key, value in embed.items(): - parsed_embed[key] = convert_enum_to_string(value) + if isinstance(value, Enum): + parsed_embed[key] = convert_enum_to_string(value) + else: + parsed_embed[key] = value args = parse_non_empty_args( [ diff --git a/pinecone/data/index.py b/pinecone/data/index.py index bb57df262..c896a9dda 100644 --- a/pinecone/data/index.py +++ b/pinecone/data/index.py @@ -14,7 +14,6 @@ IndexDescription as DescribeIndexStatsResponse, UpsertResponse, ListResponse, - UpsertRecord, SearchRecordsResponse, ) from .dataclasses import Vector, SparseValues, FetchResponse, SearchQuery, SearchRerank @@ -206,24 +205,8 @@ def upsert_from_dataframe( return UpsertResponse(upserted_count=upserted_count) def upsert_records(self, namespace: str, records: List[Dict]): - if namespace is None: - raise ValueError("namespace is required when upserting records") - if not records or len(records) == 0: - raise ValueError("No records provided") - - records_to_upsert = [] - for record in records: - if not record.get("_id") and not record.get("id"): - raise ValueError("Each record must have an '_id' or 'id' value") - - records_to_upsert.append( - UpsertRecord( - record.get("_id", record.get("id")), - **{k: v for k, v in record.items() if k not in {"_id", "id"}}, - ) - ) - - self._vector_api.upsert_records_namespace(namespace, records_to_upsert) + args = IndexRequestFactory.upsert_records_args(namespace=namespace, records=records) + self._vector_api.upsert_records_namespace(**args) def search( self, diff --git a/pinecone/data/interfaces.py b/pinecone/data/interfaces.py index 2930e2754..fa837add8 100644 --- a/pinecone/data/interfaces.py +++ b/pinecone/data/interfaces.py @@ -20,6 +20,8 @@ FilterTypedDict, VectorTuple, VectorTupleWithMetadata, + SearchQueryTypedDict, + SearchRerankTypedDict, ) from .dataclasses import SearchQuery, SearchRerank @@ -122,8 +124,8 @@ def upsert_records(self, 
namespace: str, records: List[Dict]): def search( self, namespace: str, - query: Union[Dict, SearchQuery], - rerank: Optional[Union[Dict, SearchRerank]] = None, + query: Union[SearchQueryTypedDict, SearchQuery], + rerank: Optional[Union[SearchRerankTypedDict, SearchRerank]] = None, fields: Optional[List[str]] = ["*"], # Default to returning all fields ) -> SearchRecordsResponse: """ @@ -147,8 +149,8 @@ def search( def search_records( self, namespace: str, - query: Union[Dict, SearchQuery], - rerank: Optional[Union[Dict, SearchRerank]] = None, + query: Union[SearchQueryTypedDict, SearchQuery], + rerank: Optional[Union[SearchRerankTypedDict, SearchRerank]] = None, fields: Optional[List[str]] = ["*"], # Default to returning all fields ) -> SearchRecordsResponse: """Alias of the search() method.""" diff --git a/pinecone/data/request_factory.py b/pinecone/data/request_factory.py index 270963928..3cfe9455c 100644 --- a/pinecone/data/request_factory.py +++ b/pinecone/data/request_factory.py @@ -1,5 +1,5 @@ import logging -from typing import Union, List, Optional, Dict, Any +from typing import Union, List, Optional, Dict, Any, cast from pinecone.core.openapi.db_data.models import ( QueryRequest, @@ -12,6 +12,7 @@ SearchRecordsRequestRerank, VectorValues, SearchRecordsVector, + UpsertRecord, ) from ..utils import parse_non_empty_args, convert_enum_to_string from .vector_factory import VectorFactory @@ -183,7 +184,7 @@ def _parse_search_query( if isinstance(query, SearchQuery): query_dict = query.as_dict() else: - query_dict = query + query_dict = cast(dict[str, Any], query) required_fields = {"top_k"} for key in required_fields: @@ -213,7 +214,7 @@ def _parse_search_vector( return None vector_dict = vector.as_dict() else: - vector_dict = vector + vector_dict = cast(dict[str, Any], vector) if ( vector_dict.get("values", None) is None and vector_dict.get("sparse_values", None) is None @@ -236,7 +237,7 @@ def _parse_search_rerank(rerank: Optional[Union[SearchRerankTypedDict, 
SearchRer if isinstance(rerank, SearchRerank): rerank_dict = rerank.as_dict() else: - rerank_dict = rerank + rerank_dict = cast(dict[str, Any], rerank) required_fields = {"model", "rank_fields"} for key in required_fields: @@ -246,3 +247,24 @@ def _parse_search_rerank(rerank: Optional[Union[SearchRerankTypedDict, SearchRer rerank_dict["model"] = convert_enum_to_string(rerank_dict["model"]) return SearchRecordsRequestRerank(**rerank_dict) + + @staticmethod + def upsert_records_args(namespace: str, records: List[Dict]): + if namespace is None: + raise ValueError("namespace is required when upserting records") + if not records or len(records) == 0: + raise ValueError("No records provided") + + records_to_upsert = [] + for record in records: + if not record.get("_id") and not record.get("id"): + raise ValueError("Each record must have an '_id' or 'id' value") + + records_to_upsert.append( + UpsertRecord( + record.get("_id", record.get("id")), + **{k: v for k, v in record.items() if k not in {"_id", "id"}}, + ) + ) + + return {"namespace": namespace, "upsert_record": records_to_upsert} diff --git a/pinecone/data/types/search_query_typed_dict.py b/pinecone/data/types/search_query_typed_dict.py index 118607ae0..c21ba1202 100644 --- a/pinecone/data/types/search_query_typed_dict.py +++ b/pinecone/data/types/search_query_typed_dict.py @@ -19,7 +19,7 @@ class SearchQueryTypedDict(TypedDict): Required. """ - filter: Optional[Dict[str, Any]] = None + filter: Optional[Dict[str, Any]] """ The filter to apply to the search. Optional. 
From 1840eaa1ddd9fd027e8a9cdb0ac232e2b01411f2 Mon Sep 17 00:00:00 2001 From: Jen Hamon Date: Thu, 30 Jan 2025 09:46:57 -0500 Subject: [PATCH 4/5] Add asyncio implementation --- pinecone/data/index.py | 2 +- pinecone/data/index_asyncio.py | 32 ++- pinecone/data/interfaces.py | 50 +++++ pinecone/openapi_support/rest_aiohttp.py | 27 ++- tests/integration/data_asyncio/conftest.py | 39 +++- .../test_search_and_upsert_records.py | 200 ++++++++++++++++++ 6 files changed, 338 insertions(+), 12 deletions(-) create mode 100644 tests/integration/data_asyncio/test_search_and_upsert_records.py diff --git a/pinecone/data/index.py b/pinecone/data/index.py index c896a9dda..f3841c37a 100644 --- a/pinecone/data/index.py +++ b/pinecone/data/index.py @@ -215,7 +215,7 @@ def search( rerank: Optional[Union[SearchRerankTypedDict, SearchRerank]] = None, fields: Optional[List[str]] = ["*"], # Default to returning all fields ) -> SearchRecordsResponse: - if not namespace: + if namespace is None: raise Exception("Namespace is required when searching records") request = IndexRequestFactory.search_request(query=query, rerank=rerank, fields=fields) diff --git a/pinecone/data/index_asyncio.py b/pinecone/data/index_asyncio.py index 5b3c73eb0..47204e18d 100644 --- a/pinecone/data/index_asyncio.py +++ b/pinecone/data/index_asyncio.py @@ -20,6 +20,7 @@ UpsertResponse, DeleteRequest, ListResponse, + SearchRecordsResponse, ) from ..utils import ( @@ -35,8 +36,10 @@ VectorTuple, VectorTupleWithMetadata, FilterTypedDict, + SearchQueryTypedDict, + SearchRerankTypedDict, ) -from .dataclasses import Vector, SparseValues, FetchResponse +from .dataclasses import Vector, SparseValues, FetchResponse, SearchQuery, SearchRerank from pinecone.openapi_support import OPENAPI_ENDPOINT_PARAMS from .index import IndexRequestFactory @@ -400,5 +403,32 @@ async def list(self, **kwargs): else: done = True + async def upsert_records(self, namespace: str, records: List[Dict]): + args = 
IndexRequestFactory.upsert_records_args(namespace=namespace, records=records) + await self._vector_api.upsert_records_namespace(**args) + + async def search( + self, + namespace: str, + query: Union[SearchQueryTypedDict, SearchQuery], + rerank: Optional[Union[SearchRerankTypedDict, SearchRerank]] = None, + fields: Optional[List[str]] = ["*"], # Default to returning all fields + ) -> SearchRecordsResponse: + if namespace is None: + raise Exception("Namespace is required when searching records") + + request = IndexRequestFactory.search_request(query=query, rerank=rerank, fields=fields) + + return await self._vector_api.search_records_namespace(namespace, request) + + async def search_records( + self, + namespace: str, + query: Union[SearchQueryTypedDict, SearchQuery], + rerank: Optional[Union[SearchRerankTypedDict, SearchRerank]] = None, + fields: Optional[List[str]] = ["*"], # Default to returning all fields + ) -> SearchRecordsResponse: + return await self.search(namespace, query=query, rerank=rerank, fields=fields) + def _openapi_kwargs(self, kwargs): + return {k: v for k, v in kwargs.items() if k in OPENAPI_ENDPOINT_PARAMS} diff --git a/pinecone/data/interfaces.py b/pinecone/data/interfaces.py index fa837add8..9ebe4ad80 100644 --- a/pinecone/data/interfaces.py +++ b/pinecone/data/interfaces.py @@ -856,3 +856,53 @@ async def list(self, **kwargs): namespace (Optional[str]): The namespace to fetch vectors from. If not specified, the default namespace is used. [optional] """ pass + + @abstractmethod + async def upsert_records(self, namespace: str, records: List[Dict]): + """ + Upsert records to a namespace. + + Converts records into embeddings and upserts them into a namespace in the index. + + :param namespace: The namespace of the index to upsert records to. + :type namespace: str, required + :param records: The records to upsert into the index.
+ :type records: List[Dict], required + """ + pass + + @abstractmethod + async def search( + self, + namespace: str, + query: Union[SearchQueryTypedDict, SearchQuery], + rerank: Optional[Union[SearchRerankTypedDict, SearchRerank]] = None, + fields: Optional[List[str]] = ["*"], # Default to returning all fields + ) -> SearchRecordsResponse: + """ + Search for records. + + This operation converts a query to a vector embedding and then searches a namespace. You + can optionally provide a reranking operation as part of the search. + + :param namespace: The namespace in the index to search. + :type namespace: str, required + :param query: The SearchQuery to use for the search. + :type query: Union[Dict, SearchQuery], required + :param rerank: The SearchRerank to use with the search request. + :type rerank: Union[Dict, SearchRerank], optional + :return: The records that match the search. + :rtype: RecordModel + """ + pass + + @abstractmethod + async def search_records( + self, + namespace: str, + query: Union[SearchQueryTypedDict, SearchQuery], + rerank: Optional[Union[SearchRerankTypedDict, SearchRerank]] = None, + fields: Optional[List[str]] = ["*"], # Default to returning all fields + ) -> SearchRecordsResponse: + """Alias of the search() method.""" + pass diff --git a/pinecone/openapi_support/rest_aiohttp.py b/pinecone/openapi_support/rest_aiohttp.py index e0e2e40a6..902ca9769 100644 --- a/pinecone/openapi_support/rest_aiohttp.py +++ b/pinecone/openapi_support/rest_aiohttp.py @@ -1,5 +1,6 @@ import weakref import asyncio +import json from .rest_utils import RestClientInterface, RESTResponse, raise_exceptions_or_return @@ -43,13 +44,25 @@ async def request( if method in ["POST", "PUT", "PATCH", "OPTIONS"] and ("Content-Type" not in headers): headers["Content-Type"] = "application/json" - async with self._session.request( - method, url, params=query_params, headers=headers, json=body - ) as resp: - content = await resp.read() - return raise_exceptions_or_return( - 
RESTResponse(resp.status, content, resp.headers, resp.reason) - ) + if "application/x-ndjson" in headers.get("Content-Type", "").lower(): + ndjson_data = "\n".join(json.dumps(record) for record in body) + + async with self._session.request( + method, url, params=query_params, headers=headers, data=ndjson_data + ) as resp: + content = await resp.read() + return raise_exceptions_or_return( + RESTResponse(resp.status, content, resp.headers, resp.reason) + ) + + else: + async with self._session.request( + method, url, params=query_params, headers=headers, json=body + ) as resp: + content = await resp.read() + return raise_exceptions_or_return( + RESTResponse(resp.status, content, resp.headers, resp.reason) + ) async def GET( self, url, headers=None, query_params=None, _preload_content=True, _request_timeout=None diff --git a/tests/integration/data_asyncio/conftest.py b/tests/integration/data_asyncio/conftest.py index 09d796150..fd4c0657d 100644 --- a/tests/integration/data_asyncio/conftest.py +++ b/tests/integration/data_asyncio/conftest.py @@ -7,6 +7,8 @@ import logging from typing import Callable, Optional, Awaitable, Union +from pinecone import CloudProvider, AwsRegion, IndexEmbed, EmbedModel + logger = logging.getLogger(__name__) @@ -69,6 +71,11 @@ def sparse_index_name(): return generate_index_name("sparse") +@pytest.fixture(scope="session") +def model_index_name(): + return generate_index_name("embed") + + def build_asyncioindex_client(client, index_host) -> _AsyncioIndex: return client.AsyncioIndex(host=index_host) @@ -107,20 +114,46 @@ def sparse_index_host(sparse_index_name, spec): pc = build_client() if sparse_index_name not in pc.list_indexes().names(): - logger.info("Creating sparse index with name: " + sparse_index_name) + logger.info(f"Creating index with name {sparse_index_name}") pc.create_index( name=sparse_index_name, metric="dotproduct", spec=spec, vector_type="sparse" ) else: - logger.info("Sparse index with name " + sparse_index_name + " already 
exists") + logger.info(f"Index with name {sparse_index_name} already exists") description = pc.describe_index(name=sparse_index_name) yield description.host - logger.info("Deleting sparse index with name: " + sparse_index_name) + logger.info(f"Deleting index with name {sparse_index_name}") pc.delete_index(sparse_index_name, -1) +@pytest.fixture(scope="session") +def model_index_host(model_index_name): + pc = build_client() + + if model_index_name not in pc.list_indexes().names(): + logger.info(f"Creating index {model_index_name}") + pc.create_index_for_model( + name=model_index_name, + cloud=CloudProvider.AWS, + region=AwsRegion.US_WEST_2, + embed=IndexEmbed( + model=EmbedModel.Multilingual_E5_Large, + field_map={"text": "my_text_field"}, + metric="cosine", + ), + ) + else: + logger.info(f"Index {model_index_name} already exists") + + description = pc.describe_index(name=model_index_name) + yield description.host + + logger.info(f"Deleting index {model_index_name}") + pc.delete_index(model_index_name, -1) + + async def poll_for_freshness(asyncio_idx, target_namespace, target_vector_count): max_wait_time = 60 * 3 # 3 minutes time_waited = 0 diff --git a/tests/integration/data_asyncio/test_search_and_upsert_records.py b/tests/integration/data_asyncio/test_search_and_upsert_records.py new file mode 100644 index 000000000..a9f69f382 --- /dev/null +++ b/tests/integration/data_asyncio/test_search_and_upsert_records.py @@ -0,0 +1,200 @@ +import pytest +import logging +from ..helpers import random_string, embedding_values +from .conftest import build_asyncioindex_client, poll_for_freshness + +from pinecone import RerankModel, PineconeApiException + +logger = logging.getLogger(__name__) + +model_index_dimension = 1024 # Currently controlled by "multilingual-e5-large" + + +@pytest.fixture +def records_to_upsert(): + return [ + { + "id": "test1", + "my_text_field": "Apple is a popular fruit known for its sweetness and crisp texture.", + "more_stuff": "more stuff!", + }, + { + 
"id": "test2", + "my_text_field": "The tech company Apple is known for its innovative products like the iPhone.", + "more_stuff": "more stuff!", + }, + { + "id": "test3", + "my_text_field": "Many people enjoy eating apples as a healthy snack.", + "more_stuff": "more stuff!", + }, + { + "id": "test4", + "my_text_field": "Apple Inc. has revolutionized the tech industry with its sleek designs and user-friendly interfaces.", + "more_stuff": "more stuff!", + }, + { + "id": "test5", + "my_text_field": "An apple a day keeps the doctor away, as the saying goes.", + "more_stuff": "more stuff!", + }, + { + "id": "test6", + "my_text_field": "Apple Computer Company was founded on April 1, 1976, by Steve Jobs, Steve Wozniak, and Ronald Wayne as a partnership.", + "more_stuff": "extra more stuff!", + }, + ] + + +@pytest.mark.asyncio +class TestUpsertAndSearchRecords: + async def test_search_records(self, pc, model_index_host, records_to_upsert): + model_idx = build_asyncioindex_client(pc, model_index_host) + + target_namespace = random_string(10) + await model_idx.upsert_records(namespace=target_namespace, records=records_to_upsert) + + await poll_for_freshness(model_idx, target_namespace, len(records_to_upsert)) + + response = await model_idx.search_records( + namespace=target_namespace, query={"inputs": {"text": "Apple corporation"}, "top_k": 3} + ) + assert len(response.result.hits) == 3 + assert response.usage is not None + + # Test search alias + response2 = await model_idx.search( + namespace=target_namespace, query={"inputs": {"text": "Apple corporation"}, "top_k": 3} + ) + assert response == response2 + + # validate similar records and contents + similar_record_ids = ["test2", "test4", "test6"] + + for hit in response.result.hits: + assert hit._id in similar_record_ids + similar_record_ids.remove(hit._id) + + assert hit._score > 0 + assert hit.fields.get("my_text_field") is not None + assert hit.fields.get("more_stuff") is not None + + # search for records while 
filtering fields + response_filtered = await model_idx.search_records( + namespace="test-namespace", + query={"inputs": {"text": "Apple corporation"}, "top_k": 3}, + fields=["more_stuff"], + ) + + for hit in response_filtered.result.hits: + assert hit.fields.get("my_text_field") is None + assert hit.fields.get("more_stuff") is not None + + async def test_search_records_with_vector(self, pc, model_index_host, records_to_upsert): + model_idx = build_asyncioindex_client(pc, model_index_host) + + target_namespace = random_string(10) + await model_idx.upsert_records(namespace=target_namespace, records=records_to_upsert) + + await poll_for_freshness(model_idx, target_namespace, len(records_to_upsert)) + + # Search for similar records + search_query = {"top_k": 3, "vector": {"values": embedding_values(model_index_dimension)}} + response = await model_idx.search_records(namespace=target_namespace, query=search_query) + assert len(response.result.hits) == 3 + assert response.usage is not None + + # Test search alias + response2 = await model_idx.search(namespace=target_namespace, query=search_query) + assert response == response2 + + @pytest.mark.parametrize("rerank_model", ["bge-reranker-v2-m3", RerankModel.Bge_Reranker_V2_M3]) + async def test_search_with_rerank(self, pc, model_index_host, records_to_upsert, rerank_model): + model_idx = build_asyncioindex_client(pc, model_index_host) + target_namespace = random_string(10) + await model_idx.upsert_records(namespace=target_namespace, records=records_to_upsert) + + await poll_for_freshness(model_idx, target_namespace, len(records_to_upsert)) + + # Search for similar records + response = await model_idx.search_records( + namespace=target_namespace, + query={"inputs": {"text": "Apple corporation"}, "top_k": 3}, + rerank={"model": rerank_model, "rank_fields": ["my_text_field"], "top_n": 3}, + ) + assert len(response.result.hits) == 3 + assert response.usage is not None + + # validate similar records and contents + 
similar_record_ids = ["test6", "test4", "test2"] + for hit in response.result.hits: + assert hit._id in similar_record_ids + similar_record_ids.remove(hit._id) + + assert hit._score > 0 + assert hit.fields.get("my_text_field") is not None + assert hit.fields.get("more_stuff") is not None + + async def test_search_with_rerank_query(self, pc, model_index_host, records_to_upsert): + model_idx = build_asyncioindex_client(pc, model_index_host) + target_namespace = random_string(10) + await model_idx.upsert_records(namespace=target_namespace, records=records_to_upsert) + + # Sleep for freshness + await poll_for_freshness(model_idx, target_namespace, len(records_to_upsert)) + + # Search for similar records + response = await model_idx.search_records( + namespace=target_namespace, + query={"inputs": {"text": "Apple corporation"}, "top_k": 3}, + rerank={ + "model": "bge-reranker-v2-m3", + "rank_fields": ["my_text_field"], + "top_n": 3, + "query": "Apple corporation", + }, + ) + assert len(response.result.hits) == 3 + assert response.usage is not None + + +@pytest.mark.asyncio +class TestUpsertAndSearchRecordsErrorCases: + async def test_search_with_rerank_nonexistent_model_error( + self, pc, model_index_host, records_to_upsert + ): + model_idx = build_asyncioindex_client(pc, model_index_host) + target_namespace = random_string(10) + await model_idx.upsert_records(namespace=target_namespace, records=records_to_upsert) + + await poll_for_freshness(model_idx, target_namespace, len(records_to_upsert)) + + with pytest.raises(PineconeApiException, match=r"Model 'non-existent-model' not found"): + await model_idx.search_records( + namespace=target_namespace, + query={"inputs": {"text": "Apple corporation"}, "top_k": 3}, + rerank={ + "model": "non-existent-model", + "rank_fields": ["my_text_field"], + "top_n": 3, + }, + ) + + @pytest.mark.skip(reason="Possible bug in the API") + async def test_search_with_rerank_empty_rank_fields_error( + self, pc, model_index_host, 
records_to_upsert + ): + model_idx = build_asyncioindex_client(pc, model_index_host) + target_namespace = random_string(10) + await model_idx.upsert_records(namespace=target_namespace, records=records_to_upsert) + + await poll_for_freshness(model_idx, target_namespace, len(records_to_upsert)) + + with pytest.raises( + PineconeApiException, match=r"Only one rank field is supported for model" + ): + await model_idx.search_records( + namespace="test-namespace", + query={"inputs": {"text": "Apple corporation"}, "top_k": 3}, + rerank={"model": "bge-reranker-v2-m3", "rank_fields": [], "top_n": 3}, + ) From 906b0ae97e79135ed2eb0c3b262cf0aed4495fc2 Mon Sep 17 00:00:00 2001 From: Jen Hamon Date: Thu, 30 Jan 2025 10:07:08 -0500 Subject: [PATCH 5/5] Skip search tests on grpc run --- tests/integration/data/test_search_and_upsert_records.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/integration/data/test_search_and_upsert_records.py b/tests/integration/data/test_search_and_upsert_records.py index 0b0f6444c..e83a5cd88 100644 --- a/tests/integration/data/test_search_and_upsert_records.py +++ b/tests/integration/data/test_search_and_upsert_records.py @@ -3,6 +3,7 @@ from typing import List from ..helpers import random_string, embedding_values import logging +import os from pinecone import RerankModel, PineconeApiException from pinecone.data import _Index @@ -69,6 +70,9 @@ def records_to_upsert(): ] +@pytest.mark.skipif( + os.getenv("USE_GRPC") != "false", reason="These actions are not supported in gRPC" +) class TestUpsertAndSearchRecords: def test_search_records(self, model_idx, records_to_upsert): target_namespace = random_string(10) @@ -182,6 +186,9 @@ def test_search_with_rerank_query(self, model_idx, records_to_upsert): assert response.usage is not None +@pytest.mark.skipif( + os.getenv("USE_GRPC") != "false", reason="These actions are not supported in gRPC" +) class TestUpsertAndSearchRecordsErrorCases: def 
test_search_with_rerank_nonexistent_model_error(self, model_idx, records_to_upsert): target_namespace = random_string(10)