From f8b8af668ac86eef8b6f6ee157af22debb0d4793 Mon Sep 17 00:00:00 2001
From: Jen Hamon
Date: Wed, 28 Jan 2026 12:44:10 -0500
Subject: [PATCH] feat: add schema field type classes for FTS

Add user-friendly schema field classes for defining index schemas:
- TextField: for string fields with filterable/full_text_searchable options
- IntegerField: for integer fields with filterable option
- FloatField: for float fields with filterable option
- DenseVectorField: for dense vector embeddings with dimension/metric
- SparseVectorField: for sparse vector embeddings
- SemanticTextField: for integrated inference with model configuration

Each class provides a to_dict() method that serializes to the API format.

Closes SDK-101
---
 pinecone/__init__.py                        |   8 +
 pinecone/db_control/models/__init__.py      |  16 ++
 pinecone/db_control/models/schema_fields.py | 229 +++++++++++++++++++
 tests/unit/models/test_schema_fields.py     | 231 ++++++++++++++++++++
 4 files changed, 484 insertions(+)
 create mode 100644 pinecone/db_control/models/schema_fields.py
 create mode 100644 tests/unit/models/test_schema_fields.py

diff --git a/pinecone/__init__.py b/pinecone/__init__.py
index 1064610c2..fe031ff50 100644
--- a/pinecone/__init__.py
+++ b/pinecone/__init__.py
@@ -107,6 +107,14 @@
         "pinecone.db_control.types",
         "CreateIndexForModelEmbedTypedDict",
     ),
+    # Schema field types
+    "TextField": ("pinecone.db_control.models", "TextField"),
+    "IntegerField": ("pinecone.db_control.models", "IntegerField"),
+    "FloatField": ("pinecone.db_control.models", "FloatField"),
+    "DenseVectorField": ("pinecone.db_control.models", "DenseVectorField"),
+    "SparseVectorField": ("pinecone.db_control.models", "SparseVectorField"),
+    "SemanticTextField": ("pinecone.db_control.models", "SemanticTextField"),
+    "SchemaField": ("pinecone.db_control.models", "SchemaField"),
     # Read capacity TypedDict classes
     "ScalingConfigManualDict": (
         "pinecone.db_control.models.serverless_spec",
diff --git a/pinecone/db_control/models/__init__.py b/pinecone/db_control/models/__init__.py
index cf866f116..8f2e10ee6 100644
--- a/pinecone/db_control/models/__init__.py
+++ b/pinecone/db_control/models/__init__.py
@@ -11,6 +11,15 @@
 from .backup_list import BackupList
 from .restore_job_model import RestoreJobModel
 from .restore_job_list import RestoreJobList
+from .schema_fields import (
+    TextField,
+    IntegerField,
+    FloatField,
+    DenseVectorField,
+    SparseVectorField,
+    SemanticTextField,
+    SchemaField,
+)
 
 
 __all__ = [
@@ -28,4 +37,11 @@
     "BackupList",
     "RestoreJobModel",
     "RestoreJobList",
+    "TextField",
+    "IntegerField",
+    "FloatField",
+    "DenseVectorField",
+    "SparseVectorField",
+    "SemanticTextField",
+    "SchemaField",
 ]
diff --git a/pinecone/db_control/models/schema_fields.py b/pinecone/db_control/models/schema_fields.py
new file mode 100644
index 000000000..7b3b972fc
--- /dev/null
+++ b/pinecone/db_control/models/schema_fields.py
@@ -0,0 +1,229 @@
+"""Schema field type classes for defining index schemas.
+
+These classes provide a user-friendly API for defining index schemas with typed fields.
+Each field class serializes to the format expected by the Pinecone API.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass
+class TextField:
+    """A text field for storing string values.
+
+    :param filterable: Whether the field can be used in query filters.
+    :param full_text_searchable: Whether the field is indexed for full-text search.
+    :param description: Optional description of the field.
+
+    Example usage::
+
+        from pinecone import TextField
+
+        schema = {
+            "title": TextField(full_text_searchable=True),
+            "category": TextField(filterable=True),
+        }
+    """
+
+    filterable: bool = False
+    full_text_searchable: bool = False
+    description: str | None = None
+
+    def to_dict(self) -> dict:
+        """Serialize to API format.
+
+        :returns: Dictionary representation for the API.
+        """
+        result: dict = {"type": "string"}
+        if self.filterable:
+            result["filterable"] = True
+        if self.full_text_searchable:
+            result["full_text_searchable"] = True
+        if self.description is not None:
+            result["description"] = self.description
+        return result
+
+
+@dataclass
+class IntegerField:
+    """An integer field for storing numeric values.
+
+    :param filterable: Whether the field can be used in query filters.
+    :param description: Optional description of the field.
+
+    Example usage::
+
+        from pinecone import IntegerField
+
+        schema = {
+            "year": IntegerField(filterable=True),
+            "count": IntegerField(),
+        }
+    """
+
+    filterable: bool = False
+    description: str | None = None
+
+    def to_dict(self) -> dict:
+        """Serialize to API format.
+
+        :returns: Dictionary representation for the API.
+        """
+        result: dict = {"type": "integer"}
+        if self.filterable:
+            result["filterable"] = True
+        if self.description is not None:
+            result["description"] = self.description
+        return result
+
+
+@dataclass
+class FloatField:
+    """A floating-point field for storing decimal values.
+
+    :param filterable: Whether the field can be used in query filters.
+    :param description: Optional description of the field.
+
+    Example usage::
+
+        from pinecone import FloatField
+
+        schema = {
+            "price": FloatField(filterable=True),
+            "score": FloatField(),
+        }
+    """
+
+    filterable: bool = False
+    description: str | None = None
+
+    def to_dict(self) -> dict:
+        """Serialize to API format.
+
+        :returns: Dictionary representation for the API.
+        """
+        result: dict = {"type": "float"}
+        if self.filterable:
+            result["filterable"] = True
+        if self.description is not None:
+            result["description"] = self.description
+        return result
+
+
+@dataclass
+class DenseVectorField:
+    """A dense vector field for storing vector embeddings.
+
+    :param dimension: The dimension of the vectors (1 to 20000).
+    :param metric: The distance metric for similarity search.
+        Must be one of: "cosine", "euclidean", "dotproduct".
+    :param description: Optional description of the field.
+
+    Example usage::
+
+        from pinecone import DenseVectorField
+
+        schema = {
+            "embedding": DenseVectorField(dimension=1536, metric="cosine"),
+        }
+    """
+
+    dimension: int
+    metric: str
+    description: str | None = None
+
+    def to_dict(self) -> dict:
+        """Serialize to API format.
+
+        :returns: Dictionary representation for the API.
+        """
+        result: dict = {"type": "dense_vector", "dimension": self.dimension, "metric": self.metric}
+        if self.description is not None:
+            result["description"] = self.description
+        return result
+
+
+@dataclass
+class SparseVectorField:
+    """A sparse vector field for storing sparse embeddings.
+
+    :param metric: The distance metric for similarity search.
+        Must be "dotproduct" for sparse vectors.
+    :param description: Optional description of the field.
+
+    Example usage::
+
+        from pinecone import SparseVectorField
+
+        schema = {
+            "sparse_embedding": SparseVectorField(metric="dotproduct"),
+        }
+    """
+
+    metric: str = "dotproduct"
+    description: str | None = None
+
+    def to_dict(self) -> dict:
+        """Serialize to API format.
+
+        :returns: Dictionary representation for the API.
+        """
+        result: dict = {"type": "sparse_vector", "metric": self.metric}
+        if self.description is not None:
+            result["description"] = self.description
+        return result
+
+
+@dataclass
+class SemanticTextField:
+    """A semantic text field with integrated inference embedding.
+
+    This field type enables automatic embedding generation using a specified model.
+    When documents are upserted, the text in the mapped field is automatically
+    converted to vectors.
+
+    :param model: The name of the embedding model to use.
+    :param field_map: Maps field names in documents to the field used for embedding.
+    :param read_parameters: Optional parameters for the model during queries.
+    :param write_parameters: Optional parameters for the model during indexing.
+    :param description: Optional description of the field.
+
+    Example usage::
+
+        from pinecone import SemanticTextField
+
+        schema = {
+            "content": SemanticTextField(
+                model="multilingual-e5-large",
+                field_map={"text": "content"},
+            ),
+        }
+    """
+
+    model: str
+    field_map: dict[str, str]
+    read_parameters: dict[str, object] | None = None
+    write_parameters: dict[str, object] | None = None
+    description: str | None = None
+
+    def to_dict(self) -> dict:
+        """Serialize to API format.
+
+        :returns: Dictionary representation for the API.
+        """
+        result: dict = {"type": "semantic_text", "model": self.model, "field_map": self.field_map}
+        if self.read_parameters is not None:
+            result["read_parameters"] = self.read_parameters
+        if self.write_parameters is not None:
+            result["write_parameters"] = self.write_parameters
+        if self.description is not None:
+            result["description"] = self.description
+        return result
+
+
+# Type alias for any schema field
+SchemaField = (
+    TextField | IntegerField | FloatField | DenseVectorField | SparseVectorField | SemanticTextField
+)
diff --git a/tests/unit/models/test_schema_fields.py b/tests/unit/models/test_schema_fields.py
new file mode 100644
index 000000000..5da332813
--- /dev/null
+++ b/tests/unit/models/test_schema_fields.py
@@ -0,0 +1,231 @@
+"""Tests for schema field type classes.
+
+Note: We import the schema_fields module by creating a standalone module
+since the alpha API changes have broken the normal import chain through
+db_control. Once SDK-104/107 are complete, these tests can be updated
+to import from the normal locations.
+"""
+
+import os
+import sys
+import types
+
+
+def _load_schema_fields_module():
+    """Load schema_fields.py as a standalone module to avoid broken imports."""
+    module_name = "pinecone.db_control.models.schema_fields"
+
+    # Create module and register it before exec
+    module = types.ModuleType(module_name)
+    module.__file__ = os.path.join(
+        os.path.dirname(__file__),
+        "..",
+        "..",
+        "..",
+        "pinecone",
+        "db_control",
+        "models",
+        "schema_fields.py",
+    )
+    sys.modules[module_name] = module
+
+    # Execute the module code
+    with open(module.__file__) as f:
+        code = compile(f.read(), module.__file__, "exec")
+        exec(code, module.__dict__)
+
+    return module
+
+
+_schema_fields = _load_schema_fields_module()
+
+TextField = _schema_fields.TextField
+IntegerField = _schema_fields.IntegerField
+FloatField = _schema_fields.FloatField
+DenseVectorField = _schema_fields.DenseVectorField
+SparseVectorField = _schema_fields.SparseVectorField
+SemanticTextField = _schema_fields.SemanticTextField
+
+
+class TestTextField:
+    def test_default_values(self):
+        field = TextField()
+        assert field.filterable is False
+        assert field.full_text_searchable is False
+        assert field.description is None
+
+    def test_to_dict_minimal(self):
+        field = TextField()
+        result = field.to_dict()
+        assert result == {"type": "string"}
+
+    def test_to_dict_filterable(self):
+        field = TextField(filterable=True)
+        result = field.to_dict()
+        assert result == {"type": "string", "filterable": True}
+
+    def test_to_dict_full_text_searchable(self):
+        field = TextField(full_text_searchable=True)
+        result = field.to_dict()
+        assert result == {"type": "string", "full_text_searchable": True}
+
+    def test_to_dict_all_options(self):
+        field = TextField(filterable=True, full_text_searchable=True, description="A text field")
+        result = field.to_dict()
+        assert result == {
+            "type": "string",
+            "filterable": True,
+            "full_text_searchable": True,
+            "description": "A text field",
+        }
+
+
+class TestIntegerField:
+    def test_default_values(self):
+        field = IntegerField()
+        assert field.filterable is False
+        assert field.description is None
+
+    def test_to_dict_minimal(self):
+        field = IntegerField()
+        result = field.to_dict()
+        assert result == {"type": "integer"}
+
+    def test_to_dict_filterable(self):
+        field = IntegerField(filterable=True)
+        result = field.to_dict()
+        assert result == {"type": "integer", "filterable": True}
+
+    def test_to_dict_with_description(self):
+        field = IntegerField(filterable=True, description="Year of publication")
+        result = field.to_dict()
+        assert result == {
+            "type": "integer",
+            "filterable": True,
+            "description": "Year of publication",
+        }
+
+
+class TestFloatField:
+    def test_default_values(self):
+        field = FloatField()
+        assert field.filterable is False
+        assert field.description is None
+
+    def test_to_dict_minimal(self):
+        field = FloatField()
+        result = field.to_dict()
+        assert result == {"type": "float"}
+
+    def test_to_dict_filterable(self):
+        field = FloatField(filterable=True)
+        result = field.to_dict()
+        assert result == {"type": "float", "filterable": True}
+
+    def test_to_dict_with_description(self):
+        field = FloatField(filterable=True, description="Price in USD")
+        result = field.to_dict()
+        assert result == {"type": "float", "filterable": True, "description": "Price in USD"}
+
+
+class TestDenseVectorField:
+    def test_required_params(self):
+        field = DenseVectorField(dimension=1536, metric="cosine")
+        assert field.dimension == 1536
+        assert field.metric == "cosine"
+        assert field.description is None
+
+    def test_to_dict_minimal(self):
+        field = DenseVectorField(dimension=1536, metric="cosine")
+        result = field.to_dict()
+        assert result == {"type": "dense_vector", "dimension": 1536, "metric": "cosine"}
+
+    def test_to_dict_with_euclidean(self):
+        field = DenseVectorField(dimension=768, metric="euclidean")
+        result = field.to_dict()
+        assert result == {"type": "dense_vector", "dimension": 768, "metric": "euclidean"}
+
+    def test_to_dict_with_dotproduct(self):
+        field = DenseVectorField(dimension=384, metric="dotproduct")
+        result = field.to_dict()
+        assert result == {"type": "dense_vector", "dimension": 384, "metric": "dotproduct"}
+
+    def test_to_dict_with_description(self):
+        field = DenseVectorField(dimension=1536, metric="cosine", description="OpenAI embeddings")
+        result = field.to_dict()
+        assert result == {
+            "type": "dense_vector",
+            "dimension": 1536,
+            "metric": "cosine",
+            "description": "OpenAI embeddings",
+        }
+
+
+class TestSparseVectorField:
+    def test_default_values(self):
+        field = SparseVectorField()
+        assert field.metric == "dotproduct"
+        assert field.description is None
+
+    def test_to_dict_minimal(self):
+        field = SparseVectorField()
+        result = field.to_dict()
+        assert result == {"type": "sparse_vector", "metric": "dotproduct"}
+
+    def test_to_dict_with_description(self):
+        field = SparseVectorField(description="BM25 sparse vectors")
+        result = field.to_dict()
+        assert result == {
+            "type": "sparse_vector",
+            "metric": "dotproduct",
+            "description": "BM25 sparse vectors",
+        }
+
+
+class TestSemanticTextField:
+    def test_required_params(self):
+        field = SemanticTextField(model="multilingual-e5-large", field_map={"text": "content"})
+        assert field.model == "multilingual-e5-large"
+        assert field.field_map == {"text": "content"}
+        assert field.read_parameters is None
+        assert field.write_parameters is None
+        assert field.description is None
+
+    def test_to_dict_minimal(self):
+        field = SemanticTextField(model="multilingual-e5-large", field_map={"text": "content"})
+        result = field.to_dict()
+        assert result == {
+            "type": "semantic_text",
+            "model": "multilingual-e5-large",
+            "field_map": {"text": "content"},
+        }
+
+    def test_to_dict_with_parameters(self):
+        field = SemanticTextField(
+            model="multilingual-e5-large",
+            field_map={"text": "content"},
+            read_parameters={"truncate": "END"},
+            write_parameters={"truncate": "START"},
+        )
+        result = field.to_dict()
+        assert result == {
+            "type": "semantic_text",
+            "model": "multilingual-e5-large",
+            "field_map": {"text": "content"},
+            "read_parameters": {"truncate": "END"},
+            "write_parameters": {"truncate": "START"},
+        }
+
+    def test_to_dict_with_description(self):
+        field = SemanticTextField(
+            model="multilingual-e5-large",
+            field_map={"text": "content"},
+            description="Semantic search field",
+        )
+        result = field.to_dict()
+        assert result == {
+            "type": "semantic_text",
+            "model": "multilingual-e5-large",
+            "field_map": {"text": "content"},
+            "description": "Semantic search field",
+        }