class SchemaBuilder:
    """Chainable builder that assembles an index schema one field at a time.

    Each field method (``text``, ``integer``, ``float``, ``dense_vector``,
    ``sparse_vector``, ``semantic_text``) stores a field definition under the
    given name and hands back the same builder, so calls can be strung
    together. Registering a name a second time replaces the earlier
    definition. ``build`` serializes the collected fields into a dictionary.

    Example usage::

        from pinecone import SchemaBuilder

        schema = (SchemaBuilder()
            .text("title", full_text_searchable=True)
            .integer("year", filterable=True)
            .dense_vector("embedding", dimension=1536, metric="cosine")
            .build())

        pc.create_index(name="my-index", schema=schema, ...)
    """

    def __init__(self) -> None:
        """Create a builder with no fields registered."""
        self._fields: dict[str, SchemaField] = {}

    def _register(self, name: str, field: SchemaField) -> SchemaBuilder:
        """Store ``field`` under ``name`` and return this builder.

        An existing entry with the same name is replaced.
        """
        self._fields[name] = field
        return self

    def text(
        self,
        name: str,
        *,
        filterable: bool = False,
        full_text_searchable: bool = False,
        description: str | None = None,
    ) -> SchemaBuilder:
        """Register a text field.

        :param name: The field name.
        :param filterable: Whether the field can be used in query filters.
        :param full_text_searchable: Whether the field is indexed for full-text search.
        :param description: Optional description of the field.
        :returns: This builder, to allow chaining.

        Example::

            schema = (SchemaBuilder()
                .text("title", full_text_searchable=True)
                .text("category", filterable=True)
                .build())
        """
        text_field = TextField(
            filterable=filterable,
            full_text_searchable=full_text_searchable,
            description=description,
        )
        return self._register(name, text_field)

    def integer(
        self, name: str, *, filterable: bool = False, description: str | None = None
    ) -> SchemaBuilder:
        """Register an integer field.

        :param name: The field name.
        :param filterable: Whether the field can be used in query filters.
        :param description: Optional description of the field.
        :returns: This builder, to allow chaining.

        Example::

            schema = (SchemaBuilder()
                .integer("year", filterable=True)
                .integer("count")
                .build())
        """
        integer_field = IntegerField(filterable=filterable, description=description)
        return self._register(name, integer_field)

    def float(
        self, name: str, *, filterable: bool = False, description: str | None = None
    ) -> SchemaBuilder:
        """Register a float field.

        :param name: The field name.
        :param filterable: Whether the field can be used in query filters.
        :param description: Optional description of the field.
        :returns: This builder, to allow chaining.

        Example::

            schema = (SchemaBuilder()
                .float("price", filterable=True)
                .float("score")
                .build())
        """
        float_field = FloatField(filterable=filterable, description=description)
        return self._register(name, float_field)

    def dense_vector(
        self, name: str, *, dimension: int, metric: str, description: str | None = None
    ) -> SchemaBuilder:
        """Register a dense vector field.

        :param name: The field name.
        :param dimension: The dimension of the vectors (1 to 20000).
        :param metric: The distance metric ("cosine", "euclidean", or "dotproduct").
        :param description: Optional description of the field.
        :returns: This builder, to allow chaining.

        Example::

            schema = (SchemaBuilder()
                .dense_vector("embedding", dimension=1536, metric="cosine")
                .build())
        """
        dense_field = DenseVectorField(
            dimension=dimension, metric=metric, description=description
        )
        return self._register(name, dense_field)

    def sparse_vector(
        self, name: str, *, metric: str = "dotproduct", description: str | None = None
    ) -> SchemaBuilder:
        """Register a sparse vector field.

        :param name: The field name.
        :param metric: The distance metric (must be "dotproduct" for sparse vectors).
        :param description: Optional description of the field.
        :returns: This builder, to allow chaining.

        Example::

            schema = (SchemaBuilder()
                .sparse_vector("sparse_embedding")
                .build())
        """
        sparse_field = SparseVectorField(metric=metric, description=description)
        return self._register(name, sparse_field)

    def semantic_text(
        self,
        name: str,
        *,
        model: str,
        field_map: dict[str, str],
        read_parameters: dict[str, object] | None = None,
        write_parameters: dict[str, object] | None = None,
        description: str | None = None,
    ) -> SchemaBuilder:
        """Register a semantic text field with integrated inference.

        :param name: The field name.
        :param model: The name of the embedding model to use.
        :param field_map: Maps field names in documents to the field used for embedding.
        :param read_parameters: Optional parameters for the model during queries.
        :param write_parameters: Optional parameters for the model during indexing.
        :param description: Optional description of the field.
        :returns: This builder, to allow chaining.

        Example::

            schema = (SchemaBuilder()
                .semantic_text(
                    "content",
                    model="multilingual-e5-large",
                    field_map={"text": "content"},
                )
                .build())
        """
        semantic_field = SemanticTextField(
            model=model,
            field_map=field_map,
            read_parameters=read_parameters,
            write_parameters=write_parameters,
            description=description,
        )
        return self._register(name, semantic_field)

    def build(self) -> dict[str, dict]:
        """Serialize every registered field into the final schema dictionary.

        :returns: A dictionary keyed by field name whose values are the
            result of each field's ``to_dict()``.
        :raises ValueError: If no fields have been added to the builder.

        Example::

            schema = (SchemaBuilder()
                .text("title", full_text_searchable=True)
                .dense_vector("embedding", dimension=1536, metric="cosine")
                .build())

            # Returns:
            # {
            #     "title": {"type": "string", "full_text_searchable": True},
            #     "embedding": {"type": "dense_vector", "dimension": 1536, "metric": "cosine"}
            # }
        """
        if self._fields:
            serialized: dict[str, dict] = {}
            for field_name, field_def in self._fields.items():
                serialized[field_name] = field_def.to_dict()
            return serialized
        raise ValueError("Cannot build empty schema. Add at least one field.")
"""Tests for SchemaBuilder fluent API."""

import os
import sys
import types

import pytest


# Directory holding the model sources, relative to this test file.
_MODELS_DIR = os.path.join(
    os.path.dirname(__file__), "..", "..", "..", "pinecone", "db_control", "models"
)
# Dotted package name the standalone modules are registered under.
_MODELS_PACKAGE = "pinecone.db_control.models"


def _load_standalone(stem):
    """Execute ``<stem>.py`` from the models directory as a standalone module.

    The module object is placed in ``sys.modules`` under its full dotted name
    *before* its source is executed, and it stays registered afterwards.

    :param stem: File name without the ``.py`` suffix (e.g. ``"schema_fields"``).
    :returns: The populated module object.
    """
    module_name = f"{_MODELS_PACKAGE}.{stem}"
    module = types.ModuleType(module_name)
    module.__file__ = os.path.join(_MODELS_DIR, f"{stem}.py")
    sys.modules[module_name] = module
    # Read bytes and let compile() decode them: that applies Python's own
    # source-encoding rules instead of the locale default used by open().
    with open(module.__file__, "rb") as source_file:
        code = compile(source_file.read(), module.__file__, "exec")
    exec(code, module.__dict__)
    return module


def _load_schema_builder_module():
    """Load schema_builder.py as a standalone module to avoid broken imports.

    ``schema_fields`` is loaded first because ``schema_builder`` imports its
    field classes from it; registering it in ``sys.modules`` beforehand lets
    that import be satisfied without importing the surrounding package.

    :returns: The loaded ``schema_builder`` module.
    """
    _load_standalone("schema_fields")
    return _load_standalone("schema_builder")


_schema_builder = _load_schema_builder_module()
SchemaBuilder = _schema_builder.SchemaBuilder


class TestSchemaBuilderBasic:
    """Each field type on its own, plus the empty-builder error."""

    def test_empty_builder_raises_on_build(self):
        builder = SchemaBuilder()
        with pytest.raises(ValueError, match="Cannot build empty schema"):
            builder.build()

    def test_single_text_field(self):
        schema = SchemaBuilder().text("title").build()
        assert schema == {"title": {"type": "string"}}

    def test_single_integer_field(self):
        schema = SchemaBuilder().integer("year").build()
        assert schema == {"year": {"type": "integer"}}

    def test_single_float_field(self):
        schema = SchemaBuilder().float("price").build()
        assert schema == {"price": {"type": "float"}}

    def test_single_dense_vector_field(self):
        schema = SchemaBuilder().dense_vector("embedding", dimension=1536, metric="cosine").build()
        assert schema == {
            "embedding": {"type": "dense_vector", "dimension": 1536, "metric": "cosine"}
        }

    def test_single_sparse_vector_field(self):
        schema = SchemaBuilder().sparse_vector("sparse").build()
        assert schema == {"sparse": {"type": "sparse_vector", "metric": "dotproduct"}}

    def test_single_semantic_text_field(self):
        schema = (
            SchemaBuilder()
            .semantic_text("content", model="multilingual-e5-large", field_map={"text": "content"})
            .build()
        )
        assert schema == {
            "content": {
                "type": "semantic_text",
                "model": "multilingual-e5-large",
                "field_map": {"text": "content"},
            }
        }


class TestSchemaBuilderChaining:
    """Field methods return the builder so calls can be chained."""

    def test_method_chaining_returns_builder(self):
        builder = SchemaBuilder()
        result = builder.text("title")
        assert result is builder

    def test_multiple_fields_chained(self):
        schema = (
            SchemaBuilder()
            .text("title", full_text_searchable=True)
            .integer("year", filterable=True)
            .dense_vector("embedding", dimension=1536, metric="cosine")
            .build()
        )
        assert schema == {
            "title": {"type": "string", "full_text_searchable": True},
            "year": {"type": "integer", "filterable": True},
            "embedding": {"type": "dense_vector", "dimension": 1536, "metric": "cosine"},
        }

    def test_all_field_types_chained(self):
        schema = (
            SchemaBuilder()
            .text("title")
            .integer("year")
            .float("price")
            .dense_vector("dense", dimension=768, metric="euclidean")
            .sparse_vector("sparse")
            .semantic_text("content", model="model", field_map={"text": "content"})
            .build()
        )
        assert len(schema) == 6
        assert "title" in schema
        assert "year" in schema
        assert "price" in schema
        assert "dense" in schema
        assert "sparse" in schema
        assert "content" in schema


class TestSchemaBuilderFieldOptions:
    """Optional keyword arguments show up in the serialized field."""

    def test_text_with_all_options(self):
        schema = (
            SchemaBuilder()
            .text(
                "title",
                filterable=True,
                full_text_searchable=True,
                description="The document title",
            )
            .build()
        )
        assert schema == {
            "title": {
                "type": "string",
                "filterable": True,
                "full_text_searchable": True,
                "description": "The document title",
            }
        }

    def test_integer_with_all_options(self):
        schema = (
            SchemaBuilder().integer("year", filterable=True, description="Publication year").build()
        )
        assert schema == {
            "year": {"type": "integer", "filterable": True, "description": "Publication year"}
        }

    def test_float_with_all_options(self):
        schema = SchemaBuilder().float("price", filterable=True, description="Price in USD").build()
        assert schema == {
            "price": {"type": "float", "filterable": True, "description": "Price in USD"}
        }

    def test_dense_vector_with_description(self):
        schema = (
            SchemaBuilder()
            .dense_vector(
                "embedding", dimension=1536, metric="cosine", description="OpenAI embeddings"
            )
            .build()
        )
        assert schema == {
            "embedding": {
                "type": "dense_vector",
                "dimension": 1536,
                "metric": "cosine",
                "description": "OpenAI embeddings",
            }
        }

    def test_sparse_vector_with_options(self):
        schema = (
            SchemaBuilder()
            .sparse_vector("sparse", metric="dotproduct", description="BM25 vectors")
            .build()
        )
        assert schema == {
            "sparse": {
                "type": "sparse_vector",
                "metric": "dotproduct",
                "description": "BM25 vectors",
            }
        }

    def test_semantic_text_with_all_options(self):
        schema = (
            SchemaBuilder()
            .semantic_text(
                "content",
                model="multilingual-e5-large",
                field_map={"text": "content"},
                read_parameters={"truncate": "END"},
                write_parameters={"truncate": "START"},
                description="Semantic search field",
            )
            .build()
        )
        assert schema == {
            "content": {
                "type": "semantic_text",
                "model": "multilingual-e5-large",
                "field_map": {"text": "content"},
                "read_parameters": {"truncate": "END"},
                "write_parameters": {"truncate": "START"},
                "description": "Semantic search field",
            }
        }


class TestSchemaBuilderOverwrite:
    """Re-using a field name replaces the earlier definition."""

    def test_adding_same_field_name_overwrites(self):
        schema = (
            SchemaBuilder()
            .text("field")
            .integer("field")  # Should overwrite the text field
            .build()
        )
        assert schema == {"field": {"type": "integer"}}


class TestSchemaBuilderUsageExamples:
    """Test the usage examples from the ticket."""

    def test_ticket_example(self):
        schema = (
            SchemaBuilder()
            .text("title", full_text_searchable=True)
            .integer("year", filterable=True)
            .dense_vector("embedding", dimension=1536, metric="cosine")
            .build()
        )
        assert schema == {
            "title": {"type": "string", "full_text_searchable": True},
            "year": {"type": "integer", "filterable": True},
            "embedding": {"type": "dense_vector", "dimension": 1536, "metric": "cosine"},
        }