Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions dev/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,26 @@ python dev/generate_cli_docs.py
- `cocoindex` package must be importable (the CLI module)

This ensures that CLI documentation is always kept in sync with the actual command-line interface.

## Type-checking Examples

We provide a helper script to run mypy on each example entry point individually with minimal assumptions about optional dependencies.

### `mypy_check_examples.ps1`

Runs mypy for every `main.py` (and `colpali_main.py`) under the `examples/` folder using these rules:

- Only ignore missing imports (no broad suppressions)
- Suppress errors reported from imported modules (such as CocoIndex internals) by setting `--follow-imports=silent`, so only errors in the example file itself are shown
- Make CocoIndex sources discoverable via `MYPYPATH=python`

Usage (Windows PowerShell):

```powershell
powershell -NoProfile -ExecutionPolicy Bypass -File dev/mypy_check_examples.ps1
```

Notes:

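- Ensure `mypy` is installed in a local virtual environment (e.g. `.venv` with `pip install mypy`). If `.venv\Scripts\python.exe` is not found, the script falls back to the `python` on your `PATH`, which must then have `mypy` installed.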
- The script will report any failing example files and exit non-zero on failures.
34 changes: 34 additions & 0 deletions dev/mypy_check_examples.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Runs mypy on every example entry point (main.py / colpali_main.py) under
# examples/, one file per mypy invocation, then lists the files that failed
# and exits with code 1 if there were any failures.

# Turn PowerShell errors (e.g. a missing examples/ directory) into terminating errors.
$ErrorActionPreference = 'Stop'

# Resolve python in the local venv; fall back to whatever `python` is on PATH.
$repoRoot = Split-Path -Parent $PSScriptRoot
$python = Join-Path $repoRoot '.venv\Scripts\python.exe'
if (-not (Test-Path $python)) {
    $python = 'python'
}

# Ensure mypy can resolve local cocoindex package sources
$env:MYPYPATH = Join-Path $repoRoot 'python'

# Collect example entry files, sorted so the output order is stable between runs
$examples = Join-Path $repoRoot 'examples'
$files = Get-ChildItem -Path $examples -Recurse -File |
    Where-Object { $_.Name -in @('main.py', 'colpali_main.py') } |
    Sort-Object FullName

# Check each file in its own mypy run and remember the ones that fail.
$failed = @()
foreach ($f in $files) {
    Write-Host (">>> Checking " + $f.FullName)
    & $python -m mypy --ignore-missing-imports --follow-imports=silent $f.FullName
    # Native commands do not throw on failure; inspect the exit code explicitly.
    if ($LASTEXITCODE -ne 0) {
        $failed += $f.FullName
    }
}

# NOTE: PowerShell's newline escape is backtick-n; "\n" would be printed literally.
if ($failed.Count -gt 0) {
    Write-Host "`nFailures:"
    $failed | ForEach-Object { Write-Host $_ }
    exit 1
} else {
    Write-Host "`nAll example entry files passed mypy."
}
3 changes: 2 additions & 1 deletion examples/custom_output_files/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import cocoindex
from markdown_it import MarkdownIt
from typing import cast

_markdown_it = MarkdownIt("gfm-like")

Expand Down Expand Up @@ -96,7 +97,7 @@ def mutate(

@cocoindex.op.function()
def markdown_to_html(text: str) -> str:
return _markdown_it.render(text)
return cast(str, _markdown_it.render(text))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd also like us to weigh code simplicity and readability here. These examples exist to help users understand how to use cocoindex, so this matters.

In a case like this, the underlying library simply doesn't provide a specific type, while the return type of the current function is clear.
I think we can just suppress the error with a comment like `# type: ignore`.

(IMO `cast(...)` is more useful for values within a function: from that point on, the type is clear.)



@cocoindex.flow_def(name="CustomOutputFiles")
Expand Down
3 changes: 2 additions & 1 deletion examples/face_recognition/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import face_recognition
import numpy as np
from PIL import Image
from typing import cast

QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6334/")
QDRANT_COLLECTION = "face_embeddings"
Expand Down Expand Up @@ -85,7 +86,7 @@ def extract_face_embedding(
np.array(img),
known_face_locations=[(0, img.width - 1, img.height - 1, 0)],
)[0]
return embedding
return cast(cocoindex.Vector[cocoindex.Float32], embedding)


@cocoindex.flow_def(name="FaceRecognition")
Expand Down
14 changes: 9 additions & 5 deletions examples/fastapi_server_docker/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from psycopg_pool import ConnectionPool
from contextlib import asynccontextmanager
import os
from typing import Any, AsyncIterator


@cocoindex.transform_flow()
Expand All @@ -26,7 +27,7 @@ def text_to_embedding(
@cocoindex.flow_def(name="MarkdownEmbeddingFastApiExample")
def markdown_embedding_flow(
flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
):
) -> None:
"""
Define an example flow that embeds markdown files into a vector database.
"""
Expand Down Expand Up @@ -65,7 +66,7 @@ def markdown_embedding_flow(
)


def search(pool: ConnectionPool, query: str, top_k: int = 5):
def search(pool: ConnectionPool, query: str, top_k: int = 5) -> list[dict[str, Any]]:
# Get the table name, for the export target in the text_embedding_flow above.
table_name = cocoindex.utils.get_target_default_name(
markdown_embedding_flow, "doc_embeddings"
Expand All @@ -89,7 +90,7 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5):


@asynccontextmanager
def lifespan(app: FastAPI):
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
load_dotenv()
cocoindex.init()
pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
Expand All @@ -103,16 +104,19 @@ def lifespan(app: FastAPI):
fastapi_app = FastAPI(lifespan=lifespan)


@fastapi_app.get("/search")
def search_endpoint(
request: Request,
q: str = Query(..., description="Search query"),
limit: int = Query(5, description="Number of results"),
):
) -> dict[str, Any]:
pool = request.app.state.pool
results = search(pool, q, limit)
return {"results": results}


# Attach route without using decorator to avoid untyped-decorator when FastAPI types are unavailable
fastapi_app.get("/search")(search_endpoint)
Comment on lines +117 to +118
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be really unfortunate if we have to work around the issue this way.

What does "when FastAPI types are unavailable" mean? Is a package missing? If we add that package as a dependency, will the check pass?



if __name__ == "__main__":
uvicorn.run(fastapi_app, host="0.0.0.0", port=8080)
7 changes: 4 additions & 3 deletions examples/gdrive_text_embedding/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import cocoindex
import datetime
import os
from typing import Any


@cocoindex.transform_flow()
Expand All @@ -23,7 +24,7 @@ def text_to_embedding(
@cocoindex.flow_def(name="GoogleDriveTextEmbedding")
def gdrive_text_embedding_flow(
flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
):
) -> None:
"""
Define an example flow that embeds text into a vector database.
"""
Expand Down Expand Up @@ -71,7 +72,7 @@ def gdrive_text_embedding_flow(
)


def search(pool: ConnectionPool, query: str, top_k: int = 5):
def search(pool: ConnectionPool, query: str, top_k: int = 5) -> list[dict[str, Any]]:
# Get the table name, for the export target in the gdrive_text_embedding_flow above.
table_name = cocoindex.utils.get_target_default_name(
gdrive_text_embedding_flow, "doc_embeddings"
Expand All @@ -94,7 +95,7 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5):
]


def _main():
def _main() -> None:
# Initialize the database connection pool.
pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
# Run queries in a loop to demonstrate the query capabilities.
Expand Down
11 changes: 7 additions & 4 deletions examples/image_search/colpali_main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import datetime
import os
from contextlib import asynccontextmanager
from typing import Any
from typing import Any, AsyncIterator

import cocoindex
from dotenv import load_dotenv
Expand Down Expand Up @@ -71,7 +71,7 @@ def image_object_embedding_flow(


@asynccontextmanager
async def lifespan(app: FastAPI) -> None:
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
load_dotenv()
cocoindex.init()
image_object_embedding_flow.setup(report_to_stdout=True)
Expand Down Expand Up @@ -100,11 +100,10 @@ async def lifespan(app: FastAPI) -> None:


# --- Search API ---
@app.get("/search")
def search(
q: str = Query(..., description="Search query"),
limit: int = Query(5, description="Number of results"),
) -> Any:
) -> dict[str, Any]:
# Get the multi-vector embedding for the query
query_embedding = text_to_colpali_embedding.eval(q)
print(
Expand Down Expand Up @@ -132,3 +131,7 @@ def search(
for result in search_results.points
]
}


# Attach route without using decorator to avoid untyped-decorator when FastAPI types are unavailable
app.get("/search")(search)
20 changes: 12 additions & 8 deletions examples/image_search/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import io
import os
from contextlib import asynccontextmanager
from typing import Any, Literal
from typing import Any, Literal, Final, TypeAlias, cast, AsyncIterator

import cocoindex
import torch
Expand All @@ -19,7 +19,8 @@
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6334/")
QDRANT_COLLECTION = "ImageSearch"
CLIP_MODEL_NAME = "openai/clip-vit-large-patch14"
CLIP_MODEL_DIMENSION = 768
CLIP_MODEL_DIMENSION: Final[int] = 768
CLIPVector: TypeAlias = cocoindex.Vector[cocoindex.Float32, Literal[768]]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With this change, it seems `CLIP_MODEL_DIMENSION` is no longer used. I think we can remove it?



@functools.cache
Expand All @@ -37,13 +38,13 @@ def embed_query(text: str) -> list[float]:
inputs = processor(text=[text], return_tensors="pt", padding=True)
with torch.no_grad():
features = model.get_text_features(**inputs)
return features[0].tolist()
return cast(list[float], features[0].tolist())


@cocoindex.op.function(cache=True, behavior_version=1, gpu=True)
def embed_image(
img_bytes: bytes,
) -> cocoindex.Vector[cocoindex.Float32, Literal[CLIP_MODEL_DIMENSION]]:
) -> CLIPVector:
"""
Convert image to embedding using CLIP model.
"""
Expand All @@ -52,7 +53,7 @@ def embed_image(
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
features = model.get_image_features(**inputs)
return features[0].tolist()
return cast(CLIPVector, features[0].tolist())


# CocoIndex flow: Ingest images, extract captions, embed, export to Qdrant
Expand Down Expand Up @@ -112,7 +113,7 @@ def image_object_embedding_flow(


@asynccontextmanager
async def lifespan(app: FastAPI) -> None:
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
load_dotenv()
cocoindex.init()
image_object_embedding_flow.setup(report_to_stdout=True)
Expand Down Expand Up @@ -141,11 +142,10 @@ async def lifespan(app: FastAPI) -> None:


# --- Search API ---
@app.get("/search")
def search(
q: str = Query(..., description="Search query"),
limit: int = Query(5, description="Number of results"),
) -> Any:
) -> dict[str, Any]:
# Get the embedding for the query
query_embedding = embed_query(q)

Expand All @@ -169,3 +169,7 @@ def search(
for result in search_results
]
}


# Attach route without using decorator to avoid untyped-decorator when FastAPI types are unavailable
app.get("/search")(search)
9 changes: 5 additions & 4 deletions examples/manuals_llm_extraction/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from marker.models import create_model_dict
from marker.output import text_from_rendered
from marker.config.parser import ConfigParser
from typing import cast

import cocoindex

Expand All @@ -20,7 +21,7 @@ class PdfToMarkdownExecutor:
spec: PdfToMarkdown
_converter: PdfConverter

def prepare(self):
def prepare(self) -> None:
config_parser = ConfigParser({})
self._converter = PdfConverter(
create_model_dict(), config=config_parser.generate_config_dict()
Expand All @@ -30,8 +31,8 @@ def __call__(self, content: bytes) -> str:
with tempfile.NamedTemporaryFile(delete=True, suffix=".pdf") as temp_file:
temp_file.write(content)
temp_file.flush()
text, _, _ = text_from_rendered(self._converter(temp_file.name))
return text
text_any, _, _ = text_from_rendered(self._converter(temp_file.name))
return cast(str, text_any)


@dataclasses.dataclass
Expand Down Expand Up @@ -90,7 +91,7 @@ def summarize_module(module_info: ModuleInfo) -> ModuleSummary:
@cocoindex.flow_def(name="ManualExtraction")
def manual_extraction_flow(
flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
):
) -> None:
"""
Define an example flow that extracts manual information from a Markdown.
"""
Expand Down
5 changes: 3 additions & 2 deletions examples/paper_metadata/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from marker.models import create_model_dict
from marker.output import text_from_rendered
from functools import cache
from typing import cast
from pypdf import PdfReader, PdfWriter


Expand Down Expand Up @@ -66,8 +67,8 @@ def pdf_to_markdown(content: bytes) -> str:
with tempfile.NamedTemporaryFile(delete=True, suffix=".pdf") as temp_file:
temp_file.write(content)
temp_file.flush()
text, _, _ = text_from_rendered(get_marker_converter()(temp_file.name))
return text
text_any, _, _ = text_from_rendered(get_marker_converter()(temp_file.name))
return cast(str, text_any)


@cocoindex.flow_def(name="PaperMetadata")
Expand Down
8 changes: 5 additions & 3 deletions examples/patient_intake_extraction/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from markitdown import MarkItDown
from openai import OpenAI
from typing import cast

import cocoindex

Expand Down Expand Up @@ -97,7 +98,7 @@ class ToMarkdownExecutor:
spec: ToMarkdown
_converter: MarkItDown

def prepare(self):
def prepare(self) -> None:
client = OpenAI()
self._converter = MarkItDown(llm_client=client, llm_model="gpt-4o")

Expand All @@ -106,14 +107,15 @@ def __call__(self, content: bytes, filename: str) -> str:
with tempfile.NamedTemporaryFile(delete=True, suffix=suffix) as temp_file:
temp_file.write(content)
temp_file.flush()
text = self._converter.convert(temp_file.name).text_content
text_any = self._converter.convert(temp_file.name).text_content
text: str = cast(str, text_any)
return text


@cocoindex.flow_def(name="PatientIntakeExtraction")
def patient_intake_extraction_flow(
flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
):
) -> None:
"""
Define a flow that extracts patient information from intake forms.
"""
Expand Down
Loading