Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ The `MoorchehClient` and `AsyncMoorchehClient` classes provide the same method s
| `namespaces.delete` | namespace_name | Delete a namespace by name. |
| `documents.upload` | namespace_name, documents | Upload text documents to a text namespace. |
| `documents.get` | namespace_name, ids | Retrieve documents by ID. |
| `documents.fetch_text_data` | namespace_name, limit?, next_token? | List text/summary chunks (cursor pagination). |
| `documents.upload_file` | namespace_name, file_path | Upload a file for server-side ingestion. |
| `documents.list_files` | namespace_name | List raw files in storage for a namespace. |
| `documents.delete` | namespace_name, ids | Delete documents by ID. |
Expand Down
52 changes: 35 additions & 17 deletions examples/09_fetch_text_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@

def main():
"""
Example: list stored text and summary chunks for a text namespace (GET
fetch-text-data). Up to 100 items per response. For retrieving full
documents by ID, use client.documents.get instead.
Example: list stored text and summary chunks with cursor pagination (GET
fetch-text-data). Up to 100 items per page. For retrieving full documents
by ID, use client.documents.get instead.
"""
logger.info("--- Moorcheh SDK: Fetch Text Data Example ---")

Expand All @@ -50,27 +50,45 @@ def main():

try:
with client:
logger.info(f"Fetching text chunks from namespace '{target_namespace}'...")
response = client.documents.fetch_text_data(
namespace_name=target_namespace,
)
all_items = []
next_token = None
page = 0

logger.info("--- API Response (200 OK) ---")
logger.info(json.dumps(response, indent=2))
logger.info("-------------------------------")
while True:
page += 1
logger.info(
f"Fetching page {page} from namespace '{target_namespace}'..."
)
response = client.documents.fetch_text_data(
namespace_name=target_namespace,
limit=100,
next_token=next_token,
)

if response.get("status") == "success":
items = response.get("items") or []
all_items.extend(items)
pagination = response.get("pagination") or {}
has_more = pagination.get("has_more", False)
next_token = pagination.get("next_token") if has_more else None

stats = response.get("statistics") or {}
logger.info(
f"✅ Fetched {len(items)} item(s). "
f"statistics.total_items={stats.get('total_items')}"
)
else:
logger.warning(
f"Unexpected status in response: {response.get('status')!r}"
f"Page {page}: {len(items)} item(s) on this page "
f"(statistics.total_items={stats.get('total_items')}, "
f"has_more={has_more})"
)

if not has_more or not next_token:
break

logger.info("--- Summary ---")
logger.info(
f"Total items collected across {page} page(s): {len(all_items)}"
)
if all_items:
logger.info("First item (truncated):")
logger.info(json.dumps(all_items[0], indent=2)[:500])

except NamespaceNotFound:
logger.error(f"Namespace '{target_namespace}' was not found.")
logger.info(
Expand Down
74 changes: 60 additions & 14 deletions moorcheh_sdk/resources/documents.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import asyncio
import os
from pathlib import Path
from typing import BinaryIO, cast
from typing import Any, BinaryIO, cast

import httpx

Expand Down Expand Up @@ -30,6 +30,22 @@
logger = setup_logging(__name__)


def _fetch_text_data_params(
limit: int | None = None,
next_token: str | None = None,
) -> dict[str, Any] | None:
params: dict[str, Any] = {}
if limit is not None:
if not isinstance(limit, int) or limit < 1 or limit > 100:
raise InvalidInputError(
"Argument 'limit' must be a positive integer between 1 and 100."
)
params["limit"] = limit
if next_token is not None:
params["next_token"] = next_token
return params or None


def _deletion_processed_count(response: dict) -> int:
raw = response.get("actual_deletions")
if raw is not None:
Expand Down Expand Up @@ -214,40 +230,58 @@ def get(self, namespace_name: str, ids: list[str | int]) -> DocumentGetResponse:
return cast(DocumentGetResponse, response_data)

@required_args(["namespace_name"], types={"namespace_name": str})
def fetch_text_data(self, namespace_name: str) -> FetchTextDataResponse:
def fetch_text_data(
self,
namespace_name: str,
*,
limit: int | None = None,
next_token: str | None = None,
) -> FetchTextDataResponse:
"""
Lists stored text and summary chunks for a text namespace (up to 100 per request).
Lists stored text and summary chunks for a text namespace (cursor pagination).

Use this to export or display stored chunks, or for RAG. Only namespaces with
``type == "text"`` are supported. This differs from :meth:`get`, which retrieves
full documents by **ID** via ``POST .../documents/get``.

Each page returns up to 100 items. When ``pagination.has_more`` is ``True``,
pass ``pagination.next_token`` as ``next_token`` on the next call.

Args:
namespace_name: The name of the target text-based namespace.
limit: Maximum items per page (1–100). Omit for the API default (100).
next_token: Cursor from a previous response's ``pagination.next_token``.
Omit on the first request.

Returns:
A dictionary with ``status``, ``message``, ``namespace``, ``statistics``,
``items`` (list of chunks with ``text``, ``metadata``, ``created_at``,
``is_summary``, etc.), and ``execution_time``. Response keys use snake_case.
A dictionary with ``status``, ``message``, ``namespace``, ``statistics``
(for this page only), ``items``, ``pagination`` (``limit``, ``has_more``,
``next_token``), and ``execution_time``. Response keys use snake_case.

Raises:
InvalidInputError: If ``namespace_name`` is invalid.
InvalidInputError: If ``namespace_name``, ``limit``, or ``next_token`` is invalid.
NamespaceNotFound: If the namespace does not exist (404).
AuthenticationError: If authentication fails (401/403).
APIError: For other API errors.
MoorchehError: For network issues.
"""
params = _fetch_text_data_params(limit=limit, next_token=next_token)
logger.info(f"Fetching text data from namespace '{namespace_name}'...")
endpoint = f"/namespaces/{namespace_name}/documents/fetch-text-data"
response_data = self._client._request("GET", endpoint, expected_status=200)
response_data = self._client._request(
"GET", endpoint, params=params, expected_status=200
)
if not isinstance(response_data, dict):
logger.error("Fetch text data response was not a dictionary.")
raise APIError(
message="Unexpected response format from fetch text data endpoint."
)
item_count = len(response_data.get("items", []))
pagination = response_data.get("pagination") or {}
has_more = pagination.get("has_more", False)
logger.info(
f"Fetched {item_count} text item(s) from namespace '{namespace_name}'."
f"Fetched {item_count} text item(s) from namespace '{namespace_name}'"
f" (has_more={has_more})."
)
return cast(FetchTextDataResponse, response_data)

Expand Down Expand Up @@ -849,38 +883,50 @@ async def get(
return cast(DocumentGetResponse, response_data)

@required_args(["namespace_name"], types={"namespace_name": str})
async def fetch_text_data(self, namespace_name: str) -> FetchTextDataResponse:
async def fetch_text_data(
self,
namespace_name: str,
*,
limit: int | None = None,
next_token: str | None = None,
) -> FetchTextDataResponse:
"""
Lists stored text and summary chunks for a text namespace (up to 100 per request).
Lists stored text and summary chunks for a text namespace (cursor pagination).

Async counterpart of :meth:`Documents.fetch_text_data`.

Args:
namespace_name: The name of the target text-based namespace.
limit: Maximum items per page (1–100). Omit for the API default (100).
next_token: Cursor from a previous response's ``pagination.next_token``.

Returns:
Same structure as :meth:`Documents.fetch_text_data` (snake_case keys).

Raises:
InvalidInputError: If ``namespace_name`` is invalid.
InvalidInputError: If ``namespace_name``, ``limit``, or ``next_token`` is invalid.
NamespaceNotFound: If the namespace does not exist (404).
AuthenticationError: If authentication fails (401/403).
APIError: For other API errors.
MoorchehError: For network issues.
"""
params = _fetch_text_data_params(limit=limit, next_token=next_token)
logger.info(f"Fetching text data from namespace '{namespace_name}'...")
endpoint = f"/namespaces/{namespace_name}/documents/fetch-text-data"
response_data = await self._client._request(
"GET", endpoint, expected_status=200
"GET", endpoint, params=params, expected_status=200
)
if not isinstance(response_data, dict):
logger.error("Fetch text data response was not a dictionary.")
raise APIError(
message="Unexpected response format from fetch text data endpoint."
)
item_count = len(response_data.get("items", []))
pagination = response_data.get("pagination") or {}
has_more = pagination.get("has_more", False)
logger.info(
f"Fetched {item_count} text item(s) from namespace '{namespace_name}'."
f"Fetched {item_count} text item(s) from namespace '{namespace_name}'"
f" (has_more={has_more})."
)
return cast(FetchTextDataResponse, response_data)

Expand Down
2 changes: 2 additions & 0 deletions moorcheh_sdk/types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
FileUploadResponse,
ListFilesResponse,
TextDataItem,
TextDataPagination,
TextDataStatistics,
)
from .namespace import Namespace, NamespaceCreateResponse, NamespaceListResponse
Expand Down Expand Up @@ -47,6 +48,7 @@
"DocumentGetResponse",
"FetchTextDataResponse",
"TextDataItem",
"TextDataPagination",
"TextDataStatistics",
"FileDeleteResponse",
"FileDeleteResult",
Expand Down
15 changes: 12 additions & 3 deletions moorcheh_sdk/types/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,27 @@ class TextDataItem(TypedDict, total=False):
id: str
text: str
metadata: dict[str, Any] | None
created_at: int
created_at: str | None
is_summary: bool


class TextDataStatistics(TypedDict, total=False):
total_items: int
total_text_chunks: int
total_summary_chunks: int
created_at_min: int
created_at_max: int
created_at_min: str | None
created_at_max: str | None
source_counts: dict[str, int]


class TextDataPagination(TypedDict, total=False):
"""Cursor pagination metadata from ``Documents.fetch_text_data``."""

limit: int
has_more: bool
next_token: str | None


class FetchTextDataResponse(TypedDict, total=False):
"""Response from ``GET .../documents/fetch-text-data`` (keys normalized to snake_case)."""

Expand All @@ -49,6 +57,7 @@ class FetchTextDataResponse(TypedDict, total=False):
namespace: str
statistics: TextDataStatistics
items: list[TextDataItem]
pagination: TextDataPagination
execution_time: float


Expand Down
51 changes: 50 additions & 1 deletion tests/resources/test_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,10 +199,15 @@ def test_fetch_text_data_success(client, mocker, mock_response):
"id": "chunk-1",
"text": "Hello",
"metadata": {"source": "a.txt"},
"created_at": 1700000000000,
"created_at": "2025-12-19T18:18:57.700Z",
"is_summary": False,
}
],
"pagination": {
"limit": 100,
"has_more": False,
"next_token": None,
},
"execution_time": 0.05,
}
mock_resp = mock_response(200, json_data=expected_response)
Expand All @@ -219,6 +224,50 @@ def test_fetch_text_data_success(client, mocker, mock_response):
assert result == expected_response


def test_fetch_text_data_with_pagination_params(client, mocker, mock_response):
"""Test fetch-text-data passes limit and next_token query params."""
expected_response = {
"status": "success",
"message": "Fetched 1 text items.",
"namespace": TEST_NAMESPACE,
"statistics": {"total_items": 1},
"items": [{"id": "chunk-2", "text": "World", "is_summary": False}],
"pagination": {
"limit": 50,
"has_more": True,
"next_token": "token-page-2",
},
"execution_time": 0.02,
}
mock_resp = mock_response(200, json_data=expected_response)
client._mock_httpx_instance.request.return_value = mock_resp

result = client.documents.fetch_text_data(
namespace_name=TEST_NAMESPACE,
limit=50,
next_token="token-page-1",
)

client._mock_httpx_instance.request.assert_called_once_with(
method="GET",
url=f"/namespaces/{TEST_NAMESPACE}/documents/fetch-text-data",
json=None,
params={"limit": 50, "next_token": "token-page-1"},
)
assert result["pagination"]["has_more"] is True
assert result["pagination"]["next_token"] == "token-page-2"


@pytest.mark.parametrize("invalid_limit", [0, 101, -1, 1.5])
def test_fetch_text_data_invalid_limit(client, invalid_limit):
"""Test client-side validation for limit."""
with pytest.raises(InvalidInputError, match="limit"):
client.documents.fetch_text_data(
namespace_name=TEST_NAMESPACE,
limit=invalid_limit,
)


def test_fetch_text_data_namespace_not_found(client, mocker, mock_response):
"""Test fetch_text_data when namespace is missing."""
error_text = f"Namespace '{TEST_NAMESPACE}' not found."
Expand Down
27 changes: 27 additions & 0 deletions tests/test_async_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ async def test_documents_fetch_text_data(client):
"namespace": "test",
"statistics": {"total_items": 0},
"items": [],
"pagination": {"limit": 100, "has_more": False, "next_token": None},
"execution_time": 0.01,
}

Expand All @@ -107,6 +108,32 @@ async def test_documents_fetch_text_data(client):
assert kwargs["params"] is None


@pytest.mark.asyncio
async def test_documents_fetch_text_data_pagination(client):
mock_response = {
"status": "success",
"namespace": "test",
"items": [],
"pagination": {"limit": 25, "has_more": True, "next_token": "next"},
}

with patch.object(client, "request", new_callable=AsyncMock) as mock_request:
mock_request.return_value = MagicMock(
status_code=200, json=lambda: mock_response
)

await client.documents.fetch_text_data(
namespace_name="test",
limit=25,
next_token="prev",
)

assert mock_request.call_args.kwargs["params"] == {
"limit": 25,
"next_token": "prev",
}


@pytest.mark.asyncio
async def test_search_query(client):
mock_response = {"results": [], "execution_time": 0.1}
Expand Down
Loading