From 38093bfb47bcbbd55ff28bbac12f4c75cac20537 Mon Sep 17 00:00:00 2001 From: Neelpatel1604 Date: Thu, 28 May 2026 16:40:52 -0400 Subject: [PATCH] feat: implement cursor pagination for fetch_text_data endpoint Enhanced the `fetch_text_data` method in both synchronous and asynchronous document resources to support cursor pagination. Updated the API response structure to include pagination metadata and modified the corresponding example and tests to reflect these changes. Documentation in README.md was also updated to include the new method signature and functionality. --- README.md | 1 + examples/09_fetch_text_data.py | 52 +++++++++++++------- moorcheh_sdk/resources/documents.py | 74 +++++++++++++++++++++++------ moorcheh_sdk/types/__init__.py | 2 + moorcheh_sdk/types/document.py | 15 ++++-- tests/resources/test_documents.py | 51 +++++++++++++++++++- tests/test_async_client.py | 27 +++++++++++ 7 files changed, 187 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 926fd50..9fa0805 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,7 @@ The `MoorchehClient` and `AsyncMoorchehClient` classes provide the same method s | `namespaces.delete` | namespace_name | Delete a namespace by name. | | `documents.upload` | namespace_name, documents | Upload text documents to a text namespace. | | `documents.get` | namespace_name, ids | Retrieve documents by ID. | +| `documents.fetch_text_data` | namespace_name, limit?, next_token? | List text/summary chunks (cursor pagination). | | `documents.upload_file` | namespace_name, file_path | Upload a file for server-side ingestion. | | `documents.list_files` | namespace_name | List raw files in storage for a namespace. | | `documents.delete` | namespace_name, ids | Delete documents by ID. | diff --git a/examples/09_fetch_text_data.py b/examples/09_fetch_text_data.py index 48cc55c..93c087b 100644 --- a/examples/09_fetch_text_data.py +++ b/examples/09_fetch_text_data.py @@ -25,9 +25,9 @@ def main(): """ - Example: list stored text and summary chunks for a text namespace (GET - fetch-text-data). Up to 100 items per response. For retrieving full - documents by ID, use client.documents.get instead. + Example: list stored text and summary chunks with cursor pagination (GET + fetch-text-data). Up to 100 items per page. For retrieving full documents + by ID, use client.documents.get instead. """ logger.info("--- Moorcheh SDK: Fetch Text Data Example ---") @@ -50,27 +50,45 @@ def main(): try: with client: - logger.info(f"Fetching text chunks from namespace '{target_namespace}'...") - response = client.documents.fetch_text_data( - namespace_name=target_namespace, - ) + all_items = [] + next_token = None + page = 0 - logger.info("--- API Response (200 OK) ---") - logger.info(json.dumps(response, indent=2)) - logger.info("-------------------------------") + while True: + page += 1 + logger.info( + f"Fetching page {page} from namespace '{target_namespace}'..." + ) + response = client.documents.fetch_text_data( + namespace_name=target_namespace, + limit=100, + next_token=next_token, + ) - if response.get("status") == "success": items = response.get("items") or [] + all_items.extend(items) + pagination = response.get("pagination") or {} + has_more = pagination.get("has_more", False) + next_token = pagination.get("next_token") if has_more else None + stats = response.get("statistics") or {} logger.info( - f"✅ Fetched {len(items)} item(s). " - f"statistics.total_items={stats.get('total_items')}" - ) - else: - logger.warning( - f"Unexpected status in response: {response.get('status')!r}" + f"Page {page}: {len(items)} item(s) on this page " + f"(statistics.total_items={stats.get('total_items')}, " + f"has_more={has_more})" ) + if not has_more or not next_token: + break + + logger.info("--- Summary ---") + logger.info( + f"Total items collected across {page} page(s): {len(all_items)}" + ) + if all_items: + logger.info("First item (truncated):") + logger.info(json.dumps(all_items[0], indent=2)[:500]) + except NamespaceNotFound: logger.error(f"Namespace '{target_namespace}' was not found.") logger.info( diff --git a/moorcheh_sdk/resources/documents.py b/moorcheh_sdk/resources/documents.py index 9c23ce9..19b59ab 100644 --- a/moorcheh_sdk/resources/documents.py +++ b/moorcheh_sdk/resources/documents.py @@ -1,7 +1,7 @@ import asyncio import os from pathlib import Path -from typing import BinaryIO, cast +from typing import Any, BinaryIO, cast import httpx @@ -30,6 +30,22 @@ logger = setup_logging(__name__) +def _fetch_text_data_params( + limit: int | None = None, + next_token: str | None = None, +) -> dict[str, Any] | None: + params: dict[str, Any] = {} + if limit is not None: + if not isinstance(limit, int) or limit < 1 or limit > 100: + raise InvalidInputError( + "Argument 'limit' must be a positive integer between 1 and 100." + ) + params["limit"] = limit + if next_token is not None: + params["next_token"] = next_token + return params or None + + def _deletion_processed_count(response: dict) -> int: raw = response.get("actual_deletions") if raw is not None: @@ -214,40 +230,58 @@ def get(self, namespace_name: str, ids: list[str | int]) -> DocumentGetResponse: return cast(DocumentGetResponse, response_data) @required_args(["namespace_name"], types={"namespace_name": str}) - def fetch_text_data(self, namespace_name: str) -> FetchTextDataResponse: + def fetch_text_data( + self, + namespace_name: str, + *, + limit: int | None = None, + next_token: str | None = None, + ) -> FetchTextDataResponse: """ - Lists stored text and summary chunks for a text namespace (up to 100 per request). + Lists stored text and summary chunks for a text namespace (cursor pagination). Use this to export or display stored chunks, or for RAG. Only namespaces with ``type == "text"`` are supported. This differs from :meth:`get`, which retrieves full documents by **ID** via ``POST .../documents/get``. + Each page returns up to 100 items. When ``pagination.has_more`` is ``True``, + pass ``pagination.next_token`` as ``next_token`` on the next call. + Args: namespace_name: The name of the target text-based namespace. + limit: Maximum items per page (1–100). Omit for the API default (100). + next_token: Cursor from a previous response's ``pagination.next_token``. + Omit on the first request. Returns: - A dictionary with ``status``, ``message``, ``namespace``, ``statistics``, - ``items`` (list of chunks with ``text``, ``metadata``, ``created_at``, - ``is_summary``, etc.), and ``execution_time``. Response keys use snake_case. + A dictionary with ``status``, ``message``, ``namespace``, ``statistics`` + (for this page only), ``items``, ``pagination`` (``limit``, ``has_more``, + ``next_token``), and ``execution_time``. Response keys use snake_case. Raises: - InvalidInputError: If ``namespace_name`` is invalid. + InvalidInputError: If ``namespace_name``, ``limit``, or ``next_token`` is invalid. NamespaceNotFound: If the namespace does not exist (404). AuthenticationError: If authentication fails (401/403). APIError: For other API errors. MoorchehError: For network issues. """ + params = _fetch_text_data_params(limit=limit, next_token=next_token) logger.info(f"Fetching text data from namespace '{namespace_name}'...") endpoint = f"/namespaces/{namespace_name}/documents/fetch-text-data" - response_data = self._client._request("GET", endpoint, expected_status=200) + response_data = self._client._request( + "GET", endpoint, params=params, expected_status=200 + ) if not isinstance(response_data, dict): logger.error("Fetch text data response was not a dictionary.") raise APIError( message="Unexpected response format from fetch text data endpoint." ) item_count = len(response_data.get("items", [])) + pagination = response_data.get("pagination") or {} + has_more = pagination.get("has_more", False) logger.info( - f"Fetched {item_count} text item(s) from namespace '{namespace_name}'." + f"Fetched {item_count} text item(s) from namespace '{namespace_name}'" + f" (has_more={has_more})." ) return cast(FetchTextDataResponse, response_data) @@ -849,29 +883,38 @@ async def get( return cast(DocumentGetResponse, response_data) @required_args(["namespace_name"], types={"namespace_name": str}) - async def fetch_text_data(self, namespace_name: str) -> FetchTextDataResponse: + async def fetch_text_data( + self, + namespace_name: str, + *, + limit: int | None = None, + next_token: str | None = None, + ) -> FetchTextDataResponse: """ - Lists stored text and summary chunks for a text namespace (up to 100 per request). + Lists stored text and summary chunks for a text namespace (cursor pagination). Async counterpart of :meth:`Documents.fetch_text_data`. Args: namespace_name: The name of the target text-based namespace. + limit: Maximum items per page (1–100). Omit for the API default (100). + next_token: Cursor from a previous response's ``pagination.next_token``. Returns: Same structure as :meth:`Documents.fetch_text_data` (snake_case keys). Raises: - InvalidInputError: If ``namespace_name`` is invalid. + InvalidInputError: If ``namespace_name``, ``limit``, or ``next_token`` is invalid. NamespaceNotFound: If the namespace does not exist (404). AuthenticationError: If authentication fails (401/403). APIError: For other API errors. MoorchehError: For network issues. """ + params = _fetch_text_data_params(limit=limit, next_token=next_token) logger.info(f"Fetching text data from namespace '{namespace_name}'...") endpoint = f"/namespaces/{namespace_name}/documents/fetch-text-data" response_data = await self._client._request( - "GET", endpoint, expected_status=200 + "GET", endpoint, params=params, expected_status=200 ) if not isinstance(response_data, dict): logger.error("Fetch text data response was not a dictionary.") @@ -879,8 +922,11 @@ async def fetch_text_data(self, namespace_name: str) -> FetchTextDataResponse: message="Unexpected response format from fetch text data endpoint." ) item_count = len(response_data.get("items", [])) + pagination = response_data.get("pagination") or {} + has_more = pagination.get("has_more", False) logger.info( - f"Fetched {item_count} text item(s) from namespace '{namespace_name}'." + f"Fetched {item_count} text item(s) from namespace '{namespace_name}'" + f" (has_more={has_more})." ) return cast(FetchTextDataResponse, response_data) diff --git a/moorcheh_sdk/types/__init__.py b/moorcheh_sdk/types/__init__.py index 44931a8..351cb5b 100644 --- a/moorcheh_sdk/types/__init__.py +++ b/moorcheh_sdk/types/__init__.py @@ -17,6 +17,7 @@ FileUploadResponse, ListFilesResponse, TextDataItem, + TextDataPagination, TextDataStatistics, ) from .namespace import Namespace, NamespaceCreateResponse, NamespaceListResponse @@ -47,6 +48,7 @@ "DocumentGetResponse", "FetchTextDataResponse", "TextDataItem", + "TextDataPagination", "TextDataStatistics", "FileDeleteResponse", "FileDeleteResult", diff --git a/moorcheh_sdk/types/document.py b/moorcheh_sdk/types/document.py index 28d9737..8287fcd 100644 --- a/moorcheh_sdk/types/document.py +++ b/moorcheh_sdk/types/document.py @@ -28,7 +28,7 @@ class TextDataItem(TypedDict, total=False): id: str text: str metadata: dict[str, Any] | None - created_at: int + created_at: str | None is_summary: bool @@ -36,11 +36,19 @@ class TextDataStatistics(TypedDict, total=False): total_items: int total_text_chunks: int total_summary_chunks: int - created_at_min: int - created_at_max: int + created_at_min: str | None + created_at_max: str | None source_counts: dict[str, int] +class TextDataPagination(TypedDict, total=False): + """Cursor pagination metadata from ``Documents.fetch_text_data``.""" + + limit: int + has_more: bool + next_token: str | None + + class FetchTextDataResponse(TypedDict, total=False): """Response from ``GET .../documents/fetch-text-data`` (keys normalized to snake_case).""" @@ -49,6 +57,7 @@ class FetchTextDataResponse(TypedDict, total=False): namespace: str statistics: TextDataStatistics items: list[TextDataItem] + pagination: TextDataPagination execution_time: float diff --git a/tests/resources/test_documents.py b/tests/resources/test_documents.py index cae8663..5b74ad0 100644 --- a/tests/resources/test_documents.py +++ b/tests/resources/test_documents.py @@ -199,10 +199,15 @@ def test_fetch_text_data_success(client, mocker, mock_response): "id": "chunk-1", "text": "Hello", "metadata": {"source": "a.txt"}, - "created_at": 1700000000000, + "created_at": "2025-12-19T18:18:57.700Z", "is_summary": False, } ], + "pagination": { + "limit": 100, + "has_more": False, + "next_token": None, + }, "execution_time": 0.05, } mock_resp = mock_response(200, json_data=expected_response) @@ -219,6 +224,50 @@ def test_fetch_text_data_success(client, mocker, mock_response): assert result == expected_response +def test_fetch_text_data_with_pagination_params(client, mocker, mock_response): + """Test fetch-text-data passes limit and next_token query params.""" + expected_response = { + "status": "success", + "message": "Fetched 1 text items.", + "namespace": TEST_NAMESPACE, + "statistics": {"total_items": 1}, + "items": [{"id": "chunk-2", "text": "World", "is_summary": False}], + "pagination": { + "limit": 50, + "has_more": True, + "next_token": "token-page-2", + }, + "execution_time": 0.02, + } + mock_resp = mock_response(200, json_data=expected_response) + client._mock_httpx_instance.request.return_value = mock_resp + + result = client.documents.fetch_text_data( + namespace_name=TEST_NAMESPACE, + limit=50, + next_token="token-page-1", + ) + + client._mock_httpx_instance.request.assert_called_once_with( + method="GET", + url=f"/namespaces/{TEST_NAMESPACE}/documents/fetch-text-data", + json=None, + params={"limit": 50, "next_token": "token-page-1"}, + ) + assert result["pagination"]["has_more"] is True + assert result["pagination"]["next_token"] == "token-page-2" + + +@pytest.mark.parametrize("invalid_limit", [0, 101, -1, 1.5]) +def test_fetch_text_data_invalid_limit(client, invalid_limit): + """Test client-side validation for limit.""" + with pytest.raises(InvalidInputError, match="limit"): + client.documents.fetch_text_data( + namespace_name=TEST_NAMESPACE, + limit=invalid_limit, + ) + + def test_fetch_text_data_namespace_not_found(client, mocker, mock_response): """Test fetch_text_data when namespace is missing.""" error_text = f"Namespace '{TEST_NAMESPACE}' not found." diff --git a/tests/test_async_client.py b/tests/test_async_client.py index 449d23a..cb10d2e 100644 --- a/tests/test_async_client.py +++ b/tests/test_async_client.py @@ -89,6 +89,7 @@ async def test_documents_fetch_text_data(client): "namespace": "test", "statistics": {"total_items": 0}, "items": [], + "pagination": {"limit": 100, "has_more": False, "next_token": None}, "execution_time": 0.01, } @@ -107,6 +108,32 @@ async def test_documents_fetch_text_data(client): assert kwargs["params"] is None +@pytest.mark.asyncio +async def test_documents_fetch_text_data_pagination(client): + mock_response = { + "status": "success", + "namespace": "test", + "items": [], + "pagination": {"limit": 25, "has_more": True, "next_token": "next"}, + } + + with patch.object(client, "request", new_callable=AsyncMock) as mock_request: + mock_request.return_value = MagicMock( + status_code=200, json=lambda: mock_response + ) + + await client.documents.fetch_text_data( + namespace_name="test", + limit=25, + next_token="prev", + ) + + assert mock_request.call_args.kwargs["params"] == { + "limit": 25, + "next_token": "prev", + } + + @pytest.mark.asyncio async def test_search_query(client): mock_response = {"results": [], "execution_time": 0.1}