diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py index 7bfb80c0ea5..d31a4680626 100644 --- a/ddtrace/llmobs/_experiment.py +++ b/ddtrace/llmobs/_experiment.py @@ -107,6 +107,7 @@ class Dataset: _id: str _records: List[DatasetRecord] _version: int + _latest_version: int _dne_client: "LLMObsExperimentsClient" _new_records_by_record_id: Dict[str, DatasetRecordRaw] _updated_record_ids_to_new_fields: Dict[str, UpdatableDatasetRecord] @@ -121,6 +122,7 @@ def __init__( dataset_id: str, records: List[DatasetRecord], description: str, + latest_version: int, version: int, _dne_client: "LLMObsExperimentsClient", ) -> None: @@ -128,6 +130,7 @@ def __init__( self.project = project self.description = description self._id = dataset_id + self._latest_version = latest_version self._version = version self._dne_client = _dne_client self._records = records @@ -168,7 +171,10 @@ def push(self) -> None: record["record_id"] = record_id # type: ignore # FIXME: we don't get version numbers in responses to deletion requests - self._version = new_version if new_version != -1 else self._version + 1 + self._latest_version = new_version if new_version != -1 else self._latest_version + 1 + # no matter what the version was before the push, pushing will result in the dataset being on the current + # version tracked by the backend + self._version = self._latest_version self._new_records_by_record_id = {} self._deleted_record_ids = [] self._updated_record_ids_to_new_fields = {} @@ -225,6 +231,14 @@ def url(self) -> str: # FIXME: will not work for subdomain orgs return f"{_get_base_url()}/llm/datasets/{self._id}" + @property + def latest_version(self) -> int: + return self._latest_version + + @property + def version(self) -> int: + return self._version + def _estimate_delta_size(self) -> int: """rough estimate (in bytes) of the size of the next batch update call if it happens""" size = len(safe_json(self._new_records_by_record_id)) + len(safe_json(self._updated_record_ids_to_new_fields)) @@ -434,6 +448,7 @@ def _run_task(self, jobs: int, raise_errors: bool = False, sample_size: Optional dataset_id=self._dataset._id, records=subset_records, description=self._dataset.description, + latest_version=self._dataset._latest_version, version=self._dataset._version, _dne_client=self._dataset._dne_client, ) diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 457ead64212..2a342e27889 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -669,8 +669,12 @@ def _on_asyncio_execute_task(self, task_data: Dict[str, Any]) -> None: self._llmobs_context_provider.activate(llmobs_ctx) @classmethod - def pull_dataset(cls, dataset_name: str, project_name: Optional[str] = None) -> Dataset: - ds = cls._instance._dne_client.dataset_get_with_records(dataset_name, (project_name or cls._project_name)) + def pull_dataset( + cls, dataset_name: str, project_name: Optional[str] = None, version: Optional[int] = None + ) -> Dataset: + ds = cls._instance._dne_client.dataset_get_with_records( + dataset_name, (project_name or cls._project_name), version + ) return ds @classmethod diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index e5f4cbcf333..1f70c0c3bae 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -11,6 +11,7 @@ from typing import TypedDict from typing import Union from typing import cast +import urllib from urllib.parse import quote from urllib.parse import urlparse @@ -400,7 +401,16 @@ def dataset_create( if dataset_id is None or dataset_id == "": raise ValueError(f"unexpected dataset state, invalid ID (is None: {dataset_id is None})") curr_version = response_data["data"]["attributes"]["current_version"] - return Dataset(dataset_name, project, dataset_id, [], description, curr_version, _dne_client=self) + return Dataset( + name=dataset_name, + project=project, + dataset_id=dataset_id, + records=[], + description=description, + latest_version=curr_version, + version=curr_version, + _dne_client=self, + ) @staticmethod def _get_record_json(record: Union[UpdatableDatasetRecord, DatasetRecordRaw], is_update: bool) -> JSONType: @@ -458,10 +468,14 @@ def dataset_batch_update( new_record_ids: List[str] = [r["id"] for r in data] if data else [] return new_version, new_record_ids - def dataset_get_with_records(self, dataset_name: str, project_name: Optional[str] = None) -> Dataset: + def dataset_get_with_records( + self, dataset_name: str, project_name: Optional[str] = None, version: Optional[int] = None + ) -> Dataset: project = self.project_create_or_get(project_name) project_id = project.get("_id") - logger.debug("getting records with project ID %s for %s", project_id, project_name) + logger.debug( + "getting records with project ID %s for %s, version: %s", project_id, project_name, str(version) or "latest" + ) path = f"/api/unstable/llm-obs/v1/{project_id}/datasets?filter[name]={quote(dataset_name)}" resp = self.request("GET", path) @@ -480,11 +494,17 @@ def dataset_get_with_records(self, dataset_name: str, project_name: Optional[str dataset_id = data[0]["id"] list_base_path = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" + has_next_page = True class_records: List[DatasetRecord] = [] - list_path = list_base_path page_num = 0 + url_options = {} while has_next_page: + if version: + url_options["filter[version]"] = version + + list_path = f"{list_base_path}?{urllib.parse.urlencode(url_options, safe='[]')}" + logger.debug("list records page %d, request path=%s", page_num, list_path) resp = self.request("GET", list_path, timeout=self.LIST_RECORDS_TIMEOUT) if resp.status != 200: raise ValueError( @@ -504,14 +524,22 @@ def dataset_get_with_records(self, dataset_name: str, project_name: Optional[str } ) next_cursor = records_data.get("meta", {}).get("after") + + url_options = {} has_next_page = False if next_cursor: has_next_page = True - list_path = f"{list_base_path}?page[cursor]={next_cursor}" - logger.debug("next list records request path %s", list_path) + url_options["page[cursor]"] = next_cursor page_num += 1 return Dataset( - dataset_name, project, dataset_id, class_records, dataset_description, curr_version, _dne_client=self + name=dataset_name, + project=project, + dataset_id=dataset_id, + records=class_records, + description=dataset_description, + latest_version=curr_version, + version=version or curr_version, + _dne_client=self, ) def dataset_bulk_upload(self, dataset_id: str, records: List[DatasetRecord]): diff --git a/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml b/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml new file mode 100644 index 00000000000..f1f32a86735 --- /dev/null +++ b/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml @@ -0,0 +1,8 @@ +--- +features: + - | + LLM Observability: Previous dataset versions can be optionally pulled by passing the ``version`` + argument to ``LLMObs.pull_dataset`` + - | + LLM Observability: Datasets have new properties ``version`` and ``latest_version`` to provide information on the + version of the dataset that is being worked with and the latest global version of the dataset, respectively \ No newline at end of file diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0bb93ae7-43c4-48ff-91e4-d2817fee85fe_batch_update_post_aa45718c.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0bb93ae7-43c4-48ff-91e4-d2817fee85fe_batch_update_post_aa45718c.yaml new file mode 100644 index 00000000000..b750ac7724c --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0bb93ae7-43c4-48ff-91e4-d2817fee85fe_batch_update_post_aa45718c.yaml @@ -0,0 +1,48 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "id": "0bb93ae7-43c4-48ff-91e4-d2817fee85fe", + "attributes": {"insert_records": [{"input": {"prompt": "What is the capital + of France?"}, "expected_output": {"answer": "Paris"}, "metadata": null}], "update_records": + [], "delete_records": []}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '271' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/0bb93ae7-43c4-48ff-91e4-d2817fee85fe/batch_update + response: + body: + string: '{"data":[{"id":"eaadecb4-836e-49b3-8390-212b3fffb60b","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:26:24.929416376Z","dataset_id":"0bb93ae7-43c4-48ff-91e4-d2817fee85fe","expected_output":{"answer":"Paris"},"input":{"prompt":"What + is the capital of France?"},"updated_at":"2025-10-21T18:26:24.929416376Z","version":1}}]}' + headers: + content-length: + - '389' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:26:24 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0bb93ae7-43c4-48ff-91e4-d2817fee85fe_records_filter_version__420_get_0060e684.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0bb93ae7-43c4-48ff-91e4-d2817fee85fe_records_filter_version__420_get_0060e684.yaml new file mode 100644 index 00000000000..3de4ae57834 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0bb93ae7-43c4-48ff-91e4-d2817fee85fe_records_filter_version__420_get_0060e684.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + ? !!python/object/apply:multidict._multidict.istr + - Content-Length + : - '0' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/0bb93ae7-43c4-48ff-91e4-d2817fee85fe/records?filter%5Bversion%5D=420 + response: + body: + string: '{"errors":[{"title":"Generic Error","detail":"invalid version: version + is greater than the current version or negative"}]}' + headers: + content-length: + - '122' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:26:27 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 400 + message: Bad Request +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_batch_update_post_0226e07c.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_batch_update_post_0226e07c.yaml new file mode 100644 index 00000000000..1db2779735a --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_batch_update_post_0226e07c.yaml @@ -0,0 +1,48 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "id": "4607e918-094d-4aa9-8b7a-50fa63a95b56", + "attributes": {"insert_records": [{"input": {"prompt": "What is the capital + of France?"}, "expected_output": {"answer": "Paris"}, "metadata": null}], "update_records": + [], "delete_records": []}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '271' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/4607e918-094d-4aa9-8b7a-50fa63a95b56/batch_update + response: + body: + string: '{"data":[{"id":"93328f7a-bfd2-4672-8b94-76b0698cd754","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:23.855004356Z","dataset_id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","expected_output":{"answer":"Paris"},"input":{"prompt":"What + is the capital of France?"},"updated_at":"2025-10-21T18:25:23.855004356Z","version":1}}]}' + headers: + content-length: + - '389' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:25:23 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_batch_update_post_b64aaeeb.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_batch_update_post_b64aaeeb.yaml new file mode 100644 index 00000000000..aa56782bb87 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_batch_update_post_b64aaeeb.yaml @@ -0,0 +1,48 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "id": "4607e918-094d-4aa9-8b7a-50fa63a95b56", + "attributes": {"insert_records": [{"input": {"prompt": "What is the capital + of China?"}, "expected_output": {"answer": "Beijing"}, "metadata": null}], "update_records": + [], "delete_records": []}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '272' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/4607e918-094d-4aa9-8b7a-50fa63a95b56/batch_update + response: + body: + string: '{"data":[{"id":"5bbd89ec-4eba-4f41-bd47-2a23a005a20a","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:26.024986597Z","dataset_id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","expected_output":{"answer":"Beijing"},"input":{"prompt":"What + is the capital of China?"},"updated_at":"2025-10-21T18:25:26.024986597Z","version":2}}]}' + headers: + content-length: + - '390' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:25:26 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_records_filter_version__1_get_bcd9fab6.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_records_filter_version__1_get_bcd9fab6.yaml new file mode 100644 index 00000000000..c903d760daf --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_records_filter_version__1_get_bcd9fab6.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + ? !!python/object/apply:multidict._multidict.istr + - Content-Length + : - '0' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/4607e918-094d-4aa9-8b7a-50fa63a95b56/records?filter%5Bversion%5D=1 + response: + body: + string: '{"data":[{"id":"93328f7a-bfd2-4672-8b94-76b0698cd754","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:23.855004Z","dataset_id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","expected_output":{"answer":"Paris"},"input":{"prompt":"What + is the capital of France?"},"updated_at":"2025-10-21T18:25:23.855004Z"}}],"meta":{"after":""}}' + headers: + content-length: + - '391' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:25:34 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_records_get_eea1d61b.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_records_get_eea1d61b.yaml new file mode 100644 index 00000000000..61ee8d32222 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_records_get_eea1d61b.yaml @@ -0,0 +1,47 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + ? !!python/object/apply:multidict._multidict.istr + - Content-Length + : - '0' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/4607e918-094d-4aa9-8b7a-50fa63a95b56/records + response: + body: + string: '{"data":[{"id":"5bbd89ec-4eba-4f41-bd47-2a23a005a20a","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:26.024986Z","dataset_id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","expected_output":{"answer":"Beijing"},"input":{"prompt":"What + is the capital of China?"},"updated_at":"2025-10-21T18:25:26.024986Z"}},{"id":"93328f7a-bfd2-4672-8b94-76b0698cd754","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:23.855004Z","dataset_id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","expected_output":{"answer":"Paris"},"input":{"prompt":"What + is the capital of France?"},"updated_at":"2025-10-21T18:25:23.855004Z"}}],"meta":{"after":""}}' + headers: + content-length: + - '753' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:25:32 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_3f1a88cb.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_3f1a88cb.yaml new file mode 100644 index 00000000000..efd2579c8d3 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_3f1a88cb.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "attributes": {"type": "soft", "dataset_ids": + ["4607e918-094d-4aa9-8b7a-50fa63a95b56"]}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '119' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/delete + response: + body: + string: '{"data":[{"id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:23.688284Z","current_version":2,"deleted_at":"2025-10-21T18:25:34.800676Z","description":"A + test dataset","name":"test-dataset-test_dataset_pull_w_versions[test_dataset_records0]","project_id":"f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9","updated_at":"2025-10-21T18:25:26.033352Z"}}]}' + headers: + content-length: + - '450' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:25:34 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_516a93fa.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_516a93fa.yaml new file mode 100644 index 00000000000..42896697f82 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_516a93fa.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "attributes": {"type": "soft", "dataset_ids": + ["0bb93ae7-43c4-48ff-91e4-d2817fee85fe"]}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '119' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/delete + response: + body: + string: '{"data":[{"id":"0bb93ae7-43c4-48ff-91e4-d2817fee85fe","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:26:24.806202Z","current_version":1,"deleted_at":"2025-10-21T18:26:27.521296Z","description":"A + test dataset","name":"test-dataset-test_dataset_pull_w_invalid_version[test_dataset_records0]","project_id":"f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9","updated_at":"2025-10-21T18:26:24.946573Z"}}]}' + headers: + content-length: + - '457' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:26:27 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_filter_name__test-dataset-test_dataset_pull_w_invalid_version_test_dataset_records0__get_12b16625.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_filter_name__test-dataset-test_dataset_pull_w_invalid_version_test_dataset_records0__get_12b16625.yaml new file mode 100644 index 00000000000..bcc667c8752 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_filter_name__test-dataset-test_dataset_pull_w_invalid_version_test_dataset_records0__get_12b16625.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + ? !!python/object/apply:multidict._multidict.istr + - Content-Length + : - '0' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9/datasets?filter%5Bname%5D=test-dataset-test_dataset_pull_w_invalid_version%5Btest_dataset_records0%5D + response: + body: + string: '{"data":[{"id":"0bb93ae7-43c4-48ff-91e4-d2817fee85fe","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:26:24.806202Z","current_version":1,"description":"A + test dataset","name":"test-dataset-test_dataset_pull_w_invalid_version[test_dataset_records0]","project_id":"f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9","updated_at":"2025-10-21T18:26:24.946573Z"}}],"meta":{"after":""}}' + headers: + content-length: + - '434' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:26:27 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_filter_name__test-dataset-test_dataset_pull_w_versions_test_dataset_records0__get_5cf2366a.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_filter_name__test-dataset-test_dataset_pull_w_versions_test_dataset_records0__get_5cf2366a.yaml new file mode 100644 index 00000000000..a8107245785 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_filter_name__test-dataset-test_dataset_pull_w_versions_test_dataset_records0__get_5cf2366a.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + ? !!python/object/apply:multidict._multidict.istr + - Content-Length + : - '0' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9/datasets?filter%5Bname%5D=test-dataset-test_dataset_pull_w_versions%5Btest_dataset_records0%5D + response: + body: + string: '{"data":[{"id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:23.688284Z","current_version":2,"description":"A + test dataset","name":"test-dataset-test_dataset_pull_w_versions[test_dataset_records0]","project_id":"f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9","updated_at":"2025-10-21T18:25:26.033352Z"}}],"meta":{"after":""}}' + headers: + content-length: + - '427' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:25:31 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_post_5a9c5f1b.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_post_5a9c5f1b.yaml new file mode 100644 index 00000000000..3b03000d203 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_post_5a9c5f1b.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "attributes": {"name": "test-dataset-test_dataset_pull_w_versions[test_dataset_records0]", + "description": "A test dataset"}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '155' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9/datasets + response: + body: + string: '{"data":{"id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:23.688284458Z","current_version":0,"description":"A + test dataset","name":"test-dataset-test_dataset_pull_w_versions[test_dataset_records0]","project_id":"f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9","updated_at":"2025-10-21T18:25:23.688284458Z"}}}' + headers: + content-length: + - '411' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:25:23 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_post_f3e0da1c.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_post_f3e0da1c.yaml new file mode 100644 index 00000000000..17ca6f8adec --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_post_f3e0da1c.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "attributes": {"name": "test-dataset-test_dataset_pull_w_invalid_version[test_dataset_records0]", + "description": "A test dataset"}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '162' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9/datasets + response: + body: + string: '{"data":{"id":"0bb93ae7-43c4-48ff-91e4-d2817fee85fe","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:26:24.80620202Z","current_version":0,"description":"A + test dataset","name":"test-dataset-test_dataset_pull_w_invalid_version[test_dataset_records0]","project_id":"f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9","updated_at":"2025-10-21T18:26:24.80620202Z"}}}' + headers: + content-length: + - '416' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:26:24 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py index dd36225d589..cf4e44905f4 100644 --- a/tests/llmobs/test_experiments.py +++ b/tests/llmobs/test_experiments.py @@ -300,7 +300,8 @@ def test_dataset_csv_no_expected_output(llmobs, tmp_csv_file_for_upload): assert len(ds) == len(dataset) assert ds.name == dataset.name assert ds.description == dataset.description - assert ds._version == 1 + assert ds.latest_version == 1 + assert ds.latest_version == ds.version finally: if dataset_id: llmobs._delete_dataset(dataset_id=dataset_id) @@ -347,7 +348,8 @@ def test_dataset_csv(llmobs, tmp_csv_file_for_upload): assert len(ds) == len(dataset) assert ds.name == dataset.name assert ds.description == dataset.description - assert ds._version == 1 + assert ds.latest_version == 1 + assert ds.latest_version == ds.version finally: if dataset_id: llmobs._delete_dataset(dataset_id=dataset_id) @@ -400,7 +402,8 @@ def test_dataset_csv_pipe_separated(llmobs, tmp_csv_file_for_upload): assert len(ds) == len(dataset) assert ds.name == dataset.name assert ds.description == dataset.description - assert ds._version == 1 + assert ds.latest_version == 1 + assert ds.latest_version == ds.version finally: if dataset_id: llmobs._delete_dataset(dataset_id=dataset._id) @@ -423,7 +426,8 @@ def test_dataset_pull_large_num_records(llmobs, test_dataset_large_num_records): assert len(pds) == len(test_dataset_large_num_records) assert pds.name == test_dataset_large_num_records.name assert pds.description == test_dataset_large_num_records.description - assert pds._version == test_dataset_large_num_records._version == 1 + assert pds.latest_version == test_dataset_large_num_records.latest_version == 1 + assert pds.version == test_dataset_large_num_records.version == 1 dataset = sorted(pds, key=lambda r: int(r["input_data"].lstrip("input_"))) for i, d in enumerate(dataset): @@ -450,7 +454,57 @@ def test_dataset_pull_exists_with_record(llmobs, test_dataset_one_record): assert dataset[0]["expected_output"] == {"answer": "Paris"} assert dataset.name == test_dataset_one_record.name assert dataset.description == test_dataset_one_record.description - assert dataset._version == test_dataset_one_record._version == 1 + assert dataset.latest_version == test_dataset_one_record.latest_version == 1 + assert dataset.version == test_dataset_one_record.version == 1 + + +@pytest.mark.parametrize( + "test_dataset_records", + [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], +) +def test_dataset_pull_w_versions(llmobs, test_dataset, test_dataset_records): + assert len(test_dataset) == 1 + assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of France?"} + assert test_dataset[0]["expected_output"] == {"answer": "Paris"} + assert test_dataset.latest_version == 1 + assert test_dataset.version == 1 + + test_dataset.append( + {"input_data": {"prompt": "What is the capital of China?"}, "expected_output": {"answer": "Beijing"}} + ) + test_dataset.push() + wait_for_backend(4) + + dataset_v2 = llmobs.pull_dataset(dataset_name=test_dataset.name) + assert len(dataset_v2) == 2 + assert dataset_v2[1]["input_data"] == {"prompt": "What is the capital of France?"} + assert dataset_v2[1]["expected_output"] == {"answer": "Paris"} + assert dataset_v2[0]["input_data"] == {"prompt": "What is the capital of China?"} + assert dataset_v2[0]["expected_output"] == {"answer": "Beijing"} + assert dataset_v2.name == test_dataset.name + assert dataset_v2.description == test_dataset.description + assert dataset_v2.latest_version == test_dataset.latest_version == 2 + assert dataset_v2.version == test_dataset.version == 2 + + dataset_v1 = llmobs.pull_dataset(dataset_name=test_dataset.name, version=1) + assert len(dataset_v1) == 1 + assert dataset_v1[0]["input_data"] == {"prompt": "What is the capital of France?"} + assert dataset_v1[0]["expected_output"] == {"answer": "Paris"} + assert dataset_v1.name == test_dataset.name + assert dataset_v1.description == test_dataset.description + assert dataset_v1.latest_version == test_dataset.latest_version == 2 + assert dataset_v1.version == 1 + + +@pytest.mark.parametrize( + "test_dataset_records", + [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], +) +def test_dataset_pull_w_invalid_version(llmobs, test_dataset, test_dataset_records): + with pytest.raises( + ValueError, match="Failed to pull dataset records for.*version is greater than the current version or negative" + ): + llmobs.pull_dataset(dataset_name=test_dataset.name, version=420) def test_dataset_pull_from_project(llmobs, test_dataset_one_record_separate_project): @@ -464,7 +518,8 @@ def test_dataset_pull_from_project(llmobs, test_dataset_one_record_separate_proj assert dataset[0]["expected_output"] == {"answer": "Boston"} assert dataset.name == test_dataset_one_record_separate_project.name assert dataset.description == test_dataset_one_record_separate_project.description - assert dataset._version == test_dataset_one_record_separate_project._version == 1 + assert dataset.latest_version == test_dataset_one_record_separate_project.latest_version == 1 + assert dataset.version == test_dataset_one_record_separate_project.version == 1 @pytest.mark.parametrize( @@ -479,7 +534,8 @@ def test_dataset_pull_from_project(llmobs, test_dataset_one_record_separate_proj ], ) def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_dataset_records): - assert test_dataset._version == 1 + assert test_dataset.latest_version == 1 + assert test_dataset.version == 1 test_dataset.update( 0, @@ -518,7 +574,8 @@ def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_datase test_dataset.push() assert len(test_dataset) == 2 - assert test_dataset._version == 2 + assert test_dataset.latest_version == 2 + assert test_dataset.version == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Germany?"} assert test_dataset[0]["expected_output"] == {"answer": "Berlin"} @@ -548,7 +605,8 @@ def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_datase assert len(ds) == 2 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds._version == 2 + assert ds.latest_version == 2 + assert ds.version == 2 @pytest.mark.parametrize( @@ -556,7 +614,8 @@ def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_datase [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], ) def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records): - assert test_dataset._version == 1 + assert test_dataset.latest_version == 1 + assert test_dataset.version == 1 test_dataset.update( 0, @@ -568,7 +627,8 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records test_dataset.push() assert len(test_dataset) == 1 - assert test_dataset._version == 2 + assert test_dataset.latest_version == 2 + assert test_dataset.version == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Germany?"} assert test_dataset[0]["expected_output"] == {"answer": "Berlin"} @@ -585,7 +645,8 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records assert len(ds) == 1 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds._version == 2 + assert ds.latest_version == 2 + assert ds.version == 2 @pytest.mark.parametrize( @@ -593,7 +654,8 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], ) def test_dataset_modify_single_record_empty_record(llmobs, test_dataset, test_dataset_records): - assert test_dataset._version == 1 + assert test_dataset.latest_version == 1 + assert test_dataset.version == 1 with pytest.raises( ValueError, @@ -615,7 +677,8 @@ def test_dataset_estimate_size(llmobs, test_dataset): [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], ) def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_records): - assert test_dataset._version == 1 + assert test_dataset.latest_version == 1 + assert test_dataset.version == 1 test_dataset.update(0, {"expected_output": None}) assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of France?"} @@ -624,7 +687,8 @@ def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_re test_dataset.push() assert len(test_dataset) == 1 - assert test_dataset._version == 2 + assert test_dataset.latest_version == 2 + assert test_dataset.version == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of France?"} assert test_dataset[0]["expected_output"] is None @@ -641,7 +705,8 @@ def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_re assert len(ds) == 1 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds._version == 2 + assert ds.latest_version == 2 + assert ds.version == 2 @pytest.mark.parametrize( @@ -657,7 +722,8 @@ def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_re ], ) def test_dataset_modify_record_on_input(llmobs, test_dataset, test_dataset_records): - assert test_dataset._version == 1 + assert test_dataset.latest_version == 1 + assert test_dataset.version == 1 test_dataset.update(0, {"input_data": "A"}) assert test_dataset[0]["input_data"] == "A" assert test_dataset[0]["expected_output"] == {"answer": "Paris"} @@ -665,7 +731,8 @@ def test_dataset_modify_record_on_input(llmobs, test_dataset, test_dataset_recor test_dataset.push() assert len(test_dataset) == 1 - assert test_dataset._version == 2 + assert test_dataset.latest_version == 2 + assert test_dataset.version == 2 assert test_dataset[0]["input_data"] == "A" assert test_dataset[0]["expected_output"] == {"answer": "Paris"} @@ -684,7 +751,8 @@ def test_dataset_modify_record_on_input(llmobs, test_dataset, test_dataset_recor assert len(ds) == 1 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds._version == 2 + assert ds.latest_version == 2 + assert ds.version == 2 @pytest.mark.parametrize( @@ -696,7 +764,8 @@ def test_dataset_append(llmobs, test_dataset): DatasetRecord(input_data={"prompt": "What is the capital of Italy?"}, expected_output={"answer": "Rome"}) ) assert len(test_dataset) == 2 - assert test_dataset._version == 1 + assert test_dataset.latest_version == 1 + assert test_dataset.version == 1 wait_for_backend() test_dataset.push() @@ -707,12 +776,14 @@ def test_dataset_append(llmobs, test_dataset): assert test_dataset[1]["expected_output"] == {"answer": "Rome"} assert test_dataset.name == test_dataset.name assert test_dataset.description == test_dataset.description - assert test_dataset._version == 2 + assert test_dataset.latest_version == 2 + assert test_dataset.version == 2 # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._version == 2 + assert ds.latest_version == 2 + assert ds.version == 2 assert len(ds) == 2 # note: it looks like dataset order is not deterministic assert ds[1]["input_data"] == {"prompt": "What is the capital of France?"} @@ -735,7 +806,8 @@ def test_dataset_extend(llmobs, test_dataset): ] ) assert len(test_dataset) == 3 - assert test_dataset._version == 1 + assert test_dataset.latest_version == 1 + assert test_dataset.version == 1 wait_for_backend() test_dataset.push() @@ -748,12 +820,14 @@ def test_dataset_extend(llmobs, test_dataset): assert test_dataset[2]["expected_output"] == {"answer": "Stockholm"} assert test_dataset.name == test_dataset.name assert test_dataset.description == test_dataset.description - assert test_dataset._version == 2 + assert test_dataset.latest_version == 2 + assert test_dataset.version == 2 # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._version == 2 + assert ds.latest_version == 2 + assert ds.version == 2 assert len(ds) == 3 assert ds[2]["input_data"] == {"prompt": "What is the capital of France?"} # order is non deterministic @@ -770,7 +844,8 @@ def test_dataset_extend(llmobs, test_dataset): def test_dataset_append_no_expected_output(llmobs, test_dataset): test_dataset.append(DatasetRecord(input_data={"prompt": "What is the capital of Sealand?"})) assert len(test_dataset) == 2 - assert test_dataset._version == 1 + assert test_dataset.latest_version == 1 + assert test_dataset.version == 1 wait_for_backend() test_dataset.push() @@ -781,12 +856,14 @@ def test_dataset_append_no_expected_output(llmobs, test_dataset): assert "expected_output" not in test_dataset[1] assert test_dataset.name == test_dataset.name assert test_dataset.description == test_dataset.description - assert test_dataset._version == 2 + assert test_dataset.latest_version == 2 + assert test_dataset.version == 2 # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._version == 2 + assert ds.latest_version == 2 + assert ds.version == 2 assert len(ds) == 2 # note: it looks like dataset order is not deterministic assert ds[1]["input_data"] == {"prompt": "What is the capital of France?"} @@ -809,11 +886,13 @@ def test_dataset_append_no_expected_output(llmobs, test_dataset): def test_dataset_delete(llmobs, test_dataset): test_dataset.delete(0) assert len(test_dataset) == 1 - assert test_dataset._version == 1 + assert test_dataset.latest_version == 1 + assert test_dataset.version == 1 wait_for_backend() test_dataset.push() - assert test_dataset._version == 2 + assert test_dataset.latest_version == 2 + assert test_dataset.version == 2 assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert test_dataset[0]["expected_output"] == {"answer": "Rome"} @@ -823,7 +902,8 @@ def test_dataset_delete(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._version == 2 + assert ds.latest_version == 2 + assert ds.version == 2 assert len(ds) == 1 assert ds[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert ds[0]["expected_output"] == {"answer": "Rome"} @@ -841,11 +921,13 @@ def test_dataset_delete(llmobs, test_dataset): def test_dataset_delete_no_expected_output(llmobs, test_dataset): test_dataset.delete(1) assert len(test_dataset) == 1 - assert test_dataset._version == 1 + assert test_dataset.latest_version == 1 + assert test_dataset.version == 1 wait_for_backend() test_dataset.push() - assert test_dataset._version == 2 + assert test_dataset.latest_version == 2 + assert test_dataset.version == 2 assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Nauru?"} assert "expected_output" not in test_dataset[0] @@ -855,7 +937,8 @@ def test_dataset_delete_no_expected_output(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._version == 2 + assert ds.latest_version == 2 + assert ds.version == 2 assert len(ds) == 1 assert ds[0]["input_data"] == {"prompt": "What is the capital of Nauru?"} assert ds[0]["expected_output"] is None @@ -879,11 +962,13 @@ def test_dataset_delete_after_update(llmobs, test_dataset): test_dataset.delete(0) assert len(test_dataset) == 1 - assert test_dataset._version == 1 + assert test_dataset.latest_version == 1 + assert test_dataset.version == 1 wait_for_backend() test_dataset.push() - assert test_dataset._version == 2 + assert test_dataset.latest_version == 2 + assert test_dataset.version == 2 assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert test_dataset[0]["expected_output"] == {"answer": "Rome"} @@ -893,7 +978,8 @@ def test_dataset_delete_after_update(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._version == 2 + assert ds.latest_version == 2 + assert ds.version == 2 assert len(ds) == 1 assert ds[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert ds[0]["expected_output"] == {"answer": "Rome"} @@ -918,14 +1004,16 @@ def test_dataset_delete_after_append(llmobs, test_dataset): test_dataset.delete(0) # all that remains should be Italy and Sweden questions assert len(test_dataset) == 2 - assert test_dataset._version == 1 + assert test_dataset.latest_version == 1 + assert test_dataset.version == 1 assert len(test_dataset._new_records_by_record_id) == 1 assert len(test_dataset._deleted_record_ids) == 1 wait_for_backend() test_dataset.push() - assert test_dataset._version == 2 + assert test_dataset.latest_version == 2 + assert test_dataset.version == 2 assert len(test_dataset) == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert test_dataset[0]["expected_output"] == {"answer": "Rome"} @@ -937,7 +1025,8 @@ def test_dataset_delete_after_append(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._version == 2 + assert ds.latest_version == 2 + assert ds.version == 2 assert len(ds) == 2 sds = sorted(ds, key=lambda r: r["input_data"]["prompt"]) assert sds[0]["input_data"] == {"prompt": "What is the capital of Italy?"} @@ -1106,7 +1195,7 @@ def test_experiment_create(llmobs, test_dataset_one_record): project = llmobs._instance._dne_client.project_create_or_get("test-project") project_id = project.get("_id") exp_id, exp_run_name = llmobs._instance._dne_client.experiment_create( - exp.name, exp._dataset._id, project_id, exp._dataset._version, exp._config + exp.name, exp._dataset._id, project_id, exp._dataset.latest_version, exp._config ) assert exp_id is not None assert exp_run_name.startswith("test_experiment")