From 078a9c7c6c0236a4c60f94ef10d4b5fbc05009d8 Mon Sep 17 00:00:00 2001 From: gary-huang Date: Thu, 16 Oct 2025 15:37:31 -0400 Subject: [PATCH 1/7] allow versioned dataset pull --- ddtrace/llmobs/_llmobs.py | 4 ++-- ddtrace/llmobs/_writer.py | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 457ead64212..55275c7c620 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -669,8 +669,8 @@ def _on_asyncio_execute_task(self, task_data: Dict[str, Any]) -> None: self._llmobs_context_provider.activate(llmobs_ctx) @classmethod - def pull_dataset(cls, dataset_name: str, project_name: Optional[str] = None) -> Dataset: - ds = cls._instance._dne_client.dataset_get_with_records(dataset_name, (project_name or cls._project_name)) + def pull_dataset(cls, dataset_name: str, project_name: Optional[str] = None, version: Optional[int] = None) -> Dataset: + ds = cls._instance._dne_client.dataset_get_with_records(dataset_name, (project_name or cls._project_name), version) return ds @classmethod diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index e5f4cbcf333..48b767ac85a 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -458,10 +458,10 @@ def dataset_batch_update( new_record_ids: List[str] = [r["id"] for r in data] if data else [] return new_version, new_record_ids - def dataset_get_with_records(self, dataset_name: str, project_name: Optional[str] = None) -> Dataset: + def dataset_get_with_records(self, dataset_name: str, project_name: Optional[str] = None, version: Optional[int] = None) -> Dataset: project = self.project_create_or_get(project_name) project_id = project.get("_id") - logger.debug("getting records with project ID %s for %s", project_id, project_name) + logger.debug("getting records with project ID %s for %s, version: %s", project_id, project_name, str(version) or "latest") path = f"/api/unstable/llm-obs/v1/{project_id}/datasets?filter[name]={quote(dataset_name)}" resp = self.request("GET", path) @@ -480,6 +480,9 @@ def dataset_get_with_records(self, dataset_name: str, project_name: Optional[str dataset_id = data[0]["id"] list_base_path = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" + if version: + list_base_path = f"{list_base_path}?filter[version]={version}" + has_next_page = True class_records: List[DatasetRecord] = [] list_path = list_base_path @@ -507,7 +510,7 @@ def dataset_get_with_records(self, dataset_name: str, project_name: Optional[str has_next_page = False if next_cursor: has_next_page = True - list_path = f"{list_base_path}?page[cursor]={next_cursor}" + list_path = f"{list_base_path}{"&" if version else "?"}page[cursor]={next_cursor}" logger.debug("next list records request path %s", list_path) page_num += 1 return Dataset( From c566d4372e90d21a248e2d9c930191a26fc5eb35 Mon Sep 17 00:00:00 2001 From: gary-huang Date: Fri, 17 Oct 2025 16:27:12 -0400 Subject: [PATCH 2/7] add current version and version as properties --- ddtrace/llmobs/_experiment.py | 17 ++++++- ddtrace/llmobs/_writer.py | 4 +- tests/llmobs/test_experiments.py | 82 ++++++++++++++++---------------- 3 files changed, 59 insertions(+), 44 deletions(-) diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py index 7bfb80c0ea5..2e0932c7ab4 100644 --- a/ddtrace/llmobs/_experiment.py +++ b/ddtrace/llmobs/_experiment.py @@ -107,6 +107,7 @@ class Dataset: _id: str _records: List[DatasetRecord] _version: int + _current_version: int _dne_client: 
"LLMObsExperimentsClient" _new_records_by_record_id: Dict[str, DatasetRecordRaw] _updated_record_ids_to_new_fields: Dict[str, UpdatableDatasetRecord] @@ -121,6 +122,7 @@ def __init__( dataset_id: str, records: List[DatasetRecord], description: str, + current_version: int, version: int, _dne_client: "LLMObsExperimentsClient", ) -> None: @@ -128,6 +130,7 @@ def __init__( self.project = project self.description = description self._id = dataset_id + self._current_version = current_version self._version = version self._dne_client = _dne_client self._records = records @@ -168,7 +171,10 @@ def push(self) -> None: record["record_id"] = record_id # type: ignore # FIXME: we don't get version numbers in responses to deletion requests - self._version = new_version if new_version != -1 else self._version + 1 + self._current_version = new_version if new_version != -1 else self._current_version + 1 + # no matter what the version was before the push, pushing will result in the dataset being on the current + # version tracked by the backend + self._version = self._current_version self._new_records_by_record_id = {} self._deleted_record_ids = [] self._updated_record_ids_to_new_fields = {} @@ -225,6 +231,14 @@ def url(self) -> str: # FIXME: will not work for subdomain orgs return f"{_get_base_url()}/llm/datasets/{self._id}" + @property + def current_version(self) -> int: + return self._current_version + + @property + def version(self) -> int: + return self._version + def _estimate_delta_size(self) -> int: """rough estimate (in bytes) of the size of the next batch update call if it happens""" size = len(safe_json(self._new_records_by_record_id)) + len(safe_json(self._updated_record_ids_to_new_fields)) @@ -434,6 +448,7 @@ def _run_task(self, jobs: int, raise_errors: bool = False, sample_size: Optional dataset_id=self._dataset._id, records=subset_records, description=self._dataset.description, + current_version=self._dataset._current_version, version=self._dataset._version, _dne_client=self._dataset._dne_client, ) diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index 48b767ac85a..cedea1a1c69 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -400,7 +400,7 @@ def dataset_create( if dataset_id is None or dataset_id == "": raise ValueError(f"unexpected dataset state, invalid ID (is None: {dataset_id is None})") curr_version = response_data["data"]["attributes"]["current_version"] - return Dataset(dataset_name, project, dataset_id, [], description, curr_version, _dne_client=self) + return Dataset(dataset_name, project, dataset_id, [], description, curr_version, curr_version, _dne_client=self) @staticmethod def _get_record_json(record: Union[UpdatableDatasetRecord, DatasetRecordRaw], is_update: bool) -> JSONType: @@ -514,7 +514,7 @@ def dataset_get_with_records(self, dataset_name: str, project_name: Optional[str logger.debug("next list records request path %s", list_path) page_num += 1 return Dataset( - dataset_name, project, dataset_id, class_records, dataset_description, curr_version, _dne_client=self + dataset_name, project, dataset_id, class_records, dataset_description, curr_version, version or curr_version, _dne_client=self ) def dataset_bulk_upload(self, dataset_id: str, records: List[DatasetRecord]): diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py index dd36225d589..76a3898a657 100644 --- a/tests/llmobs/test_experiments.py +++ b/tests/llmobs/test_experiments.py @@ -300,7 +300,7 @@ def test_dataset_csv_no_expected_output(llmobs, 
tmp_csv_file_for_upload): assert len(ds) == len(dataset) assert ds.name == dataset.name assert ds.description == dataset.description - assert ds._version == 1 + assert ds._current_version == 1 finally: if dataset_id: llmobs._delete_dataset(dataset_id=dataset_id) @@ -347,7 +347,7 @@ def test_dataset_csv(llmobs, tmp_csv_file_for_upload): assert len(ds) == len(dataset) assert ds.name == dataset.name assert ds.description == dataset.description - assert ds._version == 1 + assert ds._current_version == 1 finally: if dataset_id: llmobs._delete_dataset(dataset_id=dataset_id) @@ -400,7 +400,7 @@ def test_dataset_csv_pipe_separated(llmobs, tmp_csv_file_for_upload): assert len(ds) == len(dataset) assert ds.name == dataset.name assert ds.description == dataset.description - assert ds._version == 1 + assert ds._current_version == 1 finally: if dataset_id: llmobs._delete_dataset(dataset_id=dataset._id) @@ -423,7 +423,7 @@ def test_dataset_pull_large_num_records(llmobs, test_dataset_large_num_records): assert len(pds) == len(test_dataset_large_num_records) assert pds.name == test_dataset_large_num_records.name assert pds.description == test_dataset_large_num_records.description - assert pds._version == test_dataset_large_num_records._version == 1 + assert pds._current_version == test_dataset_large_num_records._current_version == 1 dataset = sorted(pds, key=lambda r: int(r["input_data"].lstrip("input_"))) for i, d in enumerate(dataset): @@ -450,7 +450,7 @@ def test_dataset_pull_exists_with_record(llmobs, test_dataset_one_record): assert dataset[0]["expected_output"] == {"answer": "Paris"} assert dataset.name == test_dataset_one_record.name assert dataset.description == test_dataset_one_record.description - assert dataset._version == test_dataset_one_record._version == 1 + assert dataset._current_version == test_dataset_one_record._current_version == 1 def test_dataset_pull_from_project(llmobs, test_dataset_one_record_separate_project): @@ -464,7 +464,7 @@ def test_dataset_pull_from_project(llmobs, test_dataset_one_record_separate_proj assert dataset[0]["expected_output"] == {"answer": "Boston"} assert dataset.name == test_dataset_one_record_separate_project.name assert dataset.description == test_dataset_one_record_separate_project.description - assert dataset._version == test_dataset_one_record_separate_project._version == 1 + assert dataset._current_version == test_dataset_one_record_separate_project._current_version == 1 @pytest.mark.parametrize( @@ -479,7 +479,7 @@ def test_dataset_pull_from_project(llmobs, test_dataset_one_record_separate_proj ], ) def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_dataset_records): - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 test_dataset.update( 0, @@ -518,7 +518,7 @@ def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_datase test_dataset.push() assert len(test_dataset) == 2 - assert test_dataset._version == 2 + assert test_dataset._current_version == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Germany?"} assert test_dataset[0]["expected_output"] == {"answer": "Berlin"} @@ -548,7 +548,7 @@ def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_datase assert len(ds) == 2 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds._version == 2 + assert ds._current_version == 2 @pytest.mark.parametrize( @@ -556,7 +556,7 @@ def test_dataset_modify_records_multiple_times(llmobs, test_dataset, 
test_datase [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], ) def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records): - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 test_dataset.update( 0, @@ -568,7 +568,7 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records test_dataset.push() assert len(test_dataset) == 1 - assert test_dataset._version == 2 + assert test_dataset._current_version == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Germany?"} assert test_dataset[0]["expected_output"] == {"answer": "Berlin"} @@ -585,7 +585,7 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records assert len(ds) == 1 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds._version == 2 + assert ds._current_version == 2 @pytest.mark.parametrize( @@ -593,7 +593,7 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], ) def test_dataset_modify_single_record_empty_record(llmobs, test_dataset, test_dataset_records): - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 with pytest.raises( ValueError, @@ -615,7 +615,7 @@ def test_dataset_estimate_size(llmobs, test_dataset): [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], ) def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_records): - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 test_dataset.update(0, {"expected_output": None}) assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of France?"} @@ -624,7 +624,7 @@ def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_re test_dataset.push() assert len(test_dataset) == 1 - assert test_dataset._version == 2 + assert test_dataset._current_version == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of France?"} assert test_dataset[0]["expected_output"] is None @@ -641,7 +641,7 @@ def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_re assert len(ds) == 1 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds._version == 2 + assert ds._current_version == 2 @pytest.mark.parametrize( @@ -657,7 +657,7 @@ def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_re ], ) def test_dataset_modify_record_on_input(llmobs, test_dataset, test_dataset_records): - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 test_dataset.update(0, {"input_data": "A"}) assert test_dataset[0]["input_data"] == "A" assert test_dataset[0]["expected_output"] == {"answer": "Paris"} @@ -665,7 +665,7 @@ def test_dataset_modify_record_on_input(llmobs, test_dataset, test_dataset_recor test_dataset.push() assert len(test_dataset) == 1 - assert test_dataset._version == 2 + assert test_dataset._current_version == 2 assert test_dataset[0]["input_data"] == "A" assert test_dataset[0]["expected_output"] == {"answer": "Paris"} @@ -684,7 +684,7 @@ def test_dataset_modify_record_on_input(llmobs, test_dataset, test_dataset_recor assert len(ds) == 1 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert 
ds._version == 2 + assert ds._current_version == 2 @pytest.mark.parametrize( @@ -696,7 +696,7 @@ def test_dataset_append(llmobs, test_dataset): DatasetRecord(input_data={"prompt": "What is the capital of Italy?"}, expected_output={"answer": "Rome"}) ) assert len(test_dataset) == 2 - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 wait_for_backend() test_dataset.push() @@ -707,12 +707,12 @@ def test_dataset_append(llmobs, test_dataset): assert test_dataset[1]["expected_output"] == {"answer": "Rome"} assert test_dataset.name == test_dataset.name assert test_dataset.description == test_dataset.description - assert test_dataset._version == 2 + assert test_dataset._current_version == 2 # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._version == 2 + assert ds._current_version == 2 assert len(ds) == 2 # note: it looks like dataset order is not deterministic assert ds[1]["input_data"] == {"prompt": "What is the capital of France?"} @@ -735,7 +735,7 @@ def test_dataset_extend(llmobs, test_dataset): ] ) assert len(test_dataset) == 3 - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 wait_for_backend() test_dataset.push() @@ -748,12 +748,12 @@ def test_dataset_extend(llmobs, test_dataset): assert test_dataset[2]["expected_output"] == {"answer": "Stockholm"} assert test_dataset.name == test_dataset.name assert test_dataset.description == test_dataset.description - assert test_dataset._version == 2 + assert test_dataset._current_version == 2 # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._version == 2 + assert ds._current_version == 2 assert len(ds) == 3 assert ds[2]["input_data"] == {"prompt": "What is the capital of France?"} # order is non deterministic @@ -770,7 +770,7 @@ def test_dataset_extend(llmobs, test_dataset): def test_dataset_append_no_expected_output(llmobs, test_dataset): test_dataset.append(DatasetRecord(input_data={"prompt": "What is the capital of Sealand?"})) assert len(test_dataset) == 2 - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 wait_for_backend() test_dataset.push() @@ -781,12 +781,12 @@ def test_dataset_append_no_expected_output(llmobs, test_dataset): assert "expected_output" not in test_dataset[1] assert test_dataset.name == test_dataset.name assert test_dataset.description == test_dataset.description - assert test_dataset._version == 2 + assert test_dataset._current_version == 2 # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._version == 2 + assert ds._current_version == 2 assert len(ds) == 2 # note: it looks like dataset order is not deterministic assert ds[1]["input_data"] == {"prompt": "What is the capital of France?"} @@ -809,11 +809,11 @@ def test_dataset_append_no_expected_output(llmobs, test_dataset): def test_dataset_delete(llmobs, test_dataset): test_dataset.delete(0) assert len(test_dataset) == 1 - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 wait_for_backend() test_dataset.push() - assert test_dataset._version == 2 + assert test_dataset._current_version == 2 assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert test_dataset[0]["expected_output"] == {"answer": "Rome"} @@ -823,7 +823,7 @@ 
def test_dataset_delete(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._version == 2 + assert ds._current_version == 2 assert len(ds) == 1 assert ds[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert ds[0]["expected_output"] == {"answer": "Rome"} @@ -841,11 +841,11 @@ def test_dataset_delete(llmobs, test_dataset): def test_dataset_delete_no_expected_output(llmobs, test_dataset): test_dataset.delete(1) assert len(test_dataset) == 1 - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 wait_for_backend() test_dataset.push() - assert test_dataset._version == 2 + assert test_dataset._current_version == 2 assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Nauru?"} assert "expected_output" not in test_dataset[0] @@ -855,7 +855,7 @@ def test_dataset_delete_no_expected_output(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._version == 2 + assert ds._current_version == 2 assert len(ds) == 1 assert ds[0]["input_data"] == {"prompt": "What is the capital of Nauru?"} assert ds[0]["expected_output"] is None @@ -879,11 +879,11 @@ def test_dataset_delete_after_update(llmobs, test_dataset): test_dataset.delete(0) assert len(test_dataset) == 1 - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 wait_for_backend() test_dataset.push() - assert test_dataset._version == 2 + assert test_dataset._current_version == 2 assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert test_dataset[0]["expected_output"] == {"answer": "Rome"} @@ -893,7 +893,7 @@ def test_dataset_delete_after_update(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._version == 2 + assert ds._current_version == 2 assert len(ds) == 1 assert ds[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert ds[0]["expected_output"] == {"answer": "Rome"} @@ -918,14 +918,14 @@ def test_dataset_delete_after_append(llmobs, test_dataset): test_dataset.delete(0) # all that remains should be Italy and Sweden questions assert len(test_dataset) == 2 - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 assert len(test_dataset._new_records_by_record_id) == 1 assert len(test_dataset._deleted_record_ids) == 1 wait_for_backend() test_dataset.push() - assert test_dataset._version == 2 + assert test_dataset._current_version == 2 assert len(test_dataset) == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert test_dataset[0]["expected_output"] == {"answer": "Rome"} @@ -937,7 +937,7 @@ def test_dataset_delete_after_append(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._version == 2 + assert ds._current_version == 2 assert len(ds) == 2 sds = sorted(ds, key=lambda r: r["input_data"]["prompt"]) assert sds[0]["input_data"] == {"prompt": "What is the capital of Italy?"} @@ -1106,7 +1106,7 @@ def test_experiment_create(llmobs, test_dataset_one_record): project = llmobs._instance._dne_client.project_create_or_get("test-project") project_id 
= project.get("_id") exp_id, exp_run_name = llmobs._instance._dne_client.experiment_create( - exp.name, exp._dataset._id, project_id, exp._dataset._version, exp._config + exp.name, exp._dataset._id, project_id, exp._dataset._current_version, exp._config ) assert exp_id is not None assert exp_run_name.startswith("test_experiment") From e75deb04088891a2f7850a7d723561a2b4aabb83 Mon Sep 17 00:00:00 2001 From: gary-huang Date: Tue, 21 Oct 2025 14:33:06 -0400 Subject: [PATCH 3/7] update tests to use new version properties and add tests for versioned pulls --- ddtrace/llmobs/_llmobs.py | 8 +- ddtrace/llmobs/_writer.py | 19 +- ...817fee85fe_batch_update_post_aa45718c.yaml | 48 +++++ ...ords_filter_version__420_get_0060e684.yaml | 46 +++++ ...fa63a95b56_batch_update_post_0226e07c.yaml | 48 +++++ ...fa63a95b56_batch_update_post_b64aaeeb.yaml | 48 +++++ ...ecords_filter_version__1_get_bcd9fab6.yaml | 46 +++++ ...b7a-50fa63a95b56_records_get_eea1d61b.yaml | 47 +++++ ...-obs_v1_datasets_delete_post_3f1a88cb.yaml | 46 +++++ ...-obs_v1_datasets_delete_post_516a93fa.yaml | 46 +++++ ...n_test_dataset_records0__get_12b16625.yaml | 46 +++++ ...s_test_dataset_records0__get_5cf2366a.yaml | 46 +++++ ...a-b892b7b6fbf9_datasets_post_5a9c5f1b.yaml | 46 +++++ ...a-b892b7b6fbf9_datasets_post_f3e0da1c.yaml | 46 +++++ tests/llmobs/test_experiments.py | 171 +++++++++++++----- 15 files changed, 710 insertions(+), 47 deletions(-) create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0bb93ae7-43c4-48ff-91e4-d2817fee85fe_batch_update_post_aa45718c.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0bb93ae7-43c4-48ff-91e4-d2817fee85fe_records_filter_version__420_get_0060e684.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_batch_update_post_0226e07c.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_batch_update_post_b64aaeeb.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_records_filter_version__1_get_bcd9fab6.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_records_get_eea1d61b.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_3f1a88cb.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_516a93fa.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_filter_name__test-dataset-test_dataset_pull_w_invalid_version_test_dataset_records0__get_12b16625.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_filter_name__test-dataset-test_dataset_pull_w_versions_test_dataset_records0__get_5cf2366a.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_post_5a9c5f1b.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_post_f3e0da1c.yaml diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 
55275c7c620..2a342e27889 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -669,8 +669,12 @@ def _on_asyncio_execute_task(self, task_data: Dict[str, Any]) -> None: self._llmobs_context_provider.activate(llmobs_ctx) @classmethod - def pull_dataset(cls, dataset_name: str, project_name: Optional[str] = None, version: Optional[int] = None) -> Dataset: - ds = cls._instance._dne_client.dataset_get_with_records(dataset_name, (project_name or cls._project_name), version) + def pull_dataset( + cls, dataset_name: str, project_name: Optional[str] = None, version: Optional[int] = None + ) -> Dataset: + ds = cls._instance._dne_client.dataset_get_with_records( + dataset_name, (project_name or cls._project_name), version + ) return ds @classmethod diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index cedea1a1c69..0f224996394 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -458,10 +458,14 @@ def dataset_batch_update( new_record_ids: List[str] = [r["id"] for r in data] if data else [] return new_version, new_record_ids - def dataset_get_with_records(self, dataset_name: str, project_name: Optional[str] = None, version: Optional[int] = None) -> Dataset: + def dataset_get_with_records( + self, dataset_name: str, project_name: Optional[str] = None, version: Optional[int] = None + ) -> Dataset: project = self.project_create_or_get(project_name) project_id = project.get("_id") - logger.debug("getting records with project ID %s for %s, version: %s", project_id, project_name, str(version) or "latest") + logger.debug( + "getting records with project ID %s for %s, version: %s", project_id, project_name, str(version) or "latest" + ) path = f"/api/unstable/llm-obs/v1/{project_id}/datasets?filter[name]={quote(dataset_name)}" resp = self.request("GET", path) @@ -510,11 +514,18 @@ def dataset_get_with_records(self, dataset_name: str, project_name: Optional[str has_next_page = False if next_cursor: has_next_page = True - list_path = f"{list_base_path}{"&" if version else "?"}page[cursor]={next_cursor}" + list_path = f"{list_base_path}{'&' if version else '?'}page[cursor]={next_cursor}" logger.debug("next list records request path %s", list_path) page_num += 1 return Dataset( - dataset_name, project, dataset_id, class_records, dataset_description, curr_version, version or curr_version, _dne_client=self + dataset_name, + project, + dataset_id, + class_records, + dataset_description, + curr_version, + version or curr_version, + _dne_client=self, ) def dataset_bulk_upload(self, dataset_id: str, records: List[DatasetRecord]): diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0bb93ae7-43c4-48ff-91e4-d2817fee85fe_batch_update_post_aa45718c.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0bb93ae7-43c4-48ff-91e4-d2817fee85fe_batch_update_post_aa45718c.yaml new file mode 100644 index 00000000000..b750ac7724c --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0bb93ae7-43c4-48ff-91e4-d2817fee85fe_batch_update_post_aa45718c.yaml @@ -0,0 +1,48 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "id": "0bb93ae7-43c4-48ff-91e4-d2817fee85fe", + "attributes": {"insert_records": [{"input": {"prompt": "What is the capital + of France?"}, "expected_output": {"answer": "Paris"}, "metadata": null}], "update_records": + [], "delete_records": []}}}' + headers: + Accept: + - '*/*' + ? 
!!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '271' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/0bb93ae7-43c4-48ff-91e4-d2817fee85fe/batch_update + response: + body: + string: '{"data":[{"id":"eaadecb4-836e-49b3-8390-212b3fffb60b","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:26:24.929416376Z","dataset_id":"0bb93ae7-43c4-48ff-91e4-d2817fee85fe","expected_output":{"answer":"Paris"},"input":{"prompt":"What + is the capital of France?"},"updated_at":"2025-10-21T18:26:24.929416376Z","version":1}}]}' + headers: + content-length: + - '389' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:26:24 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0bb93ae7-43c4-48ff-91e4-d2817fee85fe_records_filter_version__420_get_0060e684.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0bb93ae7-43c4-48ff-91e4-d2817fee85fe_records_filter_version__420_get_0060e684.yaml new file mode 100644 index 00000000000..3de4ae57834 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0bb93ae7-43c4-48ff-91e4-d2817fee85fe_records_filter_version__420_get_0060e684.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + ? !!python/object/apply:multidict._multidict.istr + - Content-Length + : - '0' + ? 
!!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/0bb93ae7-43c4-48ff-91e4-d2817fee85fe/records?filter%5Bversion%5D=420 + response: + body: + string: '{"errors":[{"title":"Generic Error","detail":"invalid version: version + is greater than the current version or negative"}]}' + headers: + content-length: + - '122' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:26:27 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 400 + message: Bad Request +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_batch_update_post_0226e07c.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_batch_update_post_0226e07c.yaml new file mode 100644 index 00000000000..1db2779735a --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_batch_update_post_0226e07c.yaml @@ -0,0 +1,48 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "id": "4607e918-094d-4aa9-8b7a-50fa63a95b56", + "attributes": {"insert_records": [{"input": {"prompt": "What is the capital + of France?"}, "expected_output": {"answer": "Paris"}, "metadata": null}], "update_records": + [], "delete_records": []}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '271' + ? 
!!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/4607e918-094d-4aa9-8b7a-50fa63a95b56/batch_update + response: + body: + string: '{"data":[{"id":"93328f7a-bfd2-4672-8b94-76b0698cd754","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:23.855004356Z","dataset_id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","expected_output":{"answer":"Paris"},"input":{"prompt":"What + is the capital of France?"},"updated_at":"2025-10-21T18:25:23.855004356Z","version":1}}]}' + headers: + content-length: + - '389' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:25:23 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_batch_update_post_b64aaeeb.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_batch_update_post_b64aaeeb.yaml new file mode 100644 index 00000000000..aa56782bb87 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_batch_update_post_b64aaeeb.yaml @@ -0,0 +1,48 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "id": "4607e918-094d-4aa9-8b7a-50fa63a95b56", + "attributes": {"insert_records": [{"input": {"prompt": "What is the capital + of China?"}, "expected_output": {"answer": "Beijing"}, "metadata": null}], "update_records": + [], "delete_records": []}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '272' + ? 
!!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/4607e918-094d-4aa9-8b7a-50fa63a95b56/batch_update + response: + body: + string: '{"data":[{"id":"5bbd89ec-4eba-4f41-bd47-2a23a005a20a","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:26.024986597Z","dataset_id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","expected_output":{"answer":"Beijing"},"input":{"prompt":"What + is the capital of China?"},"updated_at":"2025-10-21T18:25:26.024986597Z","version":2}}]}' + headers: + content-length: + - '390' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:25:26 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_records_filter_version__1_get_bcd9fab6.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_records_filter_version__1_get_bcd9fab6.yaml new file mode 100644 index 00000000000..c903d760daf --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_records_filter_version__1_get_bcd9fab6.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + ? !!python/object/apply:multidict._multidict.istr + - Content-Length + : - '0' + ? 
!!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/4607e918-094d-4aa9-8b7a-50fa63a95b56/records?filter%5Bversion%5D=1 + response: + body: + string: '{"data":[{"id":"93328f7a-bfd2-4672-8b94-76b0698cd754","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:23.855004Z","dataset_id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","expected_output":{"answer":"Paris"},"input":{"prompt":"What + is the capital of France?"},"updated_at":"2025-10-21T18:25:23.855004Z"}}],"meta":{"after":""}}' + headers: + content-length: + - '391' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:25:34 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_records_get_eea1d61b.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_records_get_eea1d61b.yaml new file mode 100644 index 00000000000..61ee8d32222 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_records_get_eea1d61b.yaml @@ -0,0 +1,47 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + ? !!python/object/apply:multidict._multidict.istr + - Content-Length + : - '0' + ? 
!!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/4607e918-094d-4aa9-8b7a-50fa63a95b56/records + response: + body: + string: '{"data":[{"id":"5bbd89ec-4eba-4f41-bd47-2a23a005a20a","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:26.024986Z","dataset_id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","expected_output":{"answer":"Beijing"},"input":{"prompt":"What + is the capital of China?"},"updated_at":"2025-10-21T18:25:26.024986Z"}},{"id":"93328f7a-bfd2-4672-8b94-76b0698cd754","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:23.855004Z","dataset_id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","expected_output":{"answer":"Paris"},"input":{"prompt":"What + is the capital of France?"},"updated_at":"2025-10-21T18:25:23.855004Z"}}],"meta":{"after":""}}' + headers: + content-length: + - '753' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:25:32 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_3f1a88cb.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_3f1a88cb.yaml new file mode 100644 index 00000000000..efd2579c8d3 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_3f1a88cb.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "attributes": {"type": "soft", "dataset_ids": + ["4607e918-094d-4aa9-8b7a-50fa63a95b56"]}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '119' + ? 
!!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/delete + response: + body: + string: '{"data":[{"id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:23.688284Z","current_version":2,"deleted_at":"2025-10-21T18:25:34.800676Z","description":"A + test dataset","name":"test-dataset-test_dataset_pull_w_versions[test_dataset_records0]","project_id":"f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9","updated_at":"2025-10-21T18:25:26.033352Z"}}]}' + headers: + content-length: + - '450' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:25:34 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_516a93fa.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_516a93fa.yaml new file mode 100644 index 00000000000..42896697f82 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_516a93fa.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "attributes": {"type": "soft", "dataset_ids": + ["0bb93ae7-43c4-48ff-91e4-d2817fee85fe"]}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '119' + ? 
!!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/delete + response: + body: + string: '{"data":[{"id":"0bb93ae7-43c4-48ff-91e4-d2817fee85fe","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:26:24.806202Z","current_version":1,"deleted_at":"2025-10-21T18:26:27.521296Z","description":"A + test dataset","name":"test-dataset-test_dataset_pull_w_invalid_version[test_dataset_records0]","project_id":"f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9","updated_at":"2025-10-21T18:26:24.946573Z"}}]}' + headers: + content-length: + - '457' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:26:27 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_filter_name__test-dataset-test_dataset_pull_w_invalid_version_test_dataset_records0__get_12b16625.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_filter_name__test-dataset-test_dataset_pull_w_invalid_version_test_dataset_records0__get_12b16625.yaml new file mode 100644 index 00000000000..bcc667c8752 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_filter_name__test-dataset-test_dataset_pull_w_invalid_version_test_dataset_records0__get_12b16625.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + ? !!python/object/apply:multidict._multidict.istr + - Content-Length + : - '0' + ? 
!!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9/datasets?filter%5Bname%5D=test-dataset-test_dataset_pull_w_invalid_version%5Btest_dataset_records0%5D + response: + body: + string: '{"data":[{"id":"0bb93ae7-43c4-48ff-91e4-d2817fee85fe","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:26:24.806202Z","current_version":1,"description":"A + test dataset","name":"test-dataset-test_dataset_pull_w_invalid_version[test_dataset_records0]","project_id":"f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9","updated_at":"2025-10-21T18:26:24.946573Z"}}],"meta":{"after":""}}' + headers: + content-length: + - '434' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:26:27 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_filter_name__test-dataset-test_dataset_pull_w_versions_test_dataset_records0__get_5cf2366a.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_filter_name__test-dataset-test_dataset_pull_w_versions_test_dataset_records0__get_5cf2366a.yaml new file mode 100644 index 00000000000..a8107245785 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_filter_name__test-dataset-test_dataset_pull_w_versions_test_dataset_records0__get_5cf2366a.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + ? !!python/object/apply:multidict._multidict.istr + - Content-Length + : - '0' + ? 
!!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9/datasets?filter%5Bname%5D=test-dataset-test_dataset_pull_w_versions%5Btest_dataset_records0%5D + response: + body: + string: '{"data":[{"id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:23.688284Z","current_version":2,"description":"A + test dataset","name":"test-dataset-test_dataset_pull_w_versions[test_dataset_records0]","project_id":"f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9","updated_at":"2025-10-21T18:25:26.033352Z"}}],"meta":{"after":""}}' + headers: + content-length: + - '427' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:25:31 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_post_5a9c5f1b.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_post_5a9c5f1b.yaml new file mode 100644 index 00000000000..3b03000d203 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_post_5a9c5f1b.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "attributes": {"name": "test-dataset-test_dataset_pull_w_versions[test_dataset_records0]", + "description": "A test dataset"}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '155' + ? 
!!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9/datasets + response: + body: + string: '{"data":{"id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:23.688284458Z","current_version":0,"description":"A + test dataset","name":"test-dataset-test_dataset_pull_w_versions[test_dataset_records0]","project_id":"f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9","updated_at":"2025-10-21T18:25:23.688284458Z"}}}' + headers: + content-length: + - '411' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:25:23 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_post_f3e0da1c.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_post_f3e0da1c.yaml new file mode 100644 index 00000000000..17ca6f8adec --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_post_f3e0da1c.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "attributes": {"name": "test-dataset-test_dataset_pull_w_invalid_version[test_dataset_records0]", + "description": "A test dataset"}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '162' + ? 
!!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9/datasets + response: + body: + string: '{"data":{"id":"0bb93ae7-43c4-48ff-91e4-d2817fee85fe","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:26:24.80620202Z","current_version":0,"description":"A + test dataset","name":"test-dataset-test_dataset_pull_w_invalid_version[test_dataset_records0]","project_id":"f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9","updated_at":"2025-10-21T18:26:24.80620202Z"}}}' + headers: + content-length: + - '416' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:26:24 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py index 76a3898a657..1646d1cb90b 100644 --- a/tests/llmobs/test_experiments.py +++ b/tests/llmobs/test_experiments.py @@ -300,7 +300,8 @@ def test_dataset_csv_no_expected_output(llmobs, tmp_csv_file_for_upload): assert len(ds) == len(dataset) assert ds.name == dataset.name assert ds.description == dataset.description - assert ds._current_version == 1 + assert ds.current_version == 1 + assert ds.current_version == ds.version finally: if dataset_id: llmobs._delete_dataset(dataset_id=dataset_id) @@ -347,7 +348,8 @@ def test_dataset_csv(llmobs, tmp_csv_file_for_upload): assert len(ds) == len(dataset) assert ds.name == dataset.name assert ds.description == dataset.description - assert ds._current_version == 1 + assert ds.current_version == 1 + assert ds.current_version == ds.version finally: if dataset_id: llmobs._delete_dataset(dataset_id=dataset_id) @@ -400,7 +402,8 @@ def test_dataset_csv_pipe_separated(llmobs, tmp_csv_file_for_upload): assert len(ds) == len(dataset) assert ds.name == dataset.name assert ds.description == dataset.description - assert ds._current_version == 1 + assert ds.current_version == 1 + assert ds.current_version == ds.version finally: if dataset_id: llmobs._delete_dataset(dataset_id=dataset._id) @@ -423,7 +426,8 @@ def test_dataset_pull_large_num_records(llmobs, test_dataset_large_num_records): assert len(pds) == len(test_dataset_large_num_records) assert pds.name == test_dataset_large_num_records.name assert pds.description == test_dataset_large_num_records.description - assert pds._current_version == test_dataset_large_num_records._current_version == 1 + assert pds.current_version == test_dataset_large_num_records.current_version == 1 + assert pds.version == test_dataset_large_num_records.version == 1 dataset = sorted(pds, key=lambda r: int(r["input_data"].lstrip("input_"))) for i, d in enumerate(dataset): @@ -450,7 +454,57 @@ def test_dataset_pull_exists_with_record(llmobs, test_dataset_one_record): assert dataset[0]["expected_output"] == {"answer": "Paris"} assert dataset.name == test_dataset_one_record.name assert dataset.description == test_dataset_one_record.description - assert dataset._current_version == 
test_dataset_one_record._current_version == 1 + assert dataset.current_version == test_dataset_one_record.current_version == 1 + assert dataset.version == test_dataset_one_record.version == 1 + + +@pytest.mark.parametrize( + "test_dataset_records", + [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], +) +def test_dataset_pull_w_versions(llmobs, test_dataset, test_dataset_records): + assert len(test_dataset) == 1 + assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of France?"} + assert test_dataset[0]["expected_output"] == {"answer": "Paris"} + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 + + test_dataset.append( + {"input_data": {"prompt": "What is the capital of China?"}, "expected_output": {"answer": "Beijing"}} + ) + test_dataset.push() + wait_for_backend(4) + + dataset_v2 = llmobs.pull_dataset(dataset_name=test_dataset.name) + assert len(dataset_v2) == 2 + assert dataset_v2[1]["input_data"] == {"prompt": "What is the capital of France?"} + assert dataset_v2[1]["expected_output"] == {"answer": "Paris"} + assert dataset_v2[0]["input_data"] == {"prompt": "What is the capital of China?"} + assert dataset_v2[0]["expected_output"] == {"answer": "Beijing"} + assert dataset_v2.name == test_dataset.name + assert dataset_v2.description == test_dataset.description + assert dataset_v2.current_version == test_dataset.current_version == 2 + assert dataset_v2.version == test_dataset.version == 2 + + dataset_v1 = llmobs.pull_dataset(dataset_name=test_dataset.name, version=1) + assert len(dataset_v1) == 1 + assert dataset_v1[0]["input_data"] == {"prompt": "What is the capital of France?"} + assert dataset_v1[0]["expected_output"] == {"answer": "Paris"} + assert dataset_v1.name == test_dataset.name + assert dataset_v1.description == test_dataset.description + assert dataset_v1.current_version == test_dataset.current_version == 2 + assert dataset_v1.version == 1 + + +@pytest.mark.parametrize( + "test_dataset_records", + [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], +) +def test_dataset_pull_w_invalid_version(llmobs, test_dataset, test_dataset_records): + with pytest.raises( + ValueError, match="Failed to pull dataset records for.*version is greater than the current version or negative" + ): + llmobs.pull_dataset(dataset_name=test_dataset.name, version=420) def test_dataset_pull_from_project(llmobs, test_dataset_one_record_separate_project): @@ -464,7 +518,8 @@ def test_dataset_pull_from_project(llmobs, test_dataset_one_record_separate_proj assert dataset[0]["expected_output"] == {"answer": "Boston"} assert dataset.name == test_dataset_one_record_separate_project.name assert dataset.description == test_dataset_one_record_separate_project.description - assert dataset._current_version == test_dataset_one_record_separate_project._current_version == 1 + assert dataset.current_version == test_dataset_one_record_separate_project.current_version == 1 + assert dataset.version == test_dataset_one_record_separate_project.version == 1 @pytest.mark.parametrize( @@ -479,7 +534,8 @@ def test_dataset_pull_from_project(llmobs, test_dataset_one_record_separate_proj ], ) def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_dataset_records): - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 test_dataset.update( 0, @@ -518,7 +574,8 @@ def 
test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_datase test_dataset.push() assert len(test_dataset) == 2 - assert test_dataset._current_version == 2 + assert test_dataset.current_version == 2 + assert test_dataset.version == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Germany?"} assert test_dataset[0]["expected_output"] == {"answer": "Berlin"} @@ -548,7 +605,8 @@ def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_datase assert len(ds) == 2 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds._current_version == 2 + assert ds.current_version == 2 + assert ds.version == 2 @pytest.mark.parametrize( @@ -556,7 +614,8 @@ def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_datase [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], ) def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records): - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 test_dataset.update( 0, @@ -568,7 +627,8 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records test_dataset.push() assert len(test_dataset) == 1 - assert test_dataset._current_version == 2 + assert test_dataset.current_version == 2 + assert test_dataset.version == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Germany?"} assert test_dataset[0]["expected_output"] == {"answer": "Berlin"} @@ -585,7 +645,8 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records assert len(ds) == 1 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds._current_version == 2 + assert ds.current_version == 2 + assert ds.version == 2 @pytest.mark.parametrize( @@ -593,7 +654,8 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], ) def test_dataset_modify_single_record_empty_record(llmobs, test_dataset, test_dataset_records): - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 with pytest.raises( ValueError, @@ -615,7 +677,8 @@ def test_dataset_estimate_size(llmobs, test_dataset): [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], ) def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_records): - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 test_dataset.update(0, {"expected_output": None}) assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of France?"} @@ -624,7 +687,8 @@ def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_re test_dataset.push() assert len(test_dataset) == 1 - assert test_dataset._current_version == 2 + assert test_dataset.current_version == 2 + assert test_dataset.version == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of France?"} assert test_dataset[0]["expected_output"] is None @@ -641,7 +705,8 @@ def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_re assert len(ds) == 1 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert 
ds._current_version == 2 + assert ds.current_version == 2 + assert ds.version == 2 @pytest.mark.parametrize( @@ -657,7 +722,8 @@ def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_re ], ) def test_dataset_modify_record_on_input(llmobs, test_dataset, test_dataset_records): - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 test_dataset.update(0, {"input_data": "A"}) assert test_dataset[0]["input_data"] == "A" assert test_dataset[0]["expected_output"] == {"answer": "Paris"} @@ -665,7 +731,8 @@ def test_dataset_modify_record_on_input(llmobs, test_dataset, test_dataset_recor test_dataset.push() assert len(test_dataset) == 1 - assert test_dataset._current_version == 2 + assert test_dataset.current_version == 2 + assert test_dataset.version == 2 assert test_dataset[0]["input_data"] == "A" assert test_dataset[0]["expected_output"] == {"answer": "Paris"} @@ -684,7 +751,8 @@ def test_dataset_modify_record_on_input(llmobs, test_dataset, test_dataset_recor assert len(ds) == 1 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds._current_version == 2 + assert ds.current_version == 2 + assert ds.version == 2 @pytest.mark.parametrize( @@ -696,7 +764,8 @@ def test_dataset_append(llmobs, test_dataset): DatasetRecord(input_data={"prompt": "What is the capital of Italy?"}, expected_output={"answer": "Rome"}) ) assert len(test_dataset) == 2 - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 wait_for_backend() test_dataset.push() @@ -707,12 +776,14 @@ def test_dataset_append(llmobs, test_dataset): assert test_dataset[1]["expected_output"] == {"answer": "Rome"} assert test_dataset.name == test_dataset.name assert test_dataset.description == test_dataset.description - assert test_dataset._current_version == 2 + assert test_dataset.current_version == 2 + assert test_dataset.version == 2 # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._current_version == 2 + assert ds.current_version == 2 + assert ds.version == 2 assert len(ds) == 2 # note: it looks like dataset order is not deterministic assert ds[1]["input_data"] == {"prompt": "What is the capital of France?"} @@ -735,7 +806,8 @@ def test_dataset_extend(llmobs, test_dataset): ] ) assert len(test_dataset) == 3 - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 wait_for_backend() test_dataset.push() @@ -748,12 +820,14 @@ def test_dataset_extend(llmobs, test_dataset): assert test_dataset[2]["expected_output"] == {"answer": "Stockholm"} assert test_dataset.name == test_dataset.name assert test_dataset.description == test_dataset.description - assert test_dataset._current_version == 2 + assert test_dataset.current_version == 2 + assert test_dataset.version == 2 # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._current_version == 2 + assert ds.current_version == 2 + assert ds.version == 2 assert len(ds) == 3 assert ds[2]["input_data"] == {"prompt": "What is the capital of France?"} # order is non deterministic @@ -770,7 +844,8 @@ def test_dataset_extend(llmobs, test_dataset): def test_dataset_append_no_expected_output(llmobs, test_dataset): test_dataset.append(DatasetRecord(input_data={"prompt": 
"What is the capital of Sealand?"})) assert len(test_dataset) == 2 - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 wait_for_backend() test_dataset.push() @@ -781,12 +856,14 @@ def test_dataset_append_no_expected_output(llmobs, test_dataset): assert "expected_output" not in test_dataset[1] assert test_dataset.name == test_dataset.name assert test_dataset.description == test_dataset.description - assert test_dataset._current_version == 2 + assert test_dataset.current_version == 2 + assert test_dataset.version == 2 # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._current_version == 2 + assert ds.current_version == 2 + assert ds.version == 2 assert len(ds) == 2 # note: it looks like dataset order is not deterministic assert ds[1]["input_data"] == {"prompt": "What is the capital of France?"} @@ -809,11 +886,13 @@ def test_dataset_append_no_expected_output(llmobs, test_dataset): def test_dataset_delete(llmobs, test_dataset): test_dataset.delete(0) assert len(test_dataset) == 1 - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 wait_for_backend() test_dataset.push() - assert test_dataset._current_version == 2 + assert test_dataset.current_version == 2 + assert test_dataset.version == 2 assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert test_dataset[0]["expected_output"] == {"answer": "Rome"} @@ -823,7 +902,8 @@ def test_dataset_delete(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._current_version == 2 + assert ds.current_version == 2 + assert ds.version == 2 assert len(ds) == 1 assert ds[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert ds[0]["expected_output"] == {"answer": "Rome"} @@ -841,11 +921,13 @@ def test_dataset_delete(llmobs, test_dataset): def test_dataset_delete_no_expected_output(llmobs, test_dataset): test_dataset.delete(1) assert len(test_dataset) == 1 - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 wait_for_backend() test_dataset.push() - assert test_dataset._current_version == 2 + assert test_dataset.current_version == 2 + assert test_dataset.version == 2 assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Nauru?"} assert "expected_output" not in test_dataset[0] @@ -855,7 +937,8 @@ def test_dataset_delete_no_expected_output(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._current_version == 2 + assert ds.current_version == 2 + assert ds.version == 2 assert len(ds) == 1 assert ds[0]["input_data"] == {"prompt": "What is the capital of Nauru?"} assert ds[0]["expected_output"] is None @@ -879,11 +962,13 @@ def test_dataset_delete_after_update(llmobs, test_dataset): test_dataset.delete(0) assert len(test_dataset) == 1 - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 wait_for_backend() test_dataset.push() - assert test_dataset._current_version == 2 + assert test_dataset.current_version == 2 + assert test_dataset.version == 2 
assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert test_dataset[0]["expected_output"] == {"answer": "Rome"} @@ -893,7 +978,8 @@ def test_dataset_delete_after_update(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._current_version == 2 + assert ds.current_version == 2 + assert ds.version == 2 assert len(ds) == 1 assert ds[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert ds[0]["expected_output"] == {"answer": "Rome"} @@ -918,14 +1004,16 @@ def test_dataset_delete_after_append(llmobs, test_dataset): test_dataset.delete(0) # all that remains should be Italy and Sweden questions assert len(test_dataset) == 2 - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 assert len(test_dataset._new_records_by_record_id) == 1 assert len(test_dataset._deleted_record_ids) == 1 wait_for_backend() test_dataset.push() - assert test_dataset._current_version == 2 + assert test_dataset.current_version == 2 + assert test_dataset.version == 2 assert len(test_dataset) == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert test_dataset[0]["expected_output"] == {"answer": "Rome"} @@ -937,7 +1025,8 @@ def test_dataset_delete_after_append(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._current_version == 2 + assert ds.current_version == 2 + assert ds.version == 2 assert len(ds) == 2 sds = sorted(ds, key=lambda r: r["input_data"]["prompt"]) assert sds[0]["input_data"] == {"prompt": "What is the capital of Italy?"} @@ -1106,7 +1195,7 @@ def test_experiment_create(llmobs, test_dataset_one_record): project = llmobs._instance._dne_client.project_create_or_get("test-project") project_id = project.get("_id") exp_id, exp_run_name = llmobs._instance._dne_client.experiment_create( - exp.name, exp._dataset._id, project_id, exp._dataset._current_version, exp._config + exp.name, exp._dataset._id, project_id, exp._dataset.current_version, exp._config ) assert exp_id is not None assert exp_run_name.startswith("test_experiment") From 4555396b6e526f764c52fe4cc53aca7648665551 Mon Sep 17 00:00:00 2001 From: gary-huang Date: Tue, 21 Oct 2025 14:46:11 -0400 Subject: [PATCH 4/7] reno --- ...bs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml diff --git a/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml b/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml new file mode 100644 index 00000000000..912c2f243ba --- /dev/null +++ b/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml @@ -0,0 +1,5 @@ +--- +upgrade: + - | + LLM Observability: Previous dataset versions can be optionally pulled by passing the ``version`` + argument to ``LLMObs.pull_dataset`` From 0e0c15bd85b497d98d1f08bfb8c0866a637af436 Mon Sep 17 00:00:00 2001 From: Gary Huang Date: Tue, 21 Oct 2025 16:36:11 -0400 Subject: [PATCH 5/7] Update releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml Co-authored-by: Sam Brenner <106700075+sabrenner@users.noreply.github.com> --- 
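(Not part of the applied patch — a short usage sketch for reviewers of the behaviour this series adds. It assumes LLM Observability is already enabled and that a dataset named "capitals-qa" exists with at least two pushed versions; the dataset name and version numbers are purely illustrative.)

    from ddtrace.llmobs import LLMObs

    # Default behaviour is unchanged: pull the latest version of the dataset.
    ds_latest = LLMObs.pull_dataset(dataset_name="capitals-qa")

    # New in this series: pin the pull to an earlier snapshot of the records.
    ds_v1 = LLMObs.pull_dataset(dataset_name="capitals-qa", version=1)
    assert ds_v1.version == 1  # the version this Dataset object represents

Requesting a version that does not exist (greater than the dataset's current version, or negative) raises a ValueError, as exercised by test_dataset_pull_w_invalid_version above.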
...lmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml b/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml index 912c2f243ba..ef710697fcd 100644 --- a/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml +++ b/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml @@ -1,5 +1,5 @@ --- -upgrade: +features: - | LLM Observability: Previous dataset versions can be optionally pulled by passing the ``version`` argument to ``LLMObs.pull_dataset`` From 13c0407d21f95856cac4f9f4ec756d9d796feecd Mon Sep 17 00:00:00 2001 From: gary-huang Date: Wed, 22 Oct 2025 14:34:13 -0400 Subject: [PATCH 6/7] rename current version to latest version and use urlencode, and update release notes --- ddtrace/llmobs/_experiment.py | 16 ++-- ddtrace/llmobs/_writer.py | 35 ++++--- ...rsioned-dataset-pull-c7017f982b2c1f5b.yaml | 3 + tests/llmobs/test_experiments.py | 94 +++++++++---------- 4 files changed, 82 insertions(+), 66 deletions(-) diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py index 2e0932c7ab4..d31a4680626 100644 --- a/ddtrace/llmobs/_experiment.py +++ b/ddtrace/llmobs/_experiment.py @@ -107,7 +107,7 @@ class Dataset: _id: str _records: List[DatasetRecord] _version: int - _current_version: int + _latest_version: int _dne_client: "LLMObsExperimentsClient" _new_records_by_record_id: Dict[str, DatasetRecordRaw] _updated_record_ids_to_new_fields: Dict[str, UpdatableDatasetRecord] @@ -122,7 +122,7 @@ def __init__( dataset_id: str, records: List[DatasetRecord], description: str, - current_version: int, + latest_version: int, version: int, _dne_client: "LLMObsExperimentsClient", ) -> None: @@ -130,7 +130,7 @@ def __init__( self.project = project self.description = description self._id = dataset_id - self._current_version = current_version + self._latest_version = latest_version self._version = version self._dne_client = _dne_client self._records = records @@ -171,10 +171,10 @@ def push(self) -> None: record["record_id"] = record_id # type: ignore # FIXME: we don't get version numbers in responses to deletion requests - self._current_version = new_version if new_version != -1 else self._current_version + 1 + self._latest_version = new_version if new_version != -1 else self._latest_version + 1 # no matter what the version was before the push, pushing will result in the dataset being on the current # version tracked by the backend - self._version = self._current_version + self._version = self._latest_version self._new_records_by_record_id = {} self._deleted_record_ids = [] self._updated_record_ids_to_new_fields = {} @@ -232,8 +232,8 @@ def url(self) -> str: return f"{_get_base_url()}/llm/datasets/{self._id}" @property - def current_version(self) -> int: - return self._current_version + def latest_version(self) -> int: + return self._latest_version @property def version(self) -> int: @@ -448,7 +448,7 @@ def _run_task(self, jobs: int, raise_errors: bool = False, sample_size: Optional dataset_id=self._dataset._id, records=subset_records, description=self._dataset.description, - current_version=self._dataset._current_version, + latest_version=self._dataset._latest_version, version=self._dataset._version, _dne_client=self._dataset._dne_client, ) diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index 0f224996394..be22a54aa86 100644 --- 
a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -3,6 +3,7 @@ import json import os import tempfile +import urllib from typing import Any from typing import Dict from typing import List @@ -400,7 +401,16 @@ def dataset_create( if dataset_id is None or dataset_id == "": raise ValueError(f"unexpected dataset state, invalid ID (is None: {dataset_id is None})") curr_version = response_data["data"]["attributes"]["current_version"] - return Dataset(dataset_name, project, dataset_id, [], description, curr_version, curr_version, _dne_client=self) + return Dataset( + name=dataset_name, + project=project, + dataset_id=dataset_id, + records=[], + description=description, + latest_version=curr_version, + version=curr_version, + _dne_client=self, + ) @staticmethod def _get_record_json(record: Union[UpdatableDatasetRecord, DatasetRecordRaw], is_update: bool) -> JSONType: @@ -484,12 +494,14 @@ def dataset_get_with_records( dataset_id = data[0]["id"] list_base_path = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" + + url_options = {} if version: - list_base_path = f"{list_base_path}?filter[version]={version}" + url_options["filter[version]"] = version has_next_page = True class_records: List[DatasetRecord] = [] - list_path = list_base_path + list_path = f"{list_base_path}?{urllib.parse.urlencode(url_options, safe='[]')}" page_num = 0 while has_next_page: resp = self.request("GET", list_path, timeout=self.LIST_RECORDS_TIMEOUT) @@ -514,17 +526,18 @@ def dataset_get_with_records( has_next_page = False if next_cursor: has_next_page = True - list_path = f"{list_base_path}{'&' if version else '?'}page[cursor]={next_cursor}" + url_options["page[cursor]"] = next_cursor + list_path = f"{list_base_path}?{urllib.parse.urlencode(url_options, safe='[]')}" logger.debug("next list records request path %s", list_path) page_num += 1 return Dataset( - dataset_name, - project, - dataset_id, - class_records, - dataset_description, - curr_version, - version or curr_version, + name=dataset_name, + project=project, + dataset_id=dataset_id, + records=class_records, + description=dataset_description, + latest_version=curr_version, + version=version or curr_version, _dne_client=self, ) diff --git a/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml b/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml index ef710697fcd..f1f32a86735 100644 --- a/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml +++ b/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml @@ -3,3 +3,6 @@ features: - | LLM Observability: Previous dataset versions can be optionally pulled by passing the ``version`` argument to ``LLMObs.pull_dataset`` + - | + LLM Observability: Datasets have new properties ``version`` and ``latest_version`` to provide information on the + version of the dataset that is being worked with and the latest global version of the dataset, respectively \ No newline at end of file diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py index 1646d1cb90b..cf4e44905f4 100644 --- a/tests/llmobs/test_experiments.py +++ b/tests/llmobs/test_experiments.py @@ -300,8 +300,8 @@ def test_dataset_csv_no_expected_output(llmobs, tmp_csv_file_for_upload): assert len(ds) == len(dataset) assert ds.name == dataset.name assert ds.description == dataset.description - assert ds.current_version == 1 - assert ds.current_version == ds.version + assert ds.latest_version == 1 + assert ds.latest_version == 
ds.version finally: if dataset_id: llmobs._delete_dataset(dataset_id=dataset_id) @@ -348,8 +348,8 @@ def test_dataset_csv(llmobs, tmp_csv_file_for_upload): assert len(ds) == len(dataset) assert ds.name == dataset.name assert ds.description == dataset.description - assert ds.current_version == 1 - assert ds.current_version == ds.version + assert ds.latest_version == 1 + assert ds.latest_version == ds.version finally: if dataset_id: llmobs._delete_dataset(dataset_id=dataset_id) @@ -402,8 +402,8 @@ def test_dataset_csv_pipe_separated(llmobs, tmp_csv_file_for_upload): assert len(ds) == len(dataset) assert ds.name == dataset.name assert ds.description == dataset.description - assert ds.current_version == 1 - assert ds.current_version == ds.version + assert ds.latest_version == 1 + assert ds.latest_version == ds.version finally: if dataset_id: llmobs._delete_dataset(dataset_id=dataset._id) @@ -426,7 +426,7 @@ def test_dataset_pull_large_num_records(llmobs, test_dataset_large_num_records): assert len(pds) == len(test_dataset_large_num_records) assert pds.name == test_dataset_large_num_records.name assert pds.description == test_dataset_large_num_records.description - assert pds.current_version == test_dataset_large_num_records.current_version == 1 + assert pds.latest_version == test_dataset_large_num_records.latest_version == 1 assert pds.version == test_dataset_large_num_records.version == 1 dataset = sorted(pds, key=lambda r: int(r["input_data"].lstrip("input_"))) @@ -454,7 +454,7 @@ def test_dataset_pull_exists_with_record(llmobs, test_dataset_one_record): assert dataset[0]["expected_output"] == {"answer": "Paris"} assert dataset.name == test_dataset_one_record.name assert dataset.description == test_dataset_one_record.description - assert dataset.current_version == test_dataset_one_record.current_version == 1 + assert dataset.latest_version == test_dataset_one_record.latest_version == 1 assert dataset.version == test_dataset_one_record.version == 1 @@ -466,7 +466,7 @@ def test_dataset_pull_w_versions(llmobs, test_dataset, test_dataset_records): assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of France?"} assert test_dataset[0]["expected_output"] == {"answer": "Paris"} - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 test_dataset.append( @@ -483,7 +483,7 @@ def test_dataset_pull_w_versions(llmobs, test_dataset, test_dataset_records): assert dataset_v2[0]["expected_output"] == {"answer": "Beijing"} assert dataset_v2.name == test_dataset.name assert dataset_v2.description == test_dataset.description - assert dataset_v2.current_version == test_dataset.current_version == 2 + assert dataset_v2.latest_version == test_dataset.latest_version == 2 assert dataset_v2.version == test_dataset.version == 2 dataset_v1 = llmobs.pull_dataset(dataset_name=test_dataset.name, version=1) @@ -492,7 +492,7 @@ def test_dataset_pull_w_versions(llmobs, test_dataset, test_dataset_records): assert dataset_v1[0]["expected_output"] == {"answer": "Paris"} assert dataset_v1.name == test_dataset.name assert dataset_v1.description == test_dataset.description - assert dataset_v1.current_version == test_dataset.current_version == 2 + assert dataset_v1.latest_version == test_dataset.latest_version == 2 assert dataset_v1.version == 1 @@ -518,7 +518,7 @@ def test_dataset_pull_from_project(llmobs, test_dataset_one_record_separate_proj assert dataset[0]["expected_output"] == {"answer": "Boston"} assert 
dataset.name == test_dataset_one_record_separate_project.name assert dataset.description == test_dataset_one_record_separate_project.description - assert dataset.current_version == test_dataset_one_record_separate_project.current_version == 1 + assert dataset.latest_version == test_dataset_one_record_separate_project.latest_version == 1 assert dataset.version == test_dataset_one_record_separate_project.version == 1 @@ -534,7 +534,7 @@ def test_dataset_pull_from_project(llmobs, test_dataset_one_record_separate_proj ], ) def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_dataset_records): - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 test_dataset.update( @@ -574,7 +574,7 @@ def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_datase test_dataset.push() assert len(test_dataset) == 2 - assert test_dataset.current_version == 2 + assert test_dataset.latest_version == 2 assert test_dataset.version == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Germany?"} @@ -605,7 +605,7 @@ def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_datase assert len(ds) == 2 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds.current_version == 2 + assert ds.latest_version == 2 assert ds.version == 2 @@ -614,7 +614,7 @@ def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_datase [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], ) def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records): - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 test_dataset.update( @@ -627,7 +627,7 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records test_dataset.push() assert len(test_dataset) == 1 - assert test_dataset.current_version == 2 + assert test_dataset.latest_version == 2 assert test_dataset.version == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Germany?"} @@ -645,7 +645,7 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records assert len(ds) == 1 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds.current_version == 2 + assert ds.latest_version == 2 assert ds.version == 2 @@ -654,7 +654,7 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], ) def test_dataset_modify_single_record_empty_record(llmobs, test_dataset, test_dataset_records): - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 with pytest.raises( @@ -677,7 +677,7 @@ def test_dataset_estimate_size(llmobs, test_dataset): [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], ) def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_records): - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 test_dataset.update(0, {"expected_output": None}) @@ -687,7 +687,7 @@ def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_re test_dataset.push() assert len(test_dataset) == 1 - assert 
test_dataset.current_version == 2 + assert test_dataset.latest_version == 2 assert test_dataset.version == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of France?"} @@ -705,7 +705,7 @@ def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_re assert len(ds) == 1 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds.current_version == 2 + assert ds.latest_version == 2 assert ds.version == 2 @@ -722,7 +722,7 @@ def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_re ], ) def test_dataset_modify_record_on_input(llmobs, test_dataset, test_dataset_records): - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 test_dataset.update(0, {"input_data": "A"}) assert test_dataset[0]["input_data"] == "A" @@ -731,7 +731,7 @@ def test_dataset_modify_record_on_input(llmobs, test_dataset, test_dataset_recor test_dataset.push() assert len(test_dataset) == 1 - assert test_dataset.current_version == 2 + assert test_dataset.latest_version == 2 assert test_dataset.version == 2 assert test_dataset[0]["input_data"] == "A" @@ -751,7 +751,7 @@ def test_dataset_modify_record_on_input(llmobs, test_dataset, test_dataset_recor assert len(ds) == 1 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds.current_version == 2 + assert ds.latest_version == 2 assert ds.version == 2 @@ -764,7 +764,7 @@ def test_dataset_append(llmobs, test_dataset): DatasetRecord(input_data={"prompt": "What is the capital of Italy?"}, expected_output={"answer": "Rome"}) ) assert len(test_dataset) == 2 - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 wait_for_backend() @@ -776,13 +776,13 @@ def test_dataset_append(llmobs, test_dataset): assert test_dataset[1]["expected_output"] == {"answer": "Rome"} assert test_dataset.name == test_dataset.name assert test_dataset.description == test_dataset.description - assert test_dataset.current_version == 2 + assert test_dataset.latest_version == 2 assert test_dataset.version == 2 # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds.current_version == 2 + assert ds.latest_version == 2 assert ds.version == 2 assert len(ds) == 2 # note: it looks like dataset order is not deterministic @@ -806,7 +806,7 @@ def test_dataset_extend(llmobs, test_dataset): ] ) assert len(test_dataset) == 3 - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 wait_for_backend() @@ -820,13 +820,13 @@ def test_dataset_extend(llmobs, test_dataset): assert test_dataset[2]["expected_output"] == {"answer": "Stockholm"} assert test_dataset.name == test_dataset.name assert test_dataset.description == test_dataset.description - assert test_dataset.current_version == 2 + assert test_dataset.latest_version == 2 assert test_dataset.version == 2 # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds.current_version == 2 + assert ds.latest_version == 2 assert ds.version == 2 assert len(ds) == 3 assert ds[2]["input_data"] == {"prompt": "What is the capital of France?"} @@ -844,7 +844,7 @@ def test_dataset_extend(llmobs, test_dataset): def test_dataset_append_no_expected_output(llmobs, test_dataset): 
test_dataset.append(DatasetRecord(input_data={"prompt": "What is the capital of Sealand?"})) assert len(test_dataset) == 2 - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 wait_for_backend() @@ -856,13 +856,13 @@ def test_dataset_append_no_expected_output(llmobs, test_dataset): assert "expected_output" not in test_dataset[1] assert test_dataset.name == test_dataset.name assert test_dataset.description == test_dataset.description - assert test_dataset.current_version == 2 + assert test_dataset.latest_version == 2 assert test_dataset.version == 2 # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds.current_version == 2 + assert ds.latest_version == 2 assert ds.version == 2 assert len(ds) == 2 # note: it looks like dataset order is not deterministic @@ -886,12 +886,12 @@ def test_dataset_append_no_expected_output(llmobs, test_dataset): def test_dataset_delete(llmobs, test_dataset): test_dataset.delete(0) assert len(test_dataset) == 1 - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 wait_for_backend() test_dataset.push() - assert test_dataset.current_version == 2 + assert test_dataset.latest_version == 2 assert test_dataset.version == 2 assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Italy?"} @@ -902,7 +902,7 @@ def test_dataset_delete(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds.current_version == 2 + assert ds.latest_version == 2 assert ds.version == 2 assert len(ds) == 1 assert ds[0]["input_data"] == {"prompt": "What is the capital of Italy?"} @@ -921,12 +921,12 @@ def test_dataset_delete(llmobs, test_dataset): def test_dataset_delete_no_expected_output(llmobs, test_dataset): test_dataset.delete(1) assert len(test_dataset) == 1 - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 wait_for_backend() test_dataset.push() - assert test_dataset.current_version == 2 + assert test_dataset.latest_version == 2 assert test_dataset.version == 2 assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Nauru?"} @@ -937,7 +937,7 @@ def test_dataset_delete_no_expected_output(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds.current_version == 2 + assert ds.latest_version == 2 assert ds.version == 2 assert len(ds) == 1 assert ds[0]["input_data"] == {"prompt": "What is the capital of Nauru?"} @@ -962,12 +962,12 @@ def test_dataset_delete_after_update(llmobs, test_dataset): test_dataset.delete(0) assert len(test_dataset) == 1 - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 wait_for_backend() test_dataset.push() - assert test_dataset.current_version == 2 + assert test_dataset.latest_version == 2 assert test_dataset.version == 2 assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Italy?"} @@ -978,7 +978,7 @@ def test_dataset_delete_after_update(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = 
llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds.current_version == 2 + assert ds.latest_version == 2 assert ds.version == 2 assert len(ds) == 1 assert ds[0]["input_data"] == {"prompt": "What is the capital of Italy?"} @@ -1004,7 +1004,7 @@ def test_dataset_delete_after_append(llmobs, test_dataset): test_dataset.delete(0) # all that remains should be Italy and Sweden questions assert len(test_dataset) == 2 - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 assert len(test_dataset._new_records_by_record_id) == 1 @@ -1012,7 +1012,7 @@ def test_dataset_delete_after_append(llmobs, test_dataset): wait_for_backend() test_dataset.push() - assert test_dataset.current_version == 2 + assert test_dataset.latest_version == 2 assert test_dataset.version == 2 assert len(test_dataset) == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Italy?"} @@ -1025,7 +1025,7 @@ def test_dataset_delete_after_append(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds.current_version == 2 + assert ds.latest_version == 2 assert ds.version == 2 assert len(ds) == 2 sds = sorted(ds, key=lambda r: r["input_data"]["prompt"]) @@ -1195,7 +1195,7 @@ def test_experiment_create(llmobs, test_dataset_one_record): project = llmobs._instance._dne_client.project_create_or_get("test-project") project_id = project.get("_id") exp_id, exp_run_name = llmobs._instance._dne_client.experiment_create( - exp.name, exp._dataset._id, project_id, exp._dataset.current_version, exp._config + exp.name, exp._dataset._id, project_id, exp._dataset.latest_version, exp._config ) assert exp_id is not None assert exp_run_name.startswith("test_experiment") From 48e4ca9f8792af531bf54b95875c16140ae53d7f Mon Sep 17 00:00:00 2001 From: gary-huang Date: Wed, 22 Oct 2025 15:22:45 -0400 Subject: [PATCH 7/7] fresh url options after list records call --- ddtrace/llmobs/_writer.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index be22a54aa86..1f70c0c3bae 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -3,7 +3,6 @@ import json import os import tempfile -import urllib from typing import Any from typing import Dict from typing import List @@ -12,6 +11,7 @@ from typing import TypedDict from typing import Union from typing import cast +import urllib from urllib.parse import quote from urllib.parse import urlparse @@ -495,15 +495,16 @@ def dataset_get_with_records( list_base_path = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" - url_options = {} - if version: - url_options["filter[version]"] = version - has_next_page = True class_records: List[DatasetRecord] = [] - list_path = f"{list_base_path}?{urllib.parse.urlencode(url_options, safe='[]')}" page_num = 0 + url_options = {} while has_next_page: + if version: + url_options["filter[version]"] = version + + list_path = f"{list_base_path}?{urllib.parse.urlencode(url_options, safe='[]')}" + logger.debug("list records page %d, request path=%s", page_num, list_path) resp = self.request("GET", list_path, timeout=self.LIST_RECORDS_TIMEOUT) if resp.status != 200: raise ValueError( @@ -523,12 +524,12 @@ def dataset_get_with_records( } ) next_cursor = records_data.get("meta", {}).get("after") + + url_options = {} has_next_page = False if next_cursor: has_next_page = True 
url_options["page[cursor]"] = next_cursor - list_path = f"{list_base_path}?{urllib.parse.urlencode(url_options, safe='[]')}" - logger.debug("next list records request path %s", list_path) page_num += 1 return Dataset( name=dataset_name,