From 078a9c7c6c0236a4c60f94ef10d4b5fbc05009d8 Mon Sep 17 00:00:00 2001 From: gary-huang Date: Thu, 16 Oct 2025 15:37:31 -0400 Subject: [PATCH 1/7] allow versioned dataset pull --- ddtrace/llmobs/_llmobs.py | 4 ++-- ddtrace/llmobs/_writer.py | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 457ead64212..55275c7c620 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -669,8 +669,8 @@ def _on_asyncio_execute_task(self, task_data: Dict[str, Any]) -> None: self._llmobs_context_provider.activate(llmobs_ctx) @classmethod - def pull_dataset(cls, dataset_name: str, project_name: Optional[str] = None) -> Dataset: - ds = cls._instance._dne_client.dataset_get_with_records(dataset_name, (project_name or cls._project_name)) + def pull_dataset(cls, dataset_name: str, project_name: Optional[str] = None, version: Optional[int] = None) -> Dataset: + ds = cls._instance._dne_client.dataset_get_with_records(dataset_name, (project_name or cls._project_name), version) return ds @classmethod diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index e5f4cbcf333..48b767ac85a 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -458,10 +458,10 @@ def dataset_batch_update( new_record_ids: List[str] = [r["id"] for r in data] if data else [] return new_version, new_record_ids - def dataset_get_with_records(self, dataset_name: str, project_name: Optional[str] = None) -> Dataset: + def dataset_get_with_records(self, dataset_name: str, project_name: Optional[str] = None, version: Optional[int] = None) -> Dataset: project = self.project_create_or_get(project_name) project_id = project.get("_id") - logger.debug("getting records with project ID %s for %s", project_id, project_name) + logger.debug("getting records with project ID %s for %s, version: %s", project_id, project_name, str(version) or "latest") path = f"/api/unstable/llm-obs/v1/{project_id}/datasets?filter[name]={quote(dataset_name)}" resp = self.request("GET", path) @@ -480,6 +480,9 @@ def dataset_get_with_records(self, dataset_name: str, project_name: Optional[str dataset_id = data[0]["id"] list_base_path = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" + if version: + list_base_path = f"{list_base_path}?filter[version]={version}" + has_next_page = True class_records: List[DatasetRecord] = [] list_path = list_base_path @@ -507,7 +510,7 @@ def dataset_get_with_records(self, dataset_name: str, project_name: Optional[str has_next_page = False if next_cursor: has_next_page = True - list_path = f"{list_base_path}?page[cursor]={next_cursor}" + list_path = f"{list_base_path}{"&" if version else "?"}page[cursor]={next_cursor}" logger.debug("next list records request path %s", list_path) page_num += 1 return Dataset( From c566d4372e90d21a248e2d9c930191a26fc5eb35 Mon Sep 17 00:00:00 2001 From: gary-huang Date: Fri, 17 Oct 2025 16:27:12 -0400 Subject: [PATCH 2/7] add current version and version as properties --- ddtrace/llmobs/_experiment.py | 17 ++++++- ddtrace/llmobs/_writer.py | 4 +- tests/llmobs/test_experiments.py | 82 ++++++++++++++++---------------- 3 files changed, 59 insertions(+), 44 deletions(-) diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py index 7bfb80c0ea5..2e0932c7ab4 100644 --- a/ddtrace/llmobs/_experiment.py +++ b/ddtrace/llmobs/_experiment.py @@ -107,6 +107,7 @@ class Dataset: _id: str _records: List[DatasetRecord] _version: int + _current_version: int _dne_client: 
"LLMObsExperimentsClient" _new_records_by_record_id: Dict[str, DatasetRecordRaw] _updated_record_ids_to_new_fields: Dict[str, UpdatableDatasetRecord] @@ -121,6 +122,7 @@ def __init__( dataset_id: str, records: List[DatasetRecord], description: str, + current_version: int, version: int, _dne_client: "LLMObsExperimentsClient", ) -> None: @@ -128,6 +130,7 @@ def __init__( self.project = project self.description = description self._id = dataset_id + self._current_version = current_version self._version = version self._dne_client = _dne_client self._records = records @@ -168,7 +171,10 @@ def push(self) -> None: record["record_id"] = record_id # type: ignore # FIXME: we don't get version numbers in responses to deletion requests - self._version = new_version if new_version != -1 else self._version + 1 + self._current_version = new_version if new_version != -1 else self._current_version + 1 + # no matter what the version was before the push, pushing will result in the dataset being on the current + # version tracked by the backend + self._version = self._current_version self._new_records_by_record_id = {} self._deleted_record_ids = [] self._updated_record_ids_to_new_fields = {} @@ -225,6 +231,14 @@ def url(self) -> str: # FIXME: will not work for subdomain orgs return f"{_get_base_url()}/llm/datasets/{self._id}" + @property + def current_version(self) -> int: + return self._current_version + + @property + def version(self) -> int: + return self._version + def _estimate_delta_size(self) -> int: """rough estimate (in bytes) of the size of the next batch update call if it happens""" size = len(safe_json(self._new_records_by_record_id)) + len(safe_json(self._updated_record_ids_to_new_fields)) @@ -434,6 +448,7 @@ def _run_task(self, jobs: int, raise_errors: bool = False, sample_size: Optional dataset_id=self._dataset._id, records=subset_records, description=self._dataset.description, + current_version=self._dataset._current_version, version=self._dataset._version, _dne_client=self._dataset._dne_client, ) diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index 48b767ac85a..cedea1a1c69 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -400,7 +400,7 @@ def dataset_create( if dataset_id is None or dataset_id == "": raise ValueError(f"unexpected dataset state, invalid ID (is None: {dataset_id is None})") curr_version = response_data["data"]["attributes"]["current_version"] - return Dataset(dataset_name, project, dataset_id, [], description, curr_version, _dne_client=self) + return Dataset(dataset_name, project, dataset_id, [], description, curr_version, curr_version, _dne_client=self) @staticmethod def _get_record_json(record: Union[UpdatableDatasetRecord, DatasetRecordRaw], is_update: bool) -> JSONType: @@ -514,7 +514,7 @@ def dataset_get_with_records(self, dataset_name: str, project_name: Optional[str logger.debug("next list records request path %s", list_path) page_num += 1 return Dataset( - dataset_name, project, dataset_id, class_records, dataset_description, curr_version, _dne_client=self + dataset_name, project, dataset_id, class_records, dataset_description, curr_version, version or curr_version, _dne_client=self ) def dataset_bulk_upload(self, dataset_id: str, records: List[DatasetRecord]): diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py index dd36225d589..76a3898a657 100644 --- a/tests/llmobs/test_experiments.py +++ b/tests/llmobs/test_experiments.py @@ -300,7 +300,7 @@ def test_dataset_csv_no_expected_output(llmobs, 
tmp_csv_file_for_upload): assert len(ds) == len(dataset) assert ds.name == dataset.name assert ds.description == dataset.description - assert ds._version == 1 + assert ds._current_version == 1 finally: if dataset_id: llmobs._delete_dataset(dataset_id=dataset_id) @@ -347,7 +347,7 @@ def test_dataset_csv(llmobs, tmp_csv_file_for_upload): assert len(ds) == len(dataset) assert ds.name == dataset.name assert ds.description == dataset.description - assert ds._version == 1 + assert ds._current_version == 1 finally: if dataset_id: llmobs._delete_dataset(dataset_id=dataset_id) @@ -400,7 +400,7 @@ def test_dataset_csv_pipe_separated(llmobs, tmp_csv_file_for_upload): assert len(ds) == len(dataset) assert ds.name == dataset.name assert ds.description == dataset.description - assert ds._version == 1 + assert ds._current_version == 1 finally: if dataset_id: llmobs._delete_dataset(dataset_id=dataset._id) @@ -423,7 +423,7 @@ def test_dataset_pull_large_num_records(llmobs, test_dataset_large_num_records): assert len(pds) == len(test_dataset_large_num_records) assert pds.name == test_dataset_large_num_records.name assert pds.description == test_dataset_large_num_records.description - assert pds._version == test_dataset_large_num_records._version == 1 + assert pds._current_version == test_dataset_large_num_records._current_version == 1 dataset = sorted(pds, key=lambda r: int(r["input_data"].lstrip("input_"))) for i, d in enumerate(dataset): @@ -450,7 +450,7 @@ def test_dataset_pull_exists_with_record(llmobs, test_dataset_one_record): assert dataset[0]["expected_output"] == {"answer": "Paris"} assert dataset.name == test_dataset_one_record.name assert dataset.description == test_dataset_one_record.description - assert dataset._version == test_dataset_one_record._version == 1 + assert dataset._current_version == test_dataset_one_record._current_version == 1 def test_dataset_pull_from_project(llmobs, test_dataset_one_record_separate_project): @@ -464,7 +464,7 @@ def test_dataset_pull_from_project(llmobs, test_dataset_one_record_separate_proj assert dataset[0]["expected_output"] == {"answer": "Boston"} assert dataset.name == test_dataset_one_record_separate_project.name assert dataset.description == test_dataset_one_record_separate_project.description - assert dataset._version == test_dataset_one_record_separate_project._version == 1 + assert dataset._current_version == test_dataset_one_record_separate_project._current_version == 1 @pytest.mark.parametrize( @@ -479,7 +479,7 @@ def test_dataset_pull_from_project(llmobs, test_dataset_one_record_separate_proj ], ) def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_dataset_records): - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 test_dataset.update( 0, @@ -518,7 +518,7 @@ def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_datase test_dataset.push() assert len(test_dataset) == 2 - assert test_dataset._version == 2 + assert test_dataset._current_version == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Germany?"} assert test_dataset[0]["expected_output"] == {"answer": "Berlin"} @@ -548,7 +548,7 @@ def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_datase assert len(ds) == 2 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds._version == 2 + assert ds._current_version == 2 @pytest.mark.parametrize( @@ -556,7 +556,7 @@ def test_dataset_modify_records_multiple_times(llmobs, test_dataset, 
test_datase [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], ) def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records): - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 test_dataset.update( 0, @@ -568,7 +568,7 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records test_dataset.push() assert len(test_dataset) == 1 - assert test_dataset._version == 2 + assert test_dataset._current_version == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Germany?"} assert test_dataset[0]["expected_output"] == {"answer": "Berlin"} @@ -585,7 +585,7 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records assert len(ds) == 1 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds._version == 2 + assert ds._current_version == 2 @pytest.mark.parametrize( @@ -593,7 +593,7 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], ) def test_dataset_modify_single_record_empty_record(llmobs, test_dataset, test_dataset_records): - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 with pytest.raises( ValueError, @@ -615,7 +615,7 @@ def test_dataset_estimate_size(llmobs, test_dataset): [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], ) def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_records): - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 test_dataset.update(0, {"expected_output": None}) assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of France?"} @@ -624,7 +624,7 @@ def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_re test_dataset.push() assert len(test_dataset) == 1 - assert test_dataset._version == 2 + assert test_dataset._current_version == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of France?"} assert test_dataset[0]["expected_output"] is None @@ -641,7 +641,7 @@ def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_re assert len(ds) == 1 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds._version == 2 + assert ds._current_version == 2 @pytest.mark.parametrize( @@ -657,7 +657,7 @@ def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_re ], ) def test_dataset_modify_record_on_input(llmobs, test_dataset, test_dataset_records): - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 test_dataset.update(0, {"input_data": "A"}) assert test_dataset[0]["input_data"] == "A" assert test_dataset[0]["expected_output"] == {"answer": "Paris"} @@ -665,7 +665,7 @@ def test_dataset_modify_record_on_input(llmobs, test_dataset, test_dataset_recor test_dataset.push() assert len(test_dataset) == 1 - assert test_dataset._version == 2 + assert test_dataset._current_version == 2 assert test_dataset[0]["input_data"] == "A" assert test_dataset[0]["expected_output"] == {"answer": "Paris"} @@ -684,7 +684,7 @@ def test_dataset_modify_record_on_input(llmobs, test_dataset, test_dataset_recor assert len(ds) == 1 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert 
ds._version == 2 + assert ds._current_version == 2 @pytest.mark.parametrize( @@ -696,7 +696,7 @@ def test_dataset_append(llmobs, test_dataset): DatasetRecord(input_data={"prompt": "What is the capital of Italy?"}, expected_output={"answer": "Rome"}) ) assert len(test_dataset) == 2 - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 wait_for_backend() test_dataset.push() @@ -707,12 +707,12 @@ def test_dataset_append(llmobs, test_dataset): assert test_dataset[1]["expected_output"] == {"answer": "Rome"} assert test_dataset.name == test_dataset.name assert test_dataset.description == test_dataset.description - assert test_dataset._version == 2 + assert test_dataset._current_version == 2 # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._version == 2 + assert ds._current_version == 2 assert len(ds) == 2 # note: it looks like dataset order is not deterministic assert ds[1]["input_data"] == {"prompt": "What is the capital of France?"} @@ -735,7 +735,7 @@ def test_dataset_extend(llmobs, test_dataset): ] ) assert len(test_dataset) == 3 - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 wait_for_backend() test_dataset.push() @@ -748,12 +748,12 @@ def test_dataset_extend(llmobs, test_dataset): assert test_dataset[2]["expected_output"] == {"answer": "Stockholm"} assert test_dataset.name == test_dataset.name assert test_dataset.description == test_dataset.description - assert test_dataset._version == 2 + assert test_dataset._current_version == 2 # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._version == 2 + assert ds._current_version == 2 assert len(ds) == 3 assert ds[2]["input_data"] == {"prompt": "What is the capital of France?"} # order is non deterministic @@ -770,7 +770,7 @@ def test_dataset_extend(llmobs, test_dataset): def test_dataset_append_no_expected_output(llmobs, test_dataset): test_dataset.append(DatasetRecord(input_data={"prompt": "What is the capital of Sealand?"})) assert len(test_dataset) == 2 - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 wait_for_backend() test_dataset.push() @@ -781,12 +781,12 @@ def test_dataset_append_no_expected_output(llmobs, test_dataset): assert "expected_output" not in test_dataset[1] assert test_dataset.name == test_dataset.name assert test_dataset.description == test_dataset.description - assert test_dataset._version == 2 + assert test_dataset._current_version == 2 # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._version == 2 + assert ds._current_version == 2 assert len(ds) == 2 # note: it looks like dataset order is not deterministic assert ds[1]["input_data"] == {"prompt": "What is the capital of France?"} @@ -809,11 +809,11 @@ def test_dataset_append_no_expected_output(llmobs, test_dataset): def test_dataset_delete(llmobs, test_dataset): test_dataset.delete(0) assert len(test_dataset) == 1 - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 wait_for_backend() test_dataset.push() - assert test_dataset._version == 2 + assert test_dataset._current_version == 2 assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert test_dataset[0]["expected_output"] == {"answer": "Rome"} @@ -823,7 +823,7 @@ 
def test_dataset_delete(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._version == 2 + assert ds._current_version == 2 assert len(ds) == 1 assert ds[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert ds[0]["expected_output"] == {"answer": "Rome"} @@ -841,11 +841,11 @@ def test_dataset_delete(llmobs, test_dataset): def test_dataset_delete_no_expected_output(llmobs, test_dataset): test_dataset.delete(1) assert len(test_dataset) == 1 - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 wait_for_backend() test_dataset.push() - assert test_dataset._version == 2 + assert test_dataset._current_version == 2 assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Nauru?"} assert "expected_output" not in test_dataset[0] @@ -855,7 +855,7 @@ def test_dataset_delete_no_expected_output(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._version == 2 + assert ds._current_version == 2 assert len(ds) == 1 assert ds[0]["input_data"] == {"prompt": "What is the capital of Nauru?"} assert ds[0]["expected_output"] is None @@ -879,11 +879,11 @@ def test_dataset_delete_after_update(llmobs, test_dataset): test_dataset.delete(0) assert len(test_dataset) == 1 - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 wait_for_backend() test_dataset.push() - assert test_dataset._version == 2 + assert test_dataset._current_version == 2 assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert test_dataset[0]["expected_output"] == {"answer": "Rome"} @@ -893,7 +893,7 @@ def test_dataset_delete_after_update(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._version == 2 + assert ds._current_version == 2 assert len(ds) == 1 assert ds[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert ds[0]["expected_output"] == {"answer": "Rome"} @@ -918,14 +918,14 @@ def test_dataset_delete_after_append(llmobs, test_dataset): test_dataset.delete(0) # all that remains should be Italy and Sweden questions assert len(test_dataset) == 2 - assert test_dataset._version == 1 + assert test_dataset._current_version == 1 assert len(test_dataset._new_records_by_record_id) == 1 assert len(test_dataset._deleted_record_ids) == 1 wait_for_backend() test_dataset.push() - assert test_dataset._version == 2 + assert test_dataset._current_version == 2 assert len(test_dataset) == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert test_dataset[0]["expected_output"] == {"answer": "Rome"} @@ -937,7 +937,7 @@ def test_dataset_delete_after_append(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._version == 2 + assert ds._current_version == 2 assert len(ds) == 2 sds = sorted(ds, key=lambda r: r["input_data"]["prompt"]) assert sds[0]["input_data"] == {"prompt": "What is the capital of Italy?"} @@ -1106,7 +1106,7 @@ def test_experiment_create(llmobs, test_dataset_one_record): project = llmobs._instance._dne_client.project_create_or_get("test-project") project_id 
= project.get("_id") exp_id, exp_run_name = llmobs._instance._dne_client.experiment_create( - exp.name, exp._dataset._id, project_id, exp._dataset._version, exp._config + exp.name, exp._dataset._id, project_id, exp._dataset._current_version, exp._config ) assert exp_id is not None assert exp_run_name.startswith("test_experiment") From e75deb04088891a2f7850a7d723561a2b4aabb83 Mon Sep 17 00:00:00 2001 From: gary-huang Date: Tue, 21 Oct 2025 14:33:06 -0400 Subject: [PATCH 3/7] update tests to use new version properties and add tests for versioned pulls --- ddtrace/llmobs/_llmobs.py | 8 +- ddtrace/llmobs/_writer.py | 19 +- ...817fee85fe_batch_update_post_aa45718c.yaml | 48 +++++ ...ords_filter_version__420_get_0060e684.yaml | 46 +++++ ...fa63a95b56_batch_update_post_0226e07c.yaml | 48 +++++ ...fa63a95b56_batch_update_post_b64aaeeb.yaml | 48 +++++ ...ecords_filter_version__1_get_bcd9fab6.yaml | 46 +++++ ...b7a-50fa63a95b56_records_get_eea1d61b.yaml | 47 +++++ ...-obs_v1_datasets_delete_post_3f1a88cb.yaml | 46 +++++ ...-obs_v1_datasets_delete_post_516a93fa.yaml | 46 +++++ ...n_test_dataset_records0__get_12b16625.yaml | 46 +++++ ...s_test_dataset_records0__get_5cf2366a.yaml | 46 +++++ ...a-b892b7b6fbf9_datasets_post_5a9c5f1b.yaml | 46 +++++ ...a-b892b7b6fbf9_datasets_post_f3e0da1c.yaml | 46 +++++ tests/llmobs/test_experiments.py | 171 +++++++++++++----- 15 files changed, 710 insertions(+), 47 deletions(-) create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0bb93ae7-43c4-48ff-91e4-d2817fee85fe_batch_update_post_aa45718c.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0bb93ae7-43c4-48ff-91e4-d2817fee85fe_records_filter_version__420_get_0060e684.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_batch_update_post_0226e07c.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_batch_update_post_b64aaeeb.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_records_filter_version__1_get_bcd9fab6.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_records_get_eea1d61b.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_3f1a88cb.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_516a93fa.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_filter_name__test-dataset-test_dataset_pull_w_invalid_version_test_dataset_records0__get_12b16625.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_filter_name__test-dataset-test_dataset_pull_w_versions_test_dataset_records0__get_5cf2366a.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_post_5a9c5f1b.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_post_f3e0da1c.yaml diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 
55275c7c620..2a342e27889 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -669,8 +669,12 @@ def _on_asyncio_execute_task(self, task_data: Dict[str, Any]) -> None: self._llmobs_context_provider.activate(llmobs_ctx) @classmethod - def pull_dataset(cls, dataset_name: str, project_name: Optional[str] = None, version: Optional[int] = None) -> Dataset: - ds = cls._instance._dne_client.dataset_get_with_records(dataset_name, (project_name or cls._project_name), version) + def pull_dataset( + cls, dataset_name: str, project_name: Optional[str] = None, version: Optional[int] = None + ) -> Dataset: + ds = cls._instance._dne_client.dataset_get_with_records( + dataset_name, (project_name or cls._project_name), version + ) return ds @classmethod diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index cedea1a1c69..0f224996394 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -458,10 +458,14 @@ def dataset_batch_update( new_record_ids: List[str] = [r["id"] for r in data] if data else [] return new_version, new_record_ids - def dataset_get_with_records(self, dataset_name: str, project_name: Optional[str] = None, version: Optional[int] = None) -> Dataset: + def dataset_get_with_records( + self, dataset_name: str, project_name: Optional[str] = None, version: Optional[int] = None + ) -> Dataset: project = self.project_create_or_get(project_name) project_id = project.get("_id") - logger.debug("getting records with project ID %s for %s, version: %s", project_id, project_name, str(version) or "latest") + logger.debug( + "getting records with project ID %s for %s, version: %s", project_id, project_name, str(version) or "latest" + ) path = f"/api/unstable/llm-obs/v1/{project_id}/datasets?filter[name]={quote(dataset_name)}" resp = self.request("GET", path) @@ -510,11 +514,18 @@ def dataset_get_with_records(self, dataset_name: str, project_name: Optional[str has_next_page = False if next_cursor: has_next_page = True - list_path = f"{list_base_path}{"&" if version else "?"}page[cursor]={next_cursor}" + list_path = f"{list_base_path}{'&' if version else '?'}page[cursor]={next_cursor}" logger.debug("next list records request path %s", list_path) page_num += 1 return Dataset( - dataset_name, project, dataset_id, class_records, dataset_description, curr_version, version or curr_version, _dne_client=self + dataset_name, + project, + dataset_id, + class_records, + dataset_description, + curr_version, + version or curr_version, + _dne_client=self, ) def dataset_bulk_upload(self, dataset_id: str, records: List[DatasetRecord]): diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0bb93ae7-43c4-48ff-91e4-d2817fee85fe_batch_update_post_aa45718c.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0bb93ae7-43c4-48ff-91e4-d2817fee85fe_batch_update_post_aa45718c.yaml new file mode 100644 index 00000000000..b750ac7724c --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0bb93ae7-43c4-48ff-91e4-d2817fee85fe_batch_update_post_aa45718c.yaml @@ -0,0 +1,48 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "id": "0bb93ae7-43c4-48ff-91e4-d2817fee85fe", + "attributes": {"insert_records": [{"input": {"prompt": "What is the capital + of France?"}, "expected_output": {"answer": "Paris"}, "metadata": null}], "update_records": + [], "delete_records": []}}}' + headers: + Accept: + - '*/*' + ? 
!!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '271' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/0bb93ae7-43c4-48ff-91e4-d2817fee85fe/batch_update + response: + body: + string: '{"data":[{"id":"eaadecb4-836e-49b3-8390-212b3fffb60b","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:26:24.929416376Z","dataset_id":"0bb93ae7-43c4-48ff-91e4-d2817fee85fe","expected_output":{"answer":"Paris"},"input":{"prompt":"What + is the capital of France?"},"updated_at":"2025-10-21T18:26:24.929416376Z","version":1}}]}' + headers: + content-length: + - '389' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:26:24 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0bb93ae7-43c4-48ff-91e4-d2817fee85fe_records_filter_version__420_get_0060e684.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0bb93ae7-43c4-48ff-91e4-d2817fee85fe_records_filter_version__420_get_0060e684.yaml new file mode 100644 index 00000000000..3de4ae57834 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_0bb93ae7-43c4-48ff-91e4-d2817fee85fe_records_filter_version__420_get_0060e684.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + ? !!python/object/apply:multidict._multidict.istr + - Content-Length + : - '0' + ? 
!!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/0bb93ae7-43c4-48ff-91e4-d2817fee85fe/records?filter%5Bversion%5D=420 + response: + body: + string: '{"errors":[{"title":"Generic Error","detail":"invalid version: version + is greater than the current version or negative"}]}' + headers: + content-length: + - '122' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:26:27 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 400 + message: Bad Request +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_batch_update_post_0226e07c.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_batch_update_post_0226e07c.yaml new file mode 100644 index 00000000000..1db2779735a --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_batch_update_post_0226e07c.yaml @@ -0,0 +1,48 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "id": "4607e918-094d-4aa9-8b7a-50fa63a95b56", + "attributes": {"insert_records": [{"input": {"prompt": "What is the capital + of France?"}, "expected_output": {"answer": "Paris"}, "metadata": null}], "update_records": + [], "delete_records": []}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '271' + ? 
!!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/4607e918-094d-4aa9-8b7a-50fa63a95b56/batch_update + response: + body: + string: '{"data":[{"id":"93328f7a-bfd2-4672-8b94-76b0698cd754","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:23.855004356Z","dataset_id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","expected_output":{"answer":"Paris"},"input":{"prompt":"What + is the capital of France?"},"updated_at":"2025-10-21T18:25:23.855004356Z","version":1}}]}' + headers: + content-length: + - '389' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:25:23 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_batch_update_post_b64aaeeb.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_batch_update_post_b64aaeeb.yaml new file mode 100644 index 00000000000..aa56782bb87 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_batch_update_post_b64aaeeb.yaml @@ -0,0 +1,48 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "id": "4607e918-094d-4aa9-8b7a-50fa63a95b56", + "attributes": {"insert_records": [{"input": {"prompt": "What is the capital + of China?"}, "expected_output": {"answer": "Beijing"}, "metadata": null}], "update_records": + [], "delete_records": []}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '272' + ? 
!!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/4607e918-094d-4aa9-8b7a-50fa63a95b56/batch_update + response: + body: + string: '{"data":[{"id":"5bbd89ec-4eba-4f41-bd47-2a23a005a20a","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:26.024986597Z","dataset_id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","expected_output":{"answer":"Beijing"},"input":{"prompt":"What + is the capital of China?"},"updated_at":"2025-10-21T18:25:26.024986597Z","version":2}}]}' + headers: + content-length: + - '390' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:25:26 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_records_filter_version__1_get_bcd9fab6.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_records_filter_version__1_get_bcd9fab6.yaml new file mode 100644 index 00000000000..c903d760daf --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_records_filter_version__1_get_bcd9fab6.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + ? !!python/object/apply:multidict._multidict.istr + - Content-Length + : - '0' + ? 
!!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/4607e918-094d-4aa9-8b7a-50fa63a95b56/records?filter%5Bversion%5D=1 + response: + body: + string: '{"data":[{"id":"93328f7a-bfd2-4672-8b94-76b0698cd754","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:23.855004Z","dataset_id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","expected_output":{"answer":"Paris"},"input":{"prompt":"What + is the capital of France?"},"updated_at":"2025-10-21T18:25:23.855004Z"}}],"meta":{"after":""}}' + headers: + content-length: + - '391' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:25:34 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_records_get_eea1d61b.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_records_get_eea1d61b.yaml new file mode 100644 index 00000000000..61ee8d32222 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_4607e918-094d-4aa9-8b7a-50fa63a95b56_records_get_eea1d61b.yaml @@ -0,0 +1,47 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + ? !!python/object/apply:multidict._multidict.istr + - Content-Length + : - '0' + ? 
!!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/4607e918-094d-4aa9-8b7a-50fa63a95b56/records + response: + body: + string: '{"data":[{"id":"5bbd89ec-4eba-4f41-bd47-2a23a005a20a","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:26.024986Z","dataset_id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","expected_output":{"answer":"Beijing"},"input":{"prompt":"What + is the capital of China?"},"updated_at":"2025-10-21T18:25:26.024986Z"}},{"id":"93328f7a-bfd2-4672-8b94-76b0698cd754","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:23.855004Z","dataset_id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","expected_output":{"answer":"Paris"},"input":{"prompt":"What + is the capital of France?"},"updated_at":"2025-10-21T18:25:23.855004Z"}}],"meta":{"after":""}}' + headers: + content-length: + - '753' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:25:32 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_3f1a88cb.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_3f1a88cb.yaml new file mode 100644 index 00000000000..efd2579c8d3 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_3f1a88cb.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "attributes": {"type": "soft", "dataset_ids": + ["4607e918-094d-4aa9-8b7a-50fa63a95b56"]}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '119' + ? 
!!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/delete + response: + body: + string: '{"data":[{"id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:23.688284Z","current_version":2,"deleted_at":"2025-10-21T18:25:34.800676Z","description":"A + test dataset","name":"test-dataset-test_dataset_pull_w_versions[test_dataset_records0]","project_id":"f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9","updated_at":"2025-10-21T18:25:26.033352Z"}}]}' + headers: + content-length: + - '450' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:25:34 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_516a93fa.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_516a93fa.yaml new file mode 100644 index 00000000000..42896697f82 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_516a93fa.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "attributes": {"type": "soft", "dataset_ids": + ["0bb93ae7-43c4-48ff-91e4-d2817fee85fe"]}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '119' + ? 
!!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/delete + response: + body: + string: '{"data":[{"id":"0bb93ae7-43c4-48ff-91e4-d2817fee85fe","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:26:24.806202Z","current_version":1,"deleted_at":"2025-10-21T18:26:27.521296Z","description":"A + test dataset","name":"test-dataset-test_dataset_pull_w_invalid_version[test_dataset_records0]","project_id":"f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9","updated_at":"2025-10-21T18:26:24.946573Z"}}]}' + headers: + content-length: + - '457' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:26:27 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_filter_name__test-dataset-test_dataset_pull_w_invalid_version_test_dataset_records0__get_12b16625.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_filter_name__test-dataset-test_dataset_pull_w_invalid_version_test_dataset_records0__get_12b16625.yaml new file mode 100644 index 00000000000..bcc667c8752 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_filter_name__test-dataset-test_dataset_pull_w_invalid_version_test_dataset_records0__get_12b16625.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + ? !!python/object/apply:multidict._multidict.istr + - Content-Length + : - '0' + ? 
!!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9/datasets?filter%5Bname%5D=test-dataset-test_dataset_pull_w_invalid_version%5Btest_dataset_records0%5D + response: + body: + string: '{"data":[{"id":"0bb93ae7-43c4-48ff-91e4-d2817fee85fe","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:26:24.806202Z","current_version":1,"description":"A + test dataset","name":"test-dataset-test_dataset_pull_w_invalid_version[test_dataset_records0]","project_id":"f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9","updated_at":"2025-10-21T18:26:24.946573Z"}}],"meta":{"after":""}}' + headers: + content-length: + - '434' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:26:27 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_filter_name__test-dataset-test_dataset_pull_w_versions_test_dataset_records0__get_5cf2366a.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_filter_name__test-dataset-test_dataset_pull_w_versions_test_dataset_records0__get_5cf2366a.yaml new file mode 100644 index 00000000000..a8107245785 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_filter_name__test-dataset-test_dataset_pull_w_versions_test_dataset_records0__get_5cf2366a.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + ? !!python/object/apply:multidict._multidict.istr + - Content-Length + : - '0' + ? 
!!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9/datasets?filter%5Bname%5D=test-dataset-test_dataset_pull_w_versions%5Btest_dataset_records0%5D + response: + body: + string: '{"data":[{"id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:23.688284Z","current_version":2,"description":"A + test dataset","name":"test-dataset-test_dataset_pull_w_versions[test_dataset_records0]","project_id":"f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9","updated_at":"2025-10-21T18:25:26.033352Z"}}],"meta":{"after":""}}' + headers: + content-length: + - '427' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:25:31 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_post_5a9c5f1b.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_post_5a9c5f1b.yaml new file mode 100644 index 00000000000..3b03000d203 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_post_5a9c5f1b.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "attributes": {"name": "test-dataset-test_dataset_pull_w_versions[test_dataset_records0]", + "description": "A test dataset"}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '155' + ? 
!!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9/datasets + response: + body: + string: '{"data":{"id":"4607e918-094d-4aa9-8b7a-50fa63a95b56","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:25:23.688284458Z","current_version":0,"description":"A + test dataset","name":"test-dataset-test_dataset_pull_w_versions[test_dataset_records0]","project_id":"f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9","updated_at":"2025-10-21T18:25:23.688284458Z"}}}' + headers: + content-length: + - '411' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:25:23 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_post_f3e0da1c.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_post_f3e0da1c.yaml new file mode 100644 index 00000000000..17ca6f8adec --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9_datasets_post_f3e0da1c.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: '{"data": {"type": "datasets", "attributes": {"name": "test-dataset-test_dataset_pull_w_invalid_version[test_dataset_records0]", + "description": "A test dataset"}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '162' + ? 
!!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9/datasets + response: + body: + string: '{"data":{"id":"0bb93ae7-43c4-48ff-91e4-d2817fee85fe","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-21T18:26:24.80620202Z","current_version":0,"description":"A + test dataset","name":"test-dataset-test_dataset_pull_w_invalid_version[test_dataset_records0]","project_id":"f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9","updated_at":"2025-10-21T18:26:24.80620202Z"}}}' + headers: + content-length: + - '416' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Tue, 21 Oct 2025 18:26:24 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py index 76a3898a657..1646d1cb90b 100644 --- a/tests/llmobs/test_experiments.py +++ b/tests/llmobs/test_experiments.py @@ -300,7 +300,8 @@ def test_dataset_csv_no_expected_output(llmobs, tmp_csv_file_for_upload): assert len(ds) == len(dataset) assert ds.name == dataset.name assert ds.description == dataset.description - assert ds._current_version == 1 + assert ds.current_version == 1 + assert ds.current_version == ds.version finally: if dataset_id: llmobs._delete_dataset(dataset_id=dataset_id) @@ -347,7 +348,8 @@ def test_dataset_csv(llmobs, tmp_csv_file_for_upload): assert len(ds) == len(dataset) assert ds.name == dataset.name assert ds.description == dataset.description - assert ds._current_version == 1 + assert ds.current_version == 1 + assert ds.current_version == ds.version finally: if dataset_id: llmobs._delete_dataset(dataset_id=dataset_id) @@ -400,7 +402,8 @@ def test_dataset_csv_pipe_separated(llmobs, tmp_csv_file_for_upload): assert len(ds) == len(dataset) assert ds.name == dataset.name assert ds.description == dataset.description - assert ds._current_version == 1 + assert ds.current_version == 1 + assert ds.current_version == ds.version finally: if dataset_id: llmobs._delete_dataset(dataset_id=dataset._id) @@ -423,7 +426,8 @@ def test_dataset_pull_large_num_records(llmobs, test_dataset_large_num_records): assert len(pds) == len(test_dataset_large_num_records) assert pds.name == test_dataset_large_num_records.name assert pds.description == test_dataset_large_num_records.description - assert pds._current_version == test_dataset_large_num_records._current_version == 1 + assert pds.current_version == test_dataset_large_num_records.current_version == 1 + assert pds.version == test_dataset_large_num_records.version == 1 dataset = sorted(pds, key=lambda r: int(r["input_data"].lstrip("input_"))) for i, d in enumerate(dataset): @@ -450,7 +454,57 @@ def test_dataset_pull_exists_with_record(llmobs, test_dataset_one_record): assert dataset[0]["expected_output"] == {"answer": "Paris"} assert dataset.name == test_dataset_one_record.name assert dataset.description == test_dataset_one_record.description - assert dataset._current_version == 
test_dataset_one_record._current_version == 1 + assert dataset.current_version == test_dataset_one_record.current_version == 1 + assert dataset.version == test_dataset_one_record.version == 1 + + +@pytest.mark.parametrize( + "test_dataset_records", + [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], +) +def test_dataset_pull_w_versions(llmobs, test_dataset, test_dataset_records): + assert len(test_dataset) == 1 + assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of France?"} + assert test_dataset[0]["expected_output"] == {"answer": "Paris"} + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 + + test_dataset.append( + {"input_data": {"prompt": "What is the capital of China?"}, "expected_output": {"answer": "Beijing"}} + ) + test_dataset.push() + wait_for_backend(4) + + dataset_v2 = llmobs.pull_dataset(dataset_name=test_dataset.name) + assert len(dataset_v2) == 2 + assert dataset_v2[1]["input_data"] == {"prompt": "What is the capital of France?"} + assert dataset_v2[1]["expected_output"] == {"answer": "Paris"} + assert dataset_v2[0]["input_data"] == {"prompt": "What is the capital of China?"} + assert dataset_v2[0]["expected_output"] == {"answer": "Beijing"} + assert dataset_v2.name == test_dataset.name + assert dataset_v2.description == test_dataset.description + assert dataset_v2.current_version == test_dataset.current_version == 2 + assert dataset_v2.version == test_dataset.version == 2 + + dataset_v1 = llmobs.pull_dataset(dataset_name=test_dataset.name, version=1) + assert len(dataset_v1) == 1 + assert dataset_v1[0]["input_data"] == {"prompt": "What is the capital of France?"} + assert dataset_v1[0]["expected_output"] == {"answer": "Paris"} + assert dataset_v1.name == test_dataset.name + assert dataset_v1.description == test_dataset.description + assert dataset_v1.current_version == test_dataset.current_version == 2 + assert dataset_v1.version == 1 + + +@pytest.mark.parametrize( + "test_dataset_records", + [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], +) +def test_dataset_pull_w_invalid_version(llmobs, test_dataset, test_dataset_records): + with pytest.raises( + ValueError, match="Failed to pull dataset records for.*version is greater than the current version or negative" + ): + llmobs.pull_dataset(dataset_name=test_dataset.name, version=420) def test_dataset_pull_from_project(llmobs, test_dataset_one_record_separate_project): @@ -464,7 +518,8 @@ def test_dataset_pull_from_project(llmobs, test_dataset_one_record_separate_proj assert dataset[0]["expected_output"] == {"answer": "Boston"} assert dataset.name == test_dataset_one_record_separate_project.name assert dataset.description == test_dataset_one_record_separate_project.description - assert dataset._current_version == test_dataset_one_record_separate_project._current_version == 1 + assert dataset.current_version == test_dataset_one_record_separate_project.current_version == 1 + assert dataset.version == test_dataset_one_record_separate_project.version == 1 @pytest.mark.parametrize( @@ -479,7 +534,8 @@ def test_dataset_pull_from_project(llmobs, test_dataset_one_record_separate_proj ], ) def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_dataset_records): - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 test_dataset.update( 0, @@ -518,7 +574,8 @@ def 
test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_datase test_dataset.push() assert len(test_dataset) == 2 - assert test_dataset._current_version == 2 + assert test_dataset.current_version == 2 + assert test_dataset.version == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Germany?"} assert test_dataset[0]["expected_output"] == {"answer": "Berlin"} @@ -548,7 +605,8 @@ def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_datase assert len(ds) == 2 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds._current_version == 2 + assert ds.current_version == 2 + assert ds.version == 2 @pytest.mark.parametrize( @@ -556,7 +614,8 @@ def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_datase [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], ) def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records): - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 test_dataset.update( 0, @@ -568,7 +627,8 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records test_dataset.push() assert len(test_dataset) == 1 - assert test_dataset._current_version == 2 + assert test_dataset.current_version == 2 + assert test_dataset.version == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Germany?"} assert test_dataset[0]["expected_output"] == {"answer": "Berlin"} @@ -585,7 +645,8 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records assert len(ds) == 1 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds._current_version == 2 + assert ds.current_version == 2 + assert ds.version == 2 @pytest.mark.parametrize( @@ -593,7 +654,8 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], ) def test_dataset_modify_single_record_empty_record(llmobs, test_dataset, test_dataset_records): - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 with pytest.raises( ValueError, @@ -615,7 +677,8 @@ def test_dataset_estimate_size(llmobs, test_dataset): [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], ) def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_records): - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 test_dataset.update(0, {"expected_output": None}) assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of France?"} @@ -624,7 +687,8 @@ def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_re test_dataset.push() assert len(test_dataset) == 1 - assert test_dataset._current_version == 2 + assert test_dataset.current_version == 2 + assert test_dataset.version == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of France?"} assert test_dataset[0]["expected_output"] is None @@ -641,7 +705,8 @@ def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_re assert len(ds) == 1 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert 
ds._current_version == 2 + assert ds.current_version == 2 + assert ds.version == 2 @pytest.mark.parametrize( @@ -657,7 +722,8 @@ def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_re ], ) def test_dataset_modify_record_on_input(llmobs, test_dataset, test_dataset_records): - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 test_dataset.update(0, {"input_data": "A"}) assert test_dataset[0]["input_data"] == "A" assert test_dataset[0]["expected_output"] == {"answer": "Paris"} @@ -665,7 +731,8 @@ def test_dataset_modify_record_on_input(llmobs, test_dataset, test_dataset_recor test_dataset.push() assert len(test_dataset) == 1 - assert test_dataset._current_version == 2 + assert test_dataset.current_version == 2 + assert test_dataset.version == 2 assert test_dataset[0]["input_data"] == "A" assert test_dataset[0]["expected_output"] == {"answer": "Paris"} @@ -684,7 +751,8 @@ def test_dataset_modify_record_on_input(llmobs, test_dataset, test_dataset_recor assert len(ds) == 1 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds._current_version == 2 + assert ds.current_version == 2 + assert ds.version == 2 @pytest.mark.parametrize( @@ -696,7 +764,8 @@ def test_dataset_append(llmobs, test_dataset): DatasetRecord(input_data={"prompt": "What is the capital of Italy?"}, expected_output={"answer": "Rome"}) ) assert len(test_dataset) == 2 - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 wait_for_backend() test_dataset.push() @@ -707,12 +776,14 @@ def test_dataset_append(llmobs, test_dataset): assert test_dataset[1]["expected_output"] == {"answer": "Rome"} assert test_dataset.name == test_dataset.name assert test_dataset.description == test_dataset.description - assert test_dataset._current_version == 2 + assert test_dataset.current_version == 2 + assert test_dataset.version == 2 # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._current_version == 2 + assert ds.current_version == 2 + assert ds.version == 2 assert len(ds) == 2 # note: it looks like dataset order is not deterministic assert ds[1]["input_data"] == {"prompt": "What is the capital of France?"} @@ -735,7 +806,8 @@ def test_dataset_extend(llmobs, test_dataset): ] ) assert len(test_dataset) == 3 - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 wait_for_backend() test_dataset.push() @@ -748,12 +820,14 @@ def test_dataset_extend(llmobs, test_dataset): assert test_dataset[2]["expected_output"] == {"answer": "Stockholm"} assert test_dataset.name == test_dataset.name assert test_dataset.description == test_dataset.description - assert test_dataset._current_version == 2 + assert test_dataset.current_version == 2 + assert test_dataset.version == 2 # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._current_version == 2 + assert ds.current_version == 2 + assert ds.version == 2 assert len(ds) == 3 assert ds[2]["input_data"] == {"prompt": "What is the capital of France?"} # order is non deterministic @@ -770,7 +844,8 @@ def test_dataset_extend(llmobs, test_dataset): def test_dataset_append_no_expected_output(llmobs, test_dataset): test_dataset.append(DatasetRecord(input_data={"prompt": 
"What is the capital of Sealand?"})) assert len(test_dataset) == 2 - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 wait_for_backend() test_dataset.push() @@ -781,12 +856,14 @@ def test_dataset_append_no_expected_output(llmobs, test_dataset): assert "expected_output" not in test_dataset[1] assert test_dataset.name == test_dataset.name assert test_dataset.description == test_dataset.description - assert test_dataset._current_version == 2 + assert test_dataset.current_version == 2 + assert test_dataset.version == 2 # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._current_version == 2 + assert ds.current_version == 2 + assert ds.version == 2 assert len(ds) == 2 # note: it looks like dataset order is not deterministic assert ds[1]["input_data"] == {"prompt": "What is the capital of France?"} @@ -809,11 +886,13 @@ def test_dataset_append_no_expected_output(llmobs, test_dataset): def test_dataset_delete(llmobs, test_dataset): test_dataset.delete(0) assert len(test_dataset) == 1 - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 wait_for_backend() test_dataset.push() - assert test_dataset._current_version == 2 + assert test_dataset.current_version == 2 + assert test_dataset.version == 2 assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert test_dataset[0]["expected_output"] == {"answer": "Rome"} @@ -823,7 +902,8 @@ def test_dataset_delete(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._current_version == 2 + assert ds.current_version == 2 + assert ds.version == 2 assert len(ds) == 1 assert ds[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert ds[0]["expected_output"] == {"answer": "Rome"} @@ -841,11 +921,13 @@ def test_dataset_delete(llmobs, test_dataset): def test_dataset_delete_no_expected_output(llmobs, test_dataset): test_dataset.delete(1) assert len(test_dataset) == 1 - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 wait_for_backend() test_dataset.push() - assert test_dataset._current_version == 2 + assert test_dataset.current_version == 2 + assert test_dataset.version == 2 assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Nauru?"} assert "expected_output" not in test_dataset[0] @@ -855,7 +937,8 @@ def test_dataset_delete_no_expected_output(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._current_version == 2 + assert ds.current_version == 2 + assert ds.version == 2 assert len(ds) == 1 assert ds[0]["input_data"] == {"prompt": "What is the capital of Nauru?"} assert ds[0]["expected_output"] is None @@ -879,11 +962,13 @@ def test_dataset_delete_after_update(llmobs, test_dataset): test_dataset.delete(0) assert len(test_dataset) == 1 - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 wait_for_backend() test_dataset.push() - assert test_dataset._current_version == 2 + assert test_dataset.current_version == 2 + assert test_dataset.version == 2 
assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert test_dataset[0]["expected_output"] == {"answer": "Rome"} @@ -893,7 +978,8 @@ def test_dataset_delete_after_update(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._current_version == 2 + assert ds.current_version == 2 + assert ds.version == 2 assert len(ds) == 1 assert ds[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert ds[0]["expected_output"] == {"answer": "Rome"} @@ -918,14 +1004,16 @@ def test_dataset_delete_after_append(llmobs, test_dataset): test_dataset.delete(0) # all that remains should be Italy and Sweden questions assert len(test_dataset) == 2 - assert test_dataset._current_version == 1 + assert test_dataset.current_version == 1 + assert test_dataset.version == 1 assert len(test_dataset._new_records_by_record_id) == 1 assert len(test_dataset._deleted_record_ids) == 1 wait_for_backend() test_dataset.push() - assert test_dataset._current_version == 2 + assert test_dataset.current_version == 2 + assert test_dataset.version == 2 assert len(test_dataset) == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Italy?"} assert test_dataset[0]["expected_output"] == {"answer": "Rome"} @@ -937,7 +1025,8 @@ def test_dataset_delete_after_append(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds._current_version == 2 + assert ds.current_version == 2 + assert ds.version == 2 assert len(ds) == 2 sds = sorted(ds, key=lambda r: r["input_data"]["prompt"]) assert sds[0]["input_data"] == {"prompt": "What is the capital of Italy?"} @@ -1106,7 +1195,7 @@ def test_experiment_create(llmobs, test_dataset_one_record): project = llmobs._instance._dne_client.project_create_or_get("test-project") project_id = project.get("_id") exp_id, exp_run_name = llmobs._instance._dne_client.experiment_create( - exp.name, exp._dataset._id, project_id, exp._dataset._current_version, exp._config + exp.name, exp._dataset._id, project_id, exp._dataset.current_version, exp._config ) assert exp_id is not None assert exp_run_name.startswith("test_experiment") From 4555396b6e526f764c52fe4cc53aca7648665551 Mon Sep 17 00:00:00 2001 From: gary-huang Date: Tue, 21 Oct 2025 14:46:11 -0400 Subject: [PATCH 4/7] reno --- ...bs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml diff --git a/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml b/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml new file mode 100644 index 00000000000..912c2f243ba --- /dev/null +++ b/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml @@ -0,0 +1,5 @@ +--- +upgrade: + - | + LLM Observability: Previous dataset versions can be optionally pulled by passing the ``version`` + argument to ``LLMObs.pull_dataset`` From 0e0c15bd85b497d98d1f08bfb8c0866a637af436 Mon Sep 17 00:00:00 2001 From: Gary Huang Date: Tue, 21 Oct 2025 16:36:11 -0400 Subject: [PATCH 5/7] Update releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml Co-authored-by: Sam Brenner <106700075+sabrenner@users.noreply.github.com> --- 
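(Not part of the applied patch — a short usage sketch for reviewers of the behaviour this series adds. It assumes LLM Observability is already enabled and that a dataset named "capitals-qa" exists with at least two pushed versions; the dataset name and version numbers are purely illustrative.)

    from ddtrace.llmobs import LLMObs

    # Default behaviour is unchanged: pull the latest version of the dataset.
    ds_latest = LLMObs.pull_dataset(dataset_name="capitals-qa")

    # New in this series: pin the pull to an earlier snapshot of the records.
    ds_v1 = LLMObs.pull_dataset(dataset_name="capitals-qa", version=1)
    assert ds_v1.version == 1  # the version this Dataset object represents

Requesting a version that does not exist (greater than the dataset's current version, or negative) raises a ValueError, as exercised by test_dataset_pull_w_invalid_version above.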
...lmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml b/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml index 912c2f243ba..ef710697fcd 100644 --- a/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml +++ b/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml @@ -1,5 +1,5 @@ --- -upgrade: +features: - | LLM Observability: Previous dataset versions can be optionally pulled by passing the ``version`` argument to ``LLMObs.pull_dataset`` From 13c0407d21f95856cac4f9f4ec756d9d796feecd Mon Sep 17 00:00:00 2001 From: gary-huang Date: Wed, 22 Oct 2025 14:34:13 -0400 Subject: [PATCH 6/7] rename current version to latest version and use urlencode, and update release notes --- ddtrace/llmobs/_experiment.py | 16 ++-- ddtrace/llmobs/_writer.py | 35 ++++--- ...rsioned-dataset-pull-c7017f982b2c1f5b.yaml | 3 + tests/llmobs/test_experiments.py | 94 +++++++++---------- 4 files changed, 82 insertions(+), 66 deletions(-) diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py index 2e0932c7ab4..d31a4680626 100644 --- a/ddtrace/llmobs/_experiment.py +++ b/ddtrace/llmobs/_experiment.py @@ -107,7 +107,7 @@ class Dataset: _id: str _records: List[DatasetRecord] _version: int - _current_version: int + _latest_version: int _dne_client: "LLMObsExperimentsClient" _new_records_by_record_id: Dict[str, DatasetRecordRaw] _updated_record_ids_to_new_fields: Dict[str, UpdatableDatasetRecord] @@ -122,7 +122,7 @@ def __init__( dataset_id: str, records: List[DatasetRecord], description: str, - current_version: int, + latest_version: int, version: int, _dne_client: "LLMObsExperimentsClient", ) -> None: @@ -130,7 +130,7 @@ def __init__( self.project = project self.description = description self._id = dataset_id - self._current_version = current_version + self._latest_version = latest_version self._version = version self._dne_client = _dne_client self._records = records @@ -171,10 +171,10 @@ def push(self) -> None: record["record_id"] = record_id # type: ignore # FIXME: we don't get version numbers in responses to deletion requests - self._current_version = new_version if new_version != -1 else self._current_version + 1 + self._latest_version = new_version if new_version != -1 else self._latest_version + 1 # no matter what the version was before the push, pushing will result in the dataset being on the current # version tracked by the backend - self._version = self._current_version + self._version = self._latest_version self._new_records_by_record_id = {} self._deleted_record_ids = [] self._updated_record_ids_to_new_fields = {} @@ -232,8 +232,8 @@ def url(self) -> str: return f"{_get_base_url()}/llm/datasets/{self._id}" @property - def current_version(self) -> int: - return self._current_version + def latest_version(self) -> int: + return self._latest_version @property def version(self) -> int: @@ -448,7 +448,7 @@ def _run_task(self, jobs: int, raise_errors: bool = False, sample_size: Optional dataset_id=self._dataset._id, records=subset_records, description=self._dataset.description, - current_version=self._dataset._current_version, + latest_version=self._dataset._latest_version, version=self._dataset._version, _dne_client=self._dataset._dne_client, ) diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index 0f224996394..be22a54aa86 100644 --- 
a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -3,6 +3,7 @@ import json import os import tempfile +import urllib from typing import Any from typing import Dict from typing import List @@ -400,7 +401,16 @@ def dataset_create( if dataset_id is None or dataset_id == "": raise ValueError(f"unexpected dataset state, invalid ID (is None: {dataset_id is None})") curr_version = response_data["data"]["attributes"]["current_version"] - return Dataset(dataset_name, project, dataset_id, [], description, curr_version, curr_version, _dne_client=self) + return Dataset( + name=dataset_name, + project=project, + dataset_id=dataset_id, + records=[], + description=description, + latest_version=curr_version, + version=curr_version, + _dne_client=self, + ) @staticmethod def _get_record_json(record: Union[UpdatableDatasetRecord, DatasetRecordRaw], is_update: bool) -> JSONType: @@ -484,12 +494,14 @@ def dataset_get_with_records( dataset_id = data[0]["id"] list_base_path = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" + + url_options = {} if version: - list_base_path = f"{list_base_path}?filter[version]={version}" + url_options["filter[version]"] = version has_next_page = True class_records: List[DatasetRecord] = [] - list_path = list_base_path + list_path = f"{list_base_path}?{urllib.parse.urlencode(url_options, safe='[]')}" page_num = 0 while has_next_page: resp = self.request("GET", list_path, timeout=self.LIST_RECORDS_TIMEOUT) @@ -514,17 +526,18 @@ def dataset_get_with_records( has_next_page = False if next_cursor: has_next_page = True - list_path = f"{list_base_path}{'&' if version else '?'}page[cursor]={next_cursor}" + url_options["page[cursor]"] = next_cursor + list_path = f"{list_base_path}?{urllib.parse.urlencode(url_options, safe='[]')}" logger.debug("next list records request path %s", list_path) page_num += 1 return Dataset( - dataset_name, - project, - dataset_id, - class_records, - dataset_description, - curr_version, - version or curr_version, + name=dataset_name, + project=project, + dataset_id=dataset_id, + records=class_records, + description=dataset_description, + latest_version=curr_version, + version=version or curr_version, _dne_client=self, ) diff --git a/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml b/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml index ef710697fcd..f1f32a86735 100644 --- a/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml +++ b/releasenotes/notes/llmobs-dne-allow-versioned-dataset-pull-c7017f982b2c1f5b.yaml @@ -3,3 +3,6 @@ features: - | LLM Observability: Previous dataset versions can be optionally pulled by passing the ``version`` argument to ``LLMObs.pull_dataset`` + - | + LLM Observability: Datasets have new properties ``version`` and ``latest_version`` to provide information on the + version of the dataset that is being worked with and the latest global version of the dataset, respectively \ No newline at end of file diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py index 1646d1cb90b..cf4e44905f4 100644 --- a/tests/llmobs/test_experiments.py +++ b/tests/llmobs/test_experiments.py @@ -300,8 +300,8 @@ def test_dataset_csv_no_expected_output(llmobs, tmp_csv_file_for_upload): assert len(ds) == len(dataset) assert ds.name == dataset.name assert ds.description == dataset.description - assert ds.current_version == 1 - assert ds.current_version == ds.version + assert ds.latest_version == 1 + assert ds.latest_version == 
ds.version finally: if dataset_id: llmobs._delete_dataset(dataset_id=dataset_id) @@ -348,8 +348,8 @@ def test_dataset_csv(llmobs, tmp_csv_file_for_upload): assert len(ds) == len(dataset) assert ds.name == dataset.name assert ds.description == dataset.description - assert ds.current_version == 1 - assert ds.current_version == ds.version + assert ds.latest_version == 1 + assert ds.latest_version == ds.version finally: if dataset_id: llmobs._delete_dataset(dataset_id=dataset_id) @@ -402,8 +402,8 @@ def test_dataset_csv_pipe_separated(llmobs, tmp_csv_file_for_upload): assert len(ds) == len(dataset) assert ds.name == dataset.name assert ds.description == dataset.description - assert ds.current_version == 1 - assert ds.current_version == ds.version + assert ds.latest_version == 1 + assert ds.latest_version == ds.version finally: if dataset_id: llmobs._delete_dataset(dataset_id=dataset._id) @@ -426,7 +426,7 @@ def test_dataset_pull_large_num_records(llmobs, test_dataset_large_num_records): assert len(pds) == len(test_dataset_large_num_records) assert pds.name == test_dataset_large_num_records.name assert pds.description == test_dataset_large_num_records.description - assert pds.current_version == test_dataset_large_num_records.current_version == 1 + assert pds.latest_version == test_dataset_large_num_records.latest_version == 1 assert pds.version == test_dataset_large_num_records.version == 1 dataset = sorted(pds, key=lambda r: int(r["input_data"].lstrip("input_"))) @@ -454,7 +454,7 @@ def test_dataset_pull_exists_with_record(llmobs, test_dataset_one_record): assert dataset[0]["expected_output"] == {"answer": "Paris"} assert dataset.name == test_dataset_one_record.name assert dataset.description == test_dataset_one_record.description - assert dataset.current_version == test_dataset_one_record.current_version == 1 + assert dataset.latest_version == test_dataset_one_record.latest_version == 1 assert dataset.version == test_dataset_one_record.version == 1 @@ -466,7 +466,7 @@ def test_dataset_pull_w_versions(llmobs, test_dataset, test_dataset_records): assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of France?"} assert test_dataset[0]["expected_output"] == {"answer": "Paris"} - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 test_dataset.append( @@ -483,7 +483,7 @@ def test_dataset_pull_w_versions(llmobs, test_dataset, test_dataset_records): assert dataset_v2[0]["expected_output"] == {"answer": "Beijing"} assert dataset_v2.name == test_dataset.name assert dataset_v2.description == test_dataset.description - assert dataset_v2.current_version == test_dataset.current_version == 2 + assert dataset_v2.latest_version == test_dataset.latest_version == 2 assert dataset_v2.version == test_dataset.version == 2 dataset_v1 = llmobs.pull_dataset(dataset_name=test_dataset.name, version=1) @@ -492,7 +492,7 @@ def test_dataset_pull_w_versions(llmobs, test_dataset, test_dataset_records): assert dataset_v1[0]["expected_output"] == {"answer": "Paris"} assert dataset_v1.name == test_dataset.name assert dataset_v1.description == test_dataset.description - assert dataset_v1.current_version == test_dataset.current_version == 2 + assert dataset_v1.latest_version == test_dataset.latest_version == 2 assert dataset_v1.version == 1 @@ -518,7 +518,7 @@ def test_dataset_pull_from_project(llmobs, test_dataset_one_record_separate_proj assert dataset[0]["expected_output"] == {"answer": "Boston"} assert 
dataset.name == test_dataset_one_record_separate_project.name assert dataset.description == test_dataset_one_record_separate_project.description - assert dataset.current_version == test_dataset_one_record_separate_project.current_version == 1 + assert dataset.latest_version == test_dataset_one_record_separate_project.latest_version == 1 assert dataset.version == test_dataset_one_record_separate_project.version == 1 @@ -534,7 +534,7 @@ def test_dataset_pull_from_project(llmobs, test_dataset_one_record_separate_proj ], ) def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_dataset_records): - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 test_dataset.update( @@ -574,7 +574,7 @@ def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_datase test_dataset.push() assert len(test_dataset) == 2 - assert test_dataset.current_version == 2 + assert test_dataset.latest_version == 2 assert test_dataset.version == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Germany?"} @@ -605,7 +605,7 @@ def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_datase assert len(ds) == 2 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds.current_version == 2 + assert ds.latest_version == 2 assert ds.version == 2 @@ -614,7 +614,7 @@ def test_dataset_modify_records_multiple_times(llmobs, test_dataset, test_datase [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], ) def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records): - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 test_dataset.update( @@ -627,7 +627,7 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records test_dataset.push() assert len(test_dataset) == 1 - assert test_dataset.current_version == 2 + assert test_dataset.latest_version == 2 assert test_dataset.version == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Germany?"} @@ -645,7 +645,7 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records assert len(ds) == 1 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds.current_version == 2 + assert ds.latest_version == 2 assert ds.version == 2 @@ -654,7 +654,7 @@ def test_dataset_modify_single_record(llmobs, test_dataset, test_dataset_records [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], ) def test_dataset_modify_single_record_empty_record(llmobs, test_dataset, test_dataset_records): - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 with pytest.raises( @@ -677,7 +677,7 @@ def test_dataset_estimate_size(llmobs, test_dataset): [[DatasetRecord(input_data={"prompt": "What is the capital of France?"}, expected_output={"answer": "Paris"})]], ) def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_records): - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 test_dataset.update(0, {"expected_output": None}) @@ -687,7 +687,7 @@ def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_re test_dataset.push() assert len(test_dataset) == 1 - assert 
test_dataset.current_version == 2 + assert test_dataset.latest_version == 2 assert test_dataset.version == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of France?"} @@ -705,7 +705,7 @@ def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_re assert len(ds) == 1 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds.current_version == 2 + assert ds.latest_version == 2 assert ds.version == 2 @@ -722,7 +722,7 @@ def test_dataset_modify_record_on_optional(llmobs, test_dataset, test_dataset_re ], ) def test_dataset_modify_record_on_input(llmobs, test_dataset, test_dataset_records): - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 test_dataset.update(0, {"input_data": "A"}) assert test_dataset[0]["input_data"] == "A" @@ -731,7 +731,7 @@ def test_dataset_modify_record_on_input(llmobs, test_dataset, test_dataset_recor test_dataset.push() assert len(test_dataset) == 1 - assert test_dataset.current_version == 2 + assert test_dataset.latest_version == 2 assert test_dataset.version == 2 assert test_dataset[0]["input_data"] == "A" @@ -751,7 +751,7 @@ def test_dataset_modify_record_on_input(llmobs, test_dataset, test_dataset_recor assert len(ds) == 1 assert ds.name == test_dataset.name assert ds.description == test_dataset.description - assert ds.current_version == 2 + assert ds.latest_version == 2 assert ds.version == 2 @@ -764,7 +764,7 @@ def test_dataset_append(llmobs, test_dataset): DatasetRecord(input_data={"prompt": "What is the capital of Italy?"}, expected_output={"answer": "Rome"}) ) assert len(test_dataset) == 2 - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 wait_for_backend() @@ -776,13 +776,13 @@ def test_dataset_append(llmobs, test_dataset): assert test_dataset[1]["expected_output"] == {"answer": "Rome"} assert test_dataset.name == test_dataset.name assert test_dataset.description == test_dataset.description - assert test_dataset.current_version == 2 + assert test_dataset.latest_version == 2 assert test_dataset.version == 2 # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds.current_version == 2 + assert ds.latest_version == 2 assert ds.version == 2 assert len(ds) == 2 # note: it looks like dataset order is not deterministic @@ -806,7 +806,7 @@ def test_dataset_extend(llmobs, test_dataset): ] ) assert len(test_dataset) == 3 - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 wait_for_backend() @@ -820,13 +820,13 @@ def test_dataset_extend(llmobs, test_dataset): assert test_dataset[2]["expected_output"] == {"answer": "Stockholm"} assert test_dataset.name == test_dataset.name assert test_dataset.description == test_dataset.description - assert test_dataset.current_version == 2 + assert test_dataset.latest_version == 2 assert test_dataset.version == 2 # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds.current_version == 2 + assert ds.latest_version == 2 assert ds.version == 2 assert len(ds) == 3 assert ds[2]["input_data"] == {"prompt": "What is the capital of France?"} @@ -844,7 +844,7 @@ def test_dataset_extend(llmobs, test_dataset): def test_dataset_append_no_expected_output(llmobs, test_dataset): 
test_dataset.append(DatasetRecord(input_data={"prompt": "What is the capital of Sealand?"})) assert len(test_dataset) == 2 - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 wait_for_backend() @@ -856,13 +856,13 @@ def test_dataset_append_no_expected_output(llmobs, test_dataset): assert "expected_output" not in test_dataset[1] assert test_dataset.name == test_dataset.name assert test_dataset.description == test_dataset.description - assert test_dataset.current_version == 2 + assert test_dataset.latest_version == 2 assert test_dataset.version == 2 # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds.current_version == 2 + assert ds.latest_version == 2 assert ds.version == 2 assert len(ds) == 2 # note: it looks like dataset order is not deterministic @@ -886,12 +886,12 @@ def test_dataset_append_no_expected_output(llmobs, test_dataset): def test_dataset_delete(llmobs, test_dataset): test_dataset.delete(0) assert len(test_dataset) == 1 - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 wait_for_backend() test_dataset.push() - assert test_dataset.current_version == 2 + assert test_dataset.latest_version == 2 assert test_dataset.version == 2 assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Italy?"} @@ -902,7 +902,7 @@ def test_dataset_delete(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds.current_version == 2 + assert ds.latest_version == 2 assert ds.version == 2 assert len(ds) == 1 assert ds[0]["input_data"] == {"prompt": "What is the capital of Italy?"} @@ -921,12 +921,12 @@ def test_dataset_delete(llmobs, test_dataset): def test_dataset_delete_no_expected_output(llmobs, test_dataset): test_dataset.delete(1) assert len(test_dataset) == 1 - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 wait_for_backend() test_dataset.push() - assert test_dataset.current_version == 2 + assert test_dataset.latest_version == 2 assert test_dataset.version == 2 assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Nauru?"} @@ -937,7 +937,7 @@ def test_dataset_delete_no_expected_output(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds.current_version == 2 + assert ds.latest_version == 2 assert ds.version == 2 assert len(ds) == 1 assert ds[0]["input_data"] == {"prompt": "What is the capital of Nauru?"} @@ -962,12 +962,12 @@ def test_dataset_delete_after_update(llmobs, test_dataset): test_dataset.delete(0) assert len(test_dataset) == 1 - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 wait_for_backend() test_dataset.push() - assert test_dataset.current_version == 2 + assert test_dataset.latest_version == 2 assert test_dataset.version == 2 assert len(test_dataset) == 1 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Italy?"} @@ -978,7 +978,7 @@ def test_dataset_delete_after_update(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = 
llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds.current_version == 2 + assert ds.latest_version == 2 assert ds.version == 2 assert len(ds) == 1 assert ds[0]["input_data"] == {"prompt": "What is the capital of Italy?"} @@ -1004,7 +1004,7 @@ def test_dataset_delete_after_append(llmobs, test_dataset): test_dataset.delete(0) # all that remains should be Italy and Sweden questions assert len(test_dataset) == 2 - assert test_dataset.current_version == 1 + assert test_dataset.latest_version == 1 assert test_dataset.version == 1 assert len(test_dataset._new_records_by_record_id) == 1 @@ -1012,7 +1012,7 @@ def test_dataset_delete_after_append(llmobs, test_dataset): wait_for_backend() test_dataset.push() - assert test_dataset.current_version == 2 + assert test_dataset.latest_version == 2 assert test_dataset.version == 2 assert len(test_dataset) == 2 assert test_dataset[0]["input_data"] == {"prompt": "What is the capital of Italy?"} @@ -1025,7 +1025,7 @@ def test_dataset_delete_after_append(llmobs, test_dataset): # check that a pulled dataset matches the pushed dataset wait_for_backend() ds = llmobs.pull_dataset(dataset_name=test_dataset.name) - assert ds.current_version == 2 + assert ds.latest_version == 2 assert ds.version == 2 assert len(ds) == 2 sds = sorted(ds, key=lambda r: r["input_data"]["prompt"]) @@ -1195,7 +1195,7 @@ def test_experiment_create(llmobs, test_dataset_one_record): project = llmobs._instance._dne_client.project_create_or_get("test-project") project_id = project.get("_id") exp_id, exp_run_name = llmobs._instance._dne_client.experiment_create( - exp.name, exp._dataset._id, project_id, exp._dataset.current_version, exp._config + exp.name, exp._dataset._id, project_id, exp._dataset.latest_version, exp._config ) assert exp_id is not None assert exp_run_name.startswith("test_experiment") From 48e4ca9f8792af531bf54b95875c16140ae53d7f Mon Sep 17 00:00:00 2001 From: gary-huang Date: Wed, 22 Oct 2025 15:22:45 -0400 Subject: [PATCH 7/7] fresh url options after list records call --- ddtrace/llmobs/_writer.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index be22a54aa86..1f70c0c3bae 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -3,7 +3,6 @@ import json import os import tempfile -import urllib from typing import Any from typing import Dict from typing import List @@ -12,6 +11,7 @@ from typing import TypedDict from typing import Union from typing import cast +import urllib from urllib.parse import quote from urllib.parse import urlparse @@ -495,15 +495,16 @@ def dataset_get_with_records( list_base_path = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" - url_options = {} - if version: - url_options["filter[version]"] = version - has_next_page = True class_records: List[DatasetRecord] = [] - list_path = f"{list_base_path}?{urllib.parse.urlencode(url_options, safe='[]')}" page_num = 0 + url_options = {} while has_next_page: + if version: + url_options["filter[version]"] = version + + list_path = f"{list_base_path}?{urllib.parse.urlencode(url_options, safe='[]')}" + logger.debug("list records page %d, request path=%s", page_num, list_path) resp = self.request("GET", list_path, timeout=self.LIST_RECORDS_TIMEOUT) if resp.status != 200: raise ValueError( @@ -523,12 +524,12 @@ def dataset_get_with_records( } ) next_cursor = records_data.get("meta", {}).get("after") + + url_options = {} has_next_page = False if next_cursor: has_next_page = True 
url_options["page[cursor]"] = next_cursor - list_path = f"{list_base_path}?{urllib.parse.urlencode(url_options, safe='[]')}" - logger.debug("next list records request path %s", list_path) page_num += 1 return Dataset( name=dataset_name,