From 9a9a0f56fb0f43dca7c97403af8ed3a1d0e3b0e1 Mon Sep 17 00:00:00 2001 From: Simon Alibert <75076266+aliberts@users.noreply.github.com> Date: Fri, 29 Nov 2024 19:04:00 +0100 Subject: [PATCH] Dataset v2.0 (#461) Co-authored-by: Remi --- .github/PULL_REQUEST_TEMPLATE.md | 2 +- .github/workflows/nightly-tests.yml | 8 +- .github/workflows/test.yml | 5 +- CONTRIBUTING.md | 2 +- README.md | 14 +- benchmarks/video/run_video_benchmark.py | 2 +- examples/10_use_so100.md | 9 +- examples/11_use_moss.md | 9 +- examples/1_load_lerobot_dataset.py | 123 +- examples/3_train_policy.py | 2 +- examples/6_add_image_transforms.py | 6 +- examples/7_get_started_with_real_robot.md | 10 +- examples/8_use_stretch.md | 2 - examples/9_use_aloha.md | 7 +- .../advanced/2_calculate_validation_loss.py | 36 +- examples/port_datasets/pusht_zarr.py | 222 +++ lerobot/__init__.py | 4 +- lerobot/common/datasets/card_template.md | 27 + lerobot/common/datasets/compute_stats.py | 39 +- lerobot/common/datasets/factory.py | 5 +- lerobot/common/datasets/image_writer.py | 160 ++ lerobot/common/datasets/lerobot_dataset.py | 1070 ++++++++++-- lerobot/common/datasets/online_buffer.py | 6 +- .../push_dataset_to_hub/aloha_hdf5_format.py | 2 +- .../push_dataset_to_hub/cam_png_format.py | 7 +- .../dora_parquet_format.py | 2 +- .../push_dataset_to_hub/openx_rlds_format.py | 2 +- .../push_dataset_to_hub/pusht_zarr_format.py | 2 +- .../push_dataset_to_hub/umi_zarr_format.py | 2 +- .../datasets/push_dataset_to_hub/utils.py | 57 + .../push_dataset_to_hub/xarm_pkl_format.py | 2 +- lerobot/common/datasets/utils.py | 632 ++++--- .../v2/batch_convert_dataset_v1_to_v2.py | 882 ++++++++++ .../datasets/v2/convert_dataset_v1_to_v2.py | 665 +++++++ lerobot/common/datasets/video_utils.py | 146 +- .../robot_devices/cameras/intelrealsense.py | 4 + .../common/robot_devices/cameras/opencv.py | 4 + lerobot/common/robot_devices/control_utils.py | 35 +- .../robot_devices/robots/manipulator.py | 36 + lerobot/common/robot_devices/robots/utils.py | 1 + lerobot/scripts/control_robot.py | 157 +- lerobot/scripts/eval.py | 2 +- lerobot/scripts/push_dataset_to_hub.py | 10 +- lerobot/scripts/train.py | 14 +- lerobot/scripts/visualize_dataset.py | 26 +- lerobot/scripts/visualize_dataset_html.py | 62 +- lerobot/scripts/visualize_image_transforms.py | 2 +- .../templates/visualize_dataset_template.html | 2 +- poetry.lock | 1534 +++++++++-------- pyproject.toml | 6 +- tests/conftest.py | 7 + tests/fixtures/constants.py | 29 + tests/fixtures/dataset_factories.py | 396 +++++ tests/fixtures/files.py | 114 ++ tests/fixtures/hub.py | 105 ++ .../save_image_transforms_to_safetensors.py | 2 +- tests/scripts/save_policy_to_safetensors.py | 2 +- tests/test_control_robot.py | 127 +- tests/test_datasets.py | 138 +- tests/test_delta_timestamps.py | 256 +++ tests/test_examples.py | 27 +- tests/test_image_transforms.py | 108 +- tests/test_image_writer.py | 359 ++++ tests/test_online_buffer.py | 64 +- tests/test_policies.py | 10 +- tests/test_push_dataset_to_hub.py | 5 +- tests/test_sampler.py | 2 +- tests/test_utils.py | 17 +- tests/test_visualize_dataset.py | 18 +- tests/test_visualize_dataset_html.py | 20 +- 70 files changed, 6111 insertions(+), 1761 deletions(-) create mode 100644 examples/port_datasets/pusht_zarr.py create mode 100644 lerobot/common/datasets/card_template.md create mode 100644 lerobot/common/datasets/image_writer.py create mode 100644 lerobot/common/datasets/v2/batch_convert_dataset_v1_to_v2.py create mode 100644 lerobot/common/datasets/v2/convert_dataset_v1_to_v2.py 
create mode 100644 tests/fixtures/constants.py create mode 100644 tests/fixtures/dataset_factories.py create mode 100644 tests/fixtures/files.py create mode 100644 tests/fixtures/hub.py create mode 100644 tests/test_delta_timestamps.py create mode 100644 tests/test_image_writer.py diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 4063e395f..2e6fd4490 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -21,7 +21,7 @@ Provide a simple way for the reviewer to try out your changes. Examples: ```bash -DATA_DIR=tests/data pytest -sx tests/test_stuff.py::test_something +pytest -sx tests/test_stuff.py::test_something ``` ```bash python lerobot/scripts/train.py --some.option=true diff --git a/.github/workflows/nightly-tests.yml b/.github/workflows/nightly-tests.yml index f967533ae..bbee19a17 100644 --- a/.github/workflows/nightly-tests.yml +++ b/.github/workflows/nightly-tests.yml @@ -7,10 +7,8 @@ on: schedule: - cron: "0 2 * * *" -env: - DATA_DIR: tests/data +# env: # SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }} - jobs: run_all_tests_cpu: name: CPU @@ -30,13 +28,9 @@ jobs: working-directory: /lerobot steps: - name: Tests - env: - DATA_DIR: tests/data run: pytest -v --cov=./lerobot --disable-warnings tests - name: Tests end-to-end - env: - DATA_DIR: tests/data run: make test-end-to-end diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9d51def4d..5de071750 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -29,7 +29,6 @@ jobs: name: Pytest runs-on: ubuntu-latest env: - DATA_DIR: tests/data MUJOCO_GL: egl steps: - uses: actions/checkout@v4 @@ -70,7 +69,6 @@ jobs: name: Pytest (minimal install) runs-on: ubuntu-latest env: - DATA_DIR: tests/data MUJOCO_GL: egl steps: - uses: actions/checkout@v4 @@ -103,12 +101,11 @@ jobs: -W ignore::UserWarning:gymnasium.utils.env_checker:247 \ && rm -rf tests/outputs outputs - + # TODO(aliberts, rcadene): redesign after v2 migration / removing hydra end-to-end: name: End-to-end runs-on: ubuntu-latest env: - DATA_DIR: tests/data MUJOCO_GL: egl steps: - uses: actions/checkout@v4 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 822197ba2..b8c198568 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -267,7 +267,7 @@ We use `pytest` in order to run the tests. 
From the root of the repository, here's how to run tests with `pytest` for the library: ```bash -DATA_DIR="tests/data" python -m pytest -sv ./tests +python -m pytest -sv ./tests ``` diff --git a/README.md b/README.md index 5fbf74f44..9331bdeca 100644 --- a/README.md +++ b/README.md @@ -153,10 +153,12 @@ python lerobot/scripts/visualize_dataset.py \ --episode-index 0 ``` -or from a dataset in a local folder with the root `DATA_DIR` environment variable (in the following case the dataset will be searched for in `./my_local_data_dir/lerobot/pusht`) +or from a dataset in a local folder with the `root` option and the `--local-files-only` flag (in the following case the dataset will be searched for in `./my_local_data_dir/lerobot/pusht`) ```bash -DATA_DIR='./my_local_data_dir' python lerobot/scripts/visualize_dataset.py \ +python lerobot/scripts/visualize_dataset.py \ --repo-id lerobot/pusht \ + --root ./my_local_data_dir \ + --local-files-only 1 \ --episode-index 0 ``` @@ -208,12 +210,10 @@ dataset attributes: A `LeRobotDataset` is serialised using several widespread file formats for each of its parts, namely: - hf_dataset stored using Hugging Face datasets library serialization to parquet -- videos are stored in mp4 format to save space or png files -- episode_data_index saved using `safetensor` tensor serialization format -- stats saved using `safetensor` tensor serialization format -- info are saved using JSON +- videos are stored in mp4 format to save space +- metadata are stored in plain json/jsonl files -Dataset can be uploaded/downloaded from the HuggingFace hub seamlessly. To work on a local dataset, you can set the `DATA_DIR` environment variable to your root dataset folder as illustrated in the above section on dataset visualization. +Datasets can be uploaded/downloaded from the HuggingFace hub seamlessly. To work on a local dataset, you can use the `local_files_only` argument and specify its location with the `root` argument if it's not in the default `~/.cache/huggingface/lerobot` location.
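The same can be done in Python; a minimal sketch (assuming the dataset has already been saved under `./my_local_data_dir/lerobot/pusht`, with `root` pointing directly at that folder):

```python
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset

# Load a dataset that lives outside the default cache: point `root` at its folder on disk
# and set `local_files_only=True` to skip any hub download.
dataset = LeRobotDataset(
    "lerobot/pusht",
    root="./my_local_data_dir/lerobot/pusht",
    local_files_only=True,
)
print(dataset.num_episodes, dataset.num_frames)
```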
### Evaluate a pretrained policy diff --git a/benchmarks/video/run_video_benchmark.py b/benchmarks/video/run_video_benchmark.py index 46806c075..e90664872 100644 --- a/benchmarks/video/run_video_benchmark.py +++ b/benchmarks/video/run_video_benchmark.py @@ -266,7 +266,7 @@ def benchmark_encoding_decoding( ) ep_num_images = dataset.episode_data_index["to"][0].item() - width, height = tuple(dataset[0][dataset.camera_keys[0]].shape[-2:]) + width, height = tuple(dataset[0][dataset.meta.camera_keys[0]].shape[-2:]) num_pixels = width * height video_size_bytes = video_path.stat().st_size images_size_bytes = get_directory_size(imgs_dir) diff --git a/examples/10_use_so100.md b/examples/10_use_so100.md index 32d7f22dd..70e4ed8ba 100644 --- a/examples/10_use_so100.md +++ b/examples/10_use_so100.md @@ -192,7 +192,6 @@ Record 2 episodes and upload your dataset to the hub: python lerobot/scripts/control_robot.py record \ --robot-path lerobot/configs/robot/so100.yaml \ --fps 30 \ - --root data \ --repo-id ${HF_USER}/so100_test \ --tags so100 tutorial \ --warmup-time-s 5 \ @@ -212,7 +211,6 @@ echo ${HF_USER}/so100_test If you didn't upload with `--push-to-hub 0`, you can also visualize it locally with: ```bash python lerobot/scripts/visualize_dataset_html.py \ - --root data \ --repo-id ${HF_USER}/so100_test ``` @@ -220,10 +218,9 @@ python lerobot/scripts/visualize_dataset_html.py \ Now try to replay the first episode on your robot: ```bash -DATA_DIR=data python lerobot/scripts/control_robot.py replay \ +python lerobot/scripts/control_robot.py replay \ --robot-path lerobot/configs/robot/so100.yaml \ --fps 30 \ - --root data \ --repo-id ${HF_USER}/so100_test \ --episode 0 ``` @@ -232,7 +229,7 @@ DATA_DIR=data python lerobot/scripts/control_robot.py replay \ To train a policy to control your robot, use the [`python lerobot/scripts/train.py`](../lerobot/scripts/train.py) script. A few arguments are required. Here is an example command: ```bash -DATA_DIR=data python lerobot/scripts/train.py \ +python lerobot/scripts/train.py \ dataset_repo_id=${HF_USER}/so100_test \ policy=act_so100_real \ env=so100_real \ @@ -248,7 +245,6 @@ Let's explain it: 3. We provided an environment as argument with `env=so100_real`. This loads configurations from [`lerobot/configs/env/so100_real.yaml`](../lerobot/configs/env/so100_real.yaml). 4. We provided `device=cuda` since we are training on a Nvidia GPU, but you can also use `device=mps` if you are using a Mac with Apple silicon, or `device=cpu` otherwise. 5. We provided `wandb.enable=true` to use [Weights and Biases](https://docs.wandb.ai/quickstart) for visualizing training plots. This is optional but if you use it, make sure you are logged in by running `wandb login`. -6. We added `DATA_DIR=data` to access your dataset stored in your local `data` directory. If you dont provide `DATA_DIR`, your dataset will be downloaded from Hugging Face hub to your cache folder `$HOME/.cache/hugginface`. In future versions of `lerobot`, both directories will be in sync. Training should take several hours. You will find checkpoints in `outputs/train/act_so100_test/checkpoints`. 
@@ -259,7 +255,6 @@ You can use the `record` function from [`lerobot/scripts/control_robot.py`](../l python lerobot/scripts/control_robot.py record \ --robot-path lerobot/configs/robot/so100.yaml \ --fps 30 \ - --root data \ --repo-id ${HF_USER}/eval_act_so100_test \ --tags so100 tutorial eval \ --warmup-time-s 5 \ diff --git a/examples/11_use_moss.md b/examples/11_use_moss.md index 4ccbb93db..55d6fcaf9 100644 --- a/examples/11_use_moss.md +++ b/examples/11_use_moss.md @@ -192,7 +192,6 @@ Record 2 episodes and upload your dataset to the hub: python lerobot/scripts/control_robot.py record \ --robot-path lerobot/configs/robot/moss.yaml \ --fps 30 \ - --root data \ --repo-id ${HF_USER}/moss_test \ --tags moss tutorial \ --warmup-time-s 5 \ @@ -212,7 +211,6 @@ echo ${HF_USER}/moss_test If you didn't upload with `--push-to-hub 0`, you can also visualize it locally with: ```bash python lerobot/scripts/visualize_dataset_html.py \ - --root data \ --repo-id ${HF_USER}/moss_test ``` @@ -220,10 +218,9 @@ python lerobot/scripts/visualize_dataset_html.py \ Now try to replay the first episode on your robot: ```bash -DATA_DIR=data python lerobot/scripts/control_robot.py replay \ +python lerobot/scripts/control_robot.py replay \ --robot-path lerobot/configs/robot/moss.yaml \ --fps 30 \ - --root data \ --repo-id ${HF_USER}/moss_test \ --episode 0 ``` @@ -232,7 +229,7 @@ DATA_DIR=data python lerobot/scripts/control_robot.py replay \ To train a policy to control your robot, use the [`python lerobot/scripts/train.py`](../lerobot/scripts/train.py) script. A few arguments are required. Here is an example command: ```bash -DATA_DIR=data python lerobot/scripts/train.py \ +python lerobot/scripts/train.py \ dataset_repo_id=${HF_USER}/moss_test \ policy=act_moss_real \ env=moss_real \ @@ -248,7 +245,6 @@ Let's explain it: 3. We provided an environment as argument with `env=moss_real`. This loads configurations from [`lerobot/configs/env/moss_real.yaml`](../lerobot/configs/env/moss_real.yaml). 4. We provided `device=cuda` since we are training on a Nvidia GPU, but you can also use `device=mps` if you are using a Mac with Apple silicon, or `device=cpu` otherwise. 5. We provided `wandb.enable=true` to use [Weights and Biases](https://docs.wandb.ai/quickstart) for visualizing training plots. This is optional but if you use it, make sure you are logged in by running `wandb login`. -6. We added `DATA_DIR=data` to access your dataset stored in your local `data` directory. If you dont provide `DATA_DIR`, your dataset will be downloaded from Hugging Face hub to your cache folder `$HOME/.cache/hugginface`. In future versions of `lerobot`, both directories will be in sync. Training should take several hours. You will find checkpoints in `outputs/train/act_moss_test/checkpoints`. @@ -259,7 +255,6 @@ You can use the `record` function from [`lerobot/scripts/control_robot.py`](../l python lerobot/scripts/control_robot.py record \ --robot-path lerobot/configs/robot/moss.yaml \ --fps 30 \ - --root data \ --repo-id ${HF_USER}/eval_act_moss_test \ --tags moss tutorial eval \ --warmup-time-s 5 \ diff --git a/examples/1_load_lerobot_dataset.py b/examples/1_load_lerobot_dataset.py index 3846926a6..96c104b68 100644 --- a/examples/1_load_lerobot_dataset.py +++ b/examples/1_load_lerobot_dataset.py @@ -3,78 +3,120 @@ It illustrates how to load datasets, manipulate them, and apply transformations suitable for machine learning tasks in PyTorch. Features included in this script: -- Loading a dataset and accessing its properties. 
-- Filtering data by episode number. -- Converting tensor data for visualization. -- Saving video files from dataset frames. +- Viewing a dataset's metadata and exploring its properties. +- Loading an existing dataset from the hub or a subset of it. +- Accessing frames by episode number. - Using advanced dataset features like timestamp-based frame selection. - Demonstrating compatibility with PyTorch DataLoader for batch processing. The script ends with examples of how to batch process data using PyTorch's DataLoader. """ -from pathlib import Path from pprint import pprint -import imageio import torch +from huggingface_hub import HfApi import lerobot -from lerobot.common.datasets.lerobot_dataset import LeRobotDataset +from lerobot.common.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata +# We ported a number of existing datasets ourselves, use this to see the list: print("List of available datasets:") pprint(lerobot.available_datasets) -# Let's take one for this example -repo_id = "lerobot/pusht" - -# You can easily load a dataset from a Hugging Face repository +# You can also browse through the datasets created/ported by the community on the hub using the hub api: +hub_api = HfApi() +repo_ids = [info.id for info in hub_api.list_datasets(task_categories="robotics", tags=["LeRobot"])] +pprint(repo_ids) + +# Or simply explore them in your web browser directly at: +# https://huggingface.co/datasets?other=LeRobot + +# Let's take this one for this example +repo_id = "lerobot/aloha_mobile_cabinet" +# We can have a look and fetch its metadata to know more about it: +ds_meta = LeRobotDatasetMetadata(repo_id) + +# By instantiating just this class, you can quickly access useful information about the content and the +# structure of the dataset without downloading the actual data yet (only metadata files — which are +# lightweight). +print(f"Total number of episodes: {ds_meta.total_episodes}") +print(f"Average number of frames per episode: {ds_meta.total_frames / ds_meta.total_episodes:.3f}") +print(f"Frames per second used during data collection: {ds_meta.fps}") +print(f"Robot type: {ds_meta.robot_type}") +print(f"keys to access images from cameras: {ds_meta.camera_keys=}\n") + +print("Tasks:") +print(ds_meta.tasks) +print("Features:") +pprint(ds_meta.features) + +# You can also get a short summary by simply printing the object: +print(ds_meta) + +# You can then load the actual dataset from the hub. +# Either load any subset of episodes: +dataset = LeRobotDataset(repo_id, episodes=[0, 10, 11, 23]) + +# And see how many frames you have: +print(f"Selected episodes: {dataset.episodes}") +print(f"Number of episodes selected: {dataset.num_episodes}") +print(f"Number of frames selected: {dataset.num_frames}") + +# Or simply load the entire dataset: dataset = LeRobotDataset(repo_id) +print(f"Number of episodes selected: {dataset.num_episodes}") +print(f"Number of frames selected: {dataset.num_frames}") -# LeRobotDataset is actually a thin wrapper around an underlying Hugging Face dataset -# (see https://huggingface.co/docs/datasets/index for more information). 
-print(dataset) -print(dataset.hf_dataset) +# The previous metadata class is contained in the 'meta' attribute of the dataset: +print(dataset.meta) -# And provides additional utilities for robotics and compatibility with Pytorch -print(f"\naverage number of frames per episode: {dataset.num_samples / dataset.num_episodes:.3f}") -print(f"frames per second used during data collection: {dataset.fps=}") -print(f"keys to access images from cameras: {dataset.camera_keys=}\n") +# LeRobotDataset actually wraps an underlying Hugging Face dataset +# (see https://huggingface.co/docs/datasets for more information). print(dataset.hf_dataset) -# Access frame indexes associated to first episode +# LeRobot datasets also subclass PyTorch datasets so you can do everything you know and love from working +# with the latter, like iterating through the dataset. +# The __getitem__ iterates over the frames of the dataset. Since our datasets are also structured by +# episodes, you can access the frame indices of any episode using the episode_data_index. Here, we access +# frame indices associated to the first episode: episode_index = 0 from_idx = dataset.episode_data_index["from"][episode_index].item() to_idx = dataset.episode_data_index["to"][episode_index].item() -# LeRobot datasets actually subclass PyTorch datasets so you can do everything you know and love from working -# with the latter, like iterating through the dataset. Here we grab all the image frames. -frames = [dataset[idx]["observation.image"] for idx in range(from_idx, to_idx)] +# Then we grab all the image frames from the first camera: +camera_key = dataset.meta.camera_keys[0] +frames = [dataset[idx][camera_key] for idx in range(from_idx, to_idx)] -# Video frames are now float32 in range [0,1] channel first (c,h,w) to follow pytorch convention. To visualize -# them, we convert to uint8 in range [0,255] -frames = [(frame * 255).type(torch.uint8) for frame in frames] -# and to channel last (h,w,c). -frames = [frame.permute((1, 2, 0)).numpy() for frame in frames] +# The objects returned by the dataset are all torch.Tensors +print(type(frames[0])) +print(frames[0].shape) -# Finally, we save the frames to a mp4 video for visualization. -Path("outputs/examples/1_load_lerobot_dataset").mkdir(parents=True, exist_ok=True) -imageio.mimsave("outputs/examples/1_load_lerobot_dataset/episode_0.mp4", frames, fps=dataset.fps) +# Since we're using pytorch, the shape is in the pytorch channel-first convention (c, h, w). +# We can compare this shape with the information available for that feature +pprint(dataset.features[camera_key]) +# In particular: +print(dataset.features[camera_key]["shape"]) +# The shape is in (h, w, c) which is a more universal format. # For many machine learning applications we need to load the history of past observations or trajectories of # future actions. Our datasets can load previous and future frames for each key/modality, using timestamps # differences with the current loaded frame. For instance: delta_timestamps = { # loads 4 images: 1 second before current frame, 500 ms before, 200 ms before, and current frame - "observation.image": [-1, -0.5, -0.20, 0], - # loads 8 state vectors: 1.5 seconds before, 1 second before, ... 20 ms, 10 ms, and current frame - "observation.state": [-1.5, -1, -0.5, -0.20, -0.10, -0.02, -0.01, 0], + camera_key: [-1, -0.5, -0.20, 0], + # loads 6 state vectors: 1.5 seconds before, 1 second before, ...
200 ms, 100 ms, and current frame + "observation.state": [-1.5, -1, -0.5, -0.20, -0.10, 0], # loads 64 action vectors: current frame, 1 frame in the future, 2 frames, ... 63 frames in the future "action": [t / dataset.fps for t in range(64)], } +# Note that in any case, these delta_timestamps values need to be multiples of (1/fps) so that added to any +# timestamp, you still get a valid timestamp. + dataset = LeRobotDataset(repo_id, delta_timestamps=delta_timestamps) -print(f"\n{dataset[0]['observation.image'].shape=}") # (4,c,h,w) -print(f"{dataset[0]['observation.state'].shape=}") # (8,c) -print(f"{dataset[0]['action'].shape=}\n") # (64,c) +print(f"\n{dataset[0][camera_key].shape=}") # (4, c, h, w) +print(f"{dataset[0]['observation.state'].shape=}") # (6, c) +print(f"{dataset[0]['action'].shape=}\n") # (64, c) # Finally, our datasets are fully compatible with PyTorch dataloaders and samplers because they are just # PyTorch datasets. @@ -84,8 +126,9 @@ batch_size=32, shuffle=True, ) + for batch in dataloader: - print(f"{batch['observation.image'].shape=}") # (32,4,c,h,w) - print(f"{batch['observation.state'].shape=}") # (32,8,c) - print(f"{batch['action'].shape=}") # (32,64,c) + print(f"{batch[camera_key].shape=}") # (32, 4, c, h, w) + print(f"{batch['observation.state'].shape=}") # (32, 6, c) + print(f"{batch['action'].shape=}") # (32, 64, c) break diff --git a/examples/3_train_policy.py b/examples/3_train_policy.py index c5ce0d184..935ab2dbf 100644 --- a/examples/3_train_policy.py +++ b/examples/3_train_policy.py @@ -40,7 +40,7 @@ # For this example, no arguments need to be passed because the defaults are set up for PushT. # If you're doing something different, you will likely need to change at least some of the defaults. cfg = DiffusionConfig() -policy = DiffusionPolicy(cfg, dataset_stats=dataset.stats) +policy = DiffusionPolicy(cfg, dataset_stats=dataset.meta.stats) policy.train() policy.to(device) diff --git a/examples/6_add_image_transforms.py b/examples/6_add_image_transforms.py index bdcc6d7b9..82b70f5c1 100644 --- a/examples/6_add_image_transforms.py +++ b/examples/6_add_image_transforms.py @@ -1,7 +1,7 @@ """ This script demonstrates how to use torchvision's image transformation with LeRobotDataset for data augmentation purposes. The transformations are passed to the dataset as an argument upon creation, and -transforms are applied to the observation images before they are returned in the dataset's __get_item__. +transforms are applied to the observation images before they are returned in the dataset's __getitem__.
""" from pathlib import Path @@ -20,7 +20,7 @@ first_idx = dataset.episode_data_index["from"][0].item() # Get the frame corresponding to the first camera -frame = dataset[first_idx][dataset.camera_keys[0]] +frame = dataset[first_idx][dataset.meta.camera_keys[0]] # Define the transformations @@ -36,7 +36,7 @@ transformed_dataset = LeRobotDataset(dataset_repo_id, image_transforms=transforms) # Get a frame from the transformed dataset -transformed_frame = transformed_dataset[first_idx][transformed_dataset.camera_keys[0]] +transformed_frame = transformed_dataset[first_idx][transformed_dataset.meta.camera_keys[0]] # Create a directory to store output images output_dir = Path("outputs/image_transforms") diff --git a/examples/7_get_started_with_real_robot.md b/examples/7_get_started_with_real_robot.md index 9db7e2521..76408275d 100644 --- a/examples/7_get_started_with_real_robot.md +++ b/examples/7_get_started_with_real_robot.md @@ -778,7 +778,6 @@ Now run this to record 2 episodes: python lerobot/scripts/control_robot.py record \ --robot-path lerobot/configs/robot/koch.yaml \ --fps 30 \ - --root data \ --repo-id ${HF_USER}/koch_test \ --tags tutorial \ --warmup-time-s 5 \ @@ -787,7 +786,7 @@ python lerobot/scripts/control_robot.py record \ --num-episodes 2 ``` -This will write your dataset locally to `{root}/{repo-id}` (e.g. `data/cadene/koch_test`) and push it on the hub at `https://huggingface.co/datasets/{HF_USER}/{repo-id}`. Your dataset will be automatically tagged with `LeRobot` for the community to find it easily, and you can also add custom tags (in this case `tutorial` for example). +This will write your dataset locally to `~/.cache/huggingface/lerobot/{repo-id}` (e.g. `data/cadene/koch_test`) and push it on the hub at `https://huggingface.co/datasets/{HF_USER}/{repo-id}`. Your dataset will be automatically tagged with `LeRobot` for the community to find it easily, and you can also add custom tags (in this case `tutorial` for example). You can look for other LeRobot datasets on the hub by searching for `LeRobot` tags: https://huggingface.co/datasets?other=LeRobot @@ -840,7 +839,6 @@ In the coming months, we plan to release a foundational model for robotics. We a You can visualize your dataset by running: ```bash python lerobot/scripts/visualize_dataset_html.py \ - --root data \ --repo-id ${HF_USER}/koch_test ``` @@ -858,7 +856,6 @@ To replay the first episode of the dataset you just recorded, run the following python lerobot/scripts/control_robot.py replay \ --robot-path lerobot/configs/robot/koch.yaml \ --fps 30 \ - --root data \ --repo-id ${HF_USER}/koch_test \ --episode 0 ``` @@ -871,7 +868,7 @@ Your robot should replicate movements similar to those you recorded. For example To train a policy to control your robot, use the [`python lerobot/scripts/train.py`](../lerobot/scripts/train.py) script. A few arguments are required. Here is an example command: ```bash -DATA_DIR=data python lerobot/scripts/train.py \ +python lerobot/scripts/train.py \ dataset_repo_id=${HF_USER}/koch_test \ policy=act_koch_real \ env=koch_real \ @@ -918,7 +915,6 @@ env: It should match your dataset (e.g. `fps: 30`) and your robot (e.g. `state_dim: 6` and `action_dim: 6`). We are still working on simplifying this in future versions of `lerobot`. 4. We provided `device=cuda` since we are training on a Nvidia GPU, but you could use `device=mps` to train on Apple silicon. 5. We provided `wandb.enable=true` to use [Weights and Biases](https://docs.wandb.ai/quickstart) for visualizing training plots. 
This is optional but if you use it, make sure you are logged in by running `wandb login`. -6. We added `DATA_DIR=data` to access your dataset stored in your local `data` directory. If you dont provide `DATA_DIR`, your dataset will be downloaded from Hugging Face hub to your cache folder `$HOME/.cache/hugginface`. In future versions of `lerobot`, both directories will be in sync. For more information on the `train` script see the previous tutorial: [`examples/4_train_policy_with_script.md`](../examples/4_train_policy_with_script.md) @@ -991,7 +987,6 @@ To this end, you can use the `record` function from [`lerobot/scripts/control_ro python lerobot/scripts/control_robot.py record \ --robot-path lerobot/configs/robot/koch.yaml \ --fps 30 \ - --root data \ --repo-id ${HF_USER}/eval_koch_test \ --tags tutorial eval \ --warmup-time-s 5 \ @@ -1010,7 +1005,6 @@ As you can see, it's almost the same command as previously used to record your t You can then visualize your evaluation dataset by running the same command as before but with the new inference dataset as argument: ```bash python lerobot/scripts/visualize_dataset.py \ - --root data \ --repo-id ${HF_USER}/eval_koch_test ``` diff --git a/examples/8_use_stretch.md b/examples/8_use_stretch.md index 97caf2ba8..c2c306f07 100644 --- a/examples/8_use_stretch.md +++ b/examples/8_use_stretch.md @@ -128,7 +128,6 @@ Record one episode: python lerobot/scripts/control_robot.py record \ --robot-path lerobot/configs/robot/stretch.yaml \ --fps 20 \ - --root data \ --repo-id ${HF_USER}/stretch_test \ --tags stretch tutorial \ --warmup-time-s 3 \ @@ -146,7 +145,6 @@ Now try to replay this episode (make sure the robot's initial position is the sa python lerobot/scripts/control_robot.py replay \ --robot-path lerobot/configs/robot/stretch.yaml \ --fps 20 \ - --root data \ --repo-id ${HF_USER}/stretch_test \ --episode 0 ``` diff --git a/examples/9_use_aloha.md b/examples/9_use_aloha.md index cf5e1ceca..1abf7c495 100644 --- a/examples/9_use_aloha.md +++ b/examples/9_use_aloha.md @@ -84,7 +84,6 @@ python lerobot/scripts/control_robot.py record \ --robot-path lerobot/configs/robot/aloha.yaml \ --robot-overrides max_relative_target=null \ --fps 30 \ - --root data \ --repo-id ${HF_USER}/aloha_test \ --tags aloha tutorial \ --warmup-time-s 5 \ @@ -104,7 +103,6 @@ echo ${HF_USER}/aloha_test If you didn't upload with `--push-to-hub 0`, you can also visualize it locally with: ```bash python lerobot/scripts/visualize_dataset_html.py \ - --root data \ --repo-id ${HF_USER}/aloha_test ``` @@ -119,7 +117,6 @@ python lerobot/scripts/control_robot.py replay \ --robot-path lerobot/configs/robot/aloha.yaml \ --robot-overrides max_relative_target=null \ --fps 30 \ - --root data \ --repo-id ${HF_USER}/aloha_test \ --episode 0 ``` @@ -128,7 +125,7 @@ python lerobot/scripts/control_robot.py replay \ To train a policy to control your robot, use the [`python lerobot/scripts/train.py`](../lerobot/scripts/train.py) script. A few arguments are required. Here is an example command: ```bash -DATA_DIR=data python lerobot/scripts/train.py \ +python lerobot/scripts/train.py \ dataset_repo_id=${HF_USER}/aloha_test \ policy=act_aloha_real \ env=aloha_real \ @@ -144,7 +141,6 @@ Let's explain it: 3. We provided an environment as argument with `env=aloha_real`. This loads configurations from [`lerobot/configs/env/aloha_real.yaml`](../lerobot/configs/env/aloha_real.yaml). 
Note: this yaml defines 18 dimensions for the `state_dim` and `action_dim`, corresponding to 18 motors, not 14 motors as used in previous Aloha work. This is because, we include the `shoulder_shadow` and `elbow_shadow` motors for simplicity. 4. We provided `device=cuda` since we are training on a Nvidia GPU. 5. We provided `wandb.enable=true` to use [Weights and Biases](https://docs.wandb.ai/quickstart) for visualizing training plots. This is optional but if you use it, make sure you are logged in by running `wandb login`. -6. We added `DATA_DIR=data` to access your dataset stored in your local `data` directory. If you dont provide `DATA_DIR`, your dataset will be downloaded from Hugging Face hub to your cache folder `$HOME/.cache/hugginface`. In future versions of `lerobot`, both directories will be in sync. Training should take several hours. You will find checkpoints in `outputs/train/act_aloha_test/checkpoints`. @@ -156,7 +152,6 @@ python lerobot/scripts/control_robot.py record \ --robot-path lerobot/configs/robot/aloha.yaml \ --robot-overrides max_relative_target=null \ --fps 30 \ - --root data \ --repo-id ${HF_USER}/eval_act_aloha_test \ --tags aloha tutorial eval \ --warmup-time-s 5 \ diff --git a/examples/advanced/2_calculate_validation_loss.py b/examples/advanced/2_calculate_validation_loss.py index 1428014b6..00ba9930f 100644 --- a/examples/advanced/2_calculate_validation_loss.py +++ b/examples/advanced/2_calculate_validation_loss.py @@ -14,7 +14,7 @@ import torch from huggingface_hub import snapshot_download -from lerobot.common.datasets.lerobot_dataset import LeRobotDataset +from lerobot.common.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata from lerobot.common.policies.diffusion.modeling_diffusion import DiffusionPolicy device = torch.device("cuda") @@ -41,26 +41,20 @@ } # Load the last 10% of episodes of the dataset as a validation set. -# - Load full dataset -full_dataset = LeRobotDataset("lerobot/pusht", split="train") -# - Calculate train and val subsets -num_train_episodes = math.floor(full_dataset.num_episodes * 90 / 100) -num_val_episodes = full_dataset.num_episodes - num_train_episodes -print(f"Number of episodes in full dataset: {full_dataset.num_episodes}") -print(f"Number of episodes in training dataset (90% subset): {num_train_episodes}") -print(f"Number of episodes in validation dataset (10% subset): {num_val_episodes}") -# - Get first frame index of the validation set -first_val_frame_index = full_dataset.episode_data_index["from"][num_train_episodes].item() -# - Load frames subset belonging to validation set using the `split` argument. -# It utilizes the `datasets` library's syntax for slicing datasets. 
-# For more information on the Slice API, please see: -# https://huggingface.co/docs/datasets/v2.19.0/loading#slice-splits -train_dataset = LeRobotDataset( - "lerobot/pusht", split=f"train[:{first_val_frame_index}]", delta_timestamps=delta_timestamps -) -val_dataset = LeRobotDataset( - "lerobot/pusht", split=f"train[{first_val_frame_index}:]", delta_timestamps=delta_timestamps -) +# - Load dataset metadata +dataset_metadata = LeRobotDatasetMetadata("lerobot/pusht") +# - Calculate train and val episodes +total_episodes = dataset_metadata.total_episodes +episodes = list(range(dataset_metadata.total_episodes)) +num_train_episodes = math.floor(total_episodes * 90 / 100) +train_episodes = episodes[:num_train_episodes] +val_episodes = episodes[num_train_episodes:] +print(f"Number of episodes in full dataset: {total_episodes}") +print(f"Number of episodes in training dataset (90% subset): {len(train_episodes)}") +print(f"Number of episodes in validation dataset (10% subset): {len(val_episodes)}") +# - Load train an val datasets +train_dataset = LeRobotDataset("lerobot/pusht", episodes=train_episodes, delta_timestamps=delta_timestamps) +val_dataset = LeRobotDataset("lerobot/pusht", episodes=val_episodes, delta_timestamps=delta_timestamps) print(f"Number of frames in training dataset (90% subset): {len(train_dataset)}") print(f"Number of frames in validation dataset (10% subset): {len(val_dataset)}") diff --git a/examples/port_datasets/pusht_zarr.py b/examples/port_datasets/pusht_zarr.py new file mode 100644 index 000000000..60df98405 --- /dev/null +++ b/examples/port_datasets/pusht_zarr.py @@ -0,0 +1,222 @@ +import shutil +from pathlib import Path + +import numpy as np +import torch + +from lerobot.common.datasets.lerobot_dataset import LEROBOT_HOME, LeRobotDataset +from lerobot.common.datasets.push_dataset_to_hub._download_raw import download_raw + +PUSHT_TASK = "Push the T-shaped blue block onto the T-shaped green target surface." +PUSHT_FEATURES = { + "observation.state": { + "dtype": "float32", + "shape": (2,), + "names": { + "axes": ["x", "y"], + }, + }, + "action": { + "dtype": "float32", + "shape": (2,), + "names": { + "axes": ["x", "y"], + }, + }, + "next.reward": { + "dtype": "float32", + "shape": (1,), + "names": None, + }, + "next.success": { + "dtype": "bool", + "shape": (1,), + "names": None, + }, + "observation.environment_state": { + "dtype": "float32", + "shape": (16,), + "names": [ + "keypoints", + ], + }, + "observation.image": { + "dtype": None, + "shape": (3, 96, 96), + "names": [ + "channel", + "height", + "width", + ], + }, +} + + +def build_features(mode: str) -> dict: + features = PUSHT_FEATURES + if mode == "keypoints": + features.pop("observation.image") + else: + features.pop("observation.environment_state") + features["observation.image"]["dtype"] = mode + + return features + + +def load_raw_dataset(zarr_path: Path): + try: + from lerobot.common.datasets.push_dataset_to_hub._diffusion_policy_replay_buffer import ( + ReplayBuffer as DiffusionPolicyReplayBuffer, + ) + except ModuleNotFoundError as e: + print("`gym_pusht` is not installed. Please install it with `pip install 'lerobot[gym_pusht]'`") + raise e + + zarr_data = DiffusionPolicyReplayBuffer.copy_from_path(zarr_path) + return zarr_data + + +def calculate_coverage(zarr_data): + try: + import pymunk + from gym_pusht.envs.pusht import PushTEnv, pymunk_to_shapely + except ModuleNotFoundError as e: + print("`gym_pusht` is not installed. 
Please install it with `pip install 'lerobot[gym_pusht]'`") + raise e + + block_pos = zarr_data["state"][:, 2:4] + block_angle = zarr_data["state"][:, 4] + + num_frames = len(block_pos) + + coverage = np.zeros((num_frames,)) + # 8 keypoints with 2 coords each + keypoints = np.zeros((num_frames, 16)) + + # Set x, y, theta (in radians) + goal_pos_angle = np.array([256, 256, np.pi / 4]) + goal_body = PushTEnv.get_goal_pose_body(goal_pos_angle) + + for i in range(num_frames): + space = pymunk.Space() + space.gravity = 0, 0 + space.damping = 0 + + # Add walls. + walls = [ + PushTEnv.add_segment(space, (5, 506), (5, 5), 2), + PushTEnv.add_segment(space, (5, 5), (506, 5), 2), + PushTEnv.add_segment(space, (506, 5), (506, 506), 2), + PushTEnv.add_segment(space, (5, 506), (506, 506), 2), + ] + space.add(*walls) + + block_body, block_shapes = PushTEnv.add_tee(space, block_pos[i].tolist(), block_angle[i].item()) + goal_geom = pymunk_to_shapely(goal_body, block_body.shapes) + block_geom = pymunk_to_shapely(block_body, block_body.shapes) + intersection_area = goal_geom.intersection(block_geom).area + goal_area = goal_geom.area + coverage[i] = intersection_area / goal_area + keypoints[i] = torch.from_numpy(PushTEnv.get_keypoints(block_shapes).flatten()) + + return coverage, keypoints + + +def calculate_success(coverage: float, success_threshold: float): + return coverage > success_threshold + + +def calculate_reward(coverage: float, success_threshold: float): + return np.clip(coverage / success_threshold, 0, 1) + + +def main(raw_dir: Path, repo_id: str, mode: str = "video", push_to_hub: bool = True): + if mode not in ["video", "image", "keypoints"]: + raise ValueError(mode) + + if (LEROBOT_HOME / repo_id).exists(): + shutil.rmtree(LEROBOT_HOME / repo_id) + + if not raw_dir.exists(): + download_raw(raw_dir, repo_id="lerobot-raw/pusht_raw") + + zarr_data = load_raw_dataset(zarr_path=raw_dir / "pusht_cchi_v7_replay.zarr") + + env_state = zarr_data["state"][:] + agent_pos = env_state[:, :2] + + action = zarr_data["action"][:] + image = zarr_data["img"] # (b, h, w, c) + + episode_data_index = { + "from": np.concatenate(([0], zarr_data.meta["episode_ends"][:-1])), + "to": zarr_data.meta["episode_ends"], + } + + # Calculate success and reward based on the overlapping area + # of the T-object and the T-area. 
+ coverage, keypoints = calculate_coverage(zarr_data) + success = calculate_success(coverage, success_threshold=0.95) + reward = calculate_reward(coverage, success_threshold=0.95) + + features = build_features(mode) + dataset = LeRobotDataset.create( + repo_id=repo_id, + fps=10, + robot_type="2d pointer", + features=features, + image_writer_threads=4, + ) + episodes = range(len(episode_data_index["from"])) + for ep_idx in episodes: + from_idx = episode_data_index["from"][ep_idx] + to_idx = episode_data_index["to"][ep_idx] + num_frames = to_idx - from_idx + + for frame_idx in range(num_frames): + i = from_idx + frame_idx + frame = { + "action": torch.from_numpy(action[i]), + # Shift reward and success by +1 until the last item of the episode + "next.reward": reward[i + (frame_idx < num_frames - 1)], + "next.success": success[i + (frame_idx < num_frames - 1)], + } + + frame["observation.state"] = torch.from_numpy(agent_pos[i]) + + if mode == "keypoints": + frame["observation.environment_state"] = torch.from_numpy(keypoints[i]) + else: + frame["observation.image"] = torch.from_numpy(image[i]) + + dataset.add_frame(frame) + + dataset.save_episode(task=PUSHT_TASK) + + dataset.consolidate() + + if push_to_hub: + dataset.push_to_hub() + + +if __name__ == "__main__": + # To try this script, modify the repo id with your own HuggingFace user (e.g cadene/pusht) + repo_id = "lerobot/pusht" + + modes = ["video", "image", "keypoints"] + # Uncomment if you want to try with a specific mode + # modes = ["video"] + # modes = ["image"] + # modes = ["keypoints"] + + raw_dir = Path("data/lerobot-raw/pusht_raw") + for mode in modes: + if mode in ["image", "keypoints"]: + repo_id += f"_{mode}" + + # download and load raw dataset, create LeRobotDataset, populate it, push to hub + main(raw_dir, repo_id=repo_id, mode=mode) + + # Uncomment if you want to load the local dataset and explore it + # dataset = LeRobotDataset(repo_id=repo_id, local_files_only=True) + # breakpoint() diff --git a/lerobot/__init__.py b/lerobot/__init__.py index d462c7337..3d5bb6aaa 100644 --- a/lerobot/__init__.py +++ b/lerobot/__init__.py @@ -181,8 +181,8 @@ "lerobot/usc_cloth_sim", ] -available_datasets = list( - itertools.chain(*available_datasets_per_env.values(), available_real_world_datasets) +available_datasets = sorted( + set(itertools.chain(*available_datasets_per_env.values(), available_real_world_datasets)) ) # lists all available policies from `lerobot/common/policies` diff --git a/lerobot/common/datasets/card_template.md b/lerobot/common/datasets/card_template.md new file mode 100644 index 000000000..7ee27df95 --- /dev/null +++ b/lerobot/common/datasets/card_template.md @@ -0,0 +1,27 @@ +--- +# For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1 +# Doc / guide: https://huggingface.co/docs/hub/datasets-cards +{{ card_data }} +--- + +This dataset was created using [LeRobot](https://github.com/huggingface/lerobot). 
+ +## Dataset Description + +{{ dataset_description | default("", true) }} + +- **Homepage:** {{ url | default("[More Information Needed]", true)}} +- **Paper:** {{ paper | default("[More Information Needed]", true)}} +- **License:** {{ license | default("[More Information Needed]", true)}} + +## Dataset Structure + +{{ dataset_structure | default("[More Information Needed]", true)}} + +## Citation + +**BibTeX:** + +```bibtex +{{ citation_bibtex | default("[More Information Needed]", true)}} +``` diff --git a/lerobot/common/datasets/compute_stats.py b/lerobot/common/datasets/compute_stats.py index bafac2e1e..c62116994 100644 --- a/lerobot/common/datasets/compute_stats.py +++ b/lerobot/common/datasets/compute_stats.py @@ -19,9 +19,6 @@ import einops import torch import tqdm -from datasets import Image - -from lerobot.common.datasets.video_utils import VideoFrame def get_stats_einops_patterns(dataset, num_workers=0): @@ -39,15 +36,13 @@ def get_stats_einops_patterns(dataset, num_workers=0): batch = next(iter(dataloader)) stats_patterns = {} - for key, feats_type in dataset.features.items(): - # NOTE: skip language_instruction embedding in stats computation - if key == "language_instruction": - continue + for key in dataset.features: # sanity check that tensors are not float64 assert batch[key].dtype != torch.float64 - if isinstance(feats_type, (VideoFrame, Image)): + # if isinstance(feats_type, (VideoFrame, Image)): + if key in dataset.meta.camera_keys: # sanity check that images are channel first _, c, h, w = batch[key].shape assert c < h and c < w, f"expect channel first images, but instead {batch[key].shape}" @@ -63,7 +58,7 @@ def get_stats_einops_patterns(dataset, num_workers=0): elif batch[key].ndim == 1: stats_patterns[key] = "b -> 1" else: - raise ValueError(f"{key}, {feats_type}, {batch[key].shape}") + raise ValueError(f"{key}, {batch[key].shape}") return stats_patterns @@ -175,39 +170,45 @@ def aggregate_stats(ls_datasets) -> dict[str, torch.Tensor]: """ data_keys = set() for dataset in ls_datasets: - data_keys.update(dataset.stats.keys()) + data_keys.update(dataset.meta.stats.keys()) stats = {k: {} for k in data_keys} for data_key in data_keys: for stat_key in ["min", "max"]: # compute `max(dataset_0["max"], dataset_1["max"], ...)` stats[data_key][stat_key] = einops.reduce( - torch.stack([d.stats[data_key][stat_key] for d in ls_datasets if data_key in d.stats], dim=0), + torch.stack( + [ds.meta.stats[data_key][stat_key] for ds in ls_datasets if data_key in ds.meta.stats], + dim=0, + ), "n ... -> ...", stat_key, ) - total_samples = sum(d.num_samples for d in ls_datasets if data_key in d.stats) + total_samples = sum(d.num_frames for d in ls_datasets if data_key in d.meta.stats) # Compute the "sum" statistic by multiplying each mean by the number of samples in the respective # dataset, then divide by total_samples to get the overall "mean". - # NOTE: the brackets around (d.num_samples / total_samples) are needed tor minimize the risk of + # NOTE: the brackets around (d.num_frames / total_samples) are needed tor minimize the risk of # numerical overflow! stats[data_key]["mean"] = sum( - d.stats[data_key]["mean"] * (d.num_samples / total_samples) + d.meta.stats[data_key]["mean"] * (d.num_frames / total_samples) for d in ls_datasets - if data_key in d.stats + if data_key in d.meta.stats ) # The derivation for standard deviation is a little more involved but is much in the same spirit as # the computation of the mean. 
# Given two sets of data where the statistics are known: # σ_combined = sqrt[ (n1 * (σ1^2 + d1^2) + n2 * (σ2^2 + d2^2)) / (n1 + n2) ] # where d1 = μ1 - μ_combined, d2 = μ2 - μ_combined - # NOTE: the brackets around (d.num_samples / total_samples) are needed tor minimize the risk of + # NOTE: the brackets around (d.num_frames / total_samples) are needed tor minimize the risk of # numerical overflow! stats[data_key]["std"] = torch.sqrt( sum( - (d.stats[data_key]["std"] ** 2 + (d.stats[data_key]["mean"] - stats[data_key]["mean"]) ** 2) - * (d.num_samples / total_samples) + ( + d.meta.stats[data_key]["std"] ** 2 + + (d.meta.stats[data_key]["mean"] - stats[data_key]["mean"]) ** 2 + ) + * (d.num_frames / total_samples) for d in ls_datasets - if data_key in d.stats + if data_key in d.meta.stats ) ) return stats diff --git a/lerobot/common/datasets/factory.py b/lerobot/common/datasets/factory.py index 96a353fbf..f6164ed1d 100644 --- a/lerobot/common/datasets/factory.py +++ b/lerobot/common/datasets/factory.py @@ -91,9 +91,9 @@ def make_dataset(cfg, split: str = "train") -> LeRobotDataset | MultiLeRobotData ) if isinstance(cfg.dataset_repo_id, str): + # TODO (aliberts): add 'episodes' arg from config after removing hydra dataset = LeRobotDataset( cfg.dataset_repo_id, - split=split, delta_timestamps=cfg.training.get("delta_timestamps"), image_transforms=image_transforms, video_backend=cfg.video_backend, @@ -101,7 +101,6 @@ def make_dataset(cfg, split: str = "train") -> LeRobotDataset | MultiLeRobotData else: dataset = MultiLeRobotDataset( cfg.dataset_repo_id, - split=split, delta_timestamps=cfg.training.get("delta_timestamps"), image_transforms=image_transforms, video_backend=cfg.video_backend, @@ -112,6 +111,6 @@ def make_dataset(cfg, split: str = "train") -> LeRobotDataset | MultiLeRobotData for stats_type, listconfig in stats_dict.items(): # example of stats_type: min, max, mean, std stats = OmegaConf.to_container(listconfig, resolve=True) - dataset.stats[key][stats_type] = torch.tensor(stats, dtype=torch.float32) + dataset.meta.stats[key][stats_type] = torch.tensor(stats, dtype=torch.float32) return dataset diff --git a/lerobot/common/datasets/image_writer.py b/lerobot/common/datasets/image_writer.py new file mode 100644 index 000000000..9564fb591 --- /dev/null +++ b/lerobot/common/datasets/image_writer.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python + +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
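The pooled standard deviation identity quoted in the aggregate_stats comments above can be checked numerically; a small standalone sketch using torch (the sample values are arbitrary):

```python
import torch

# sigma_combined = sqrt((n1*(s1^2 + d1^2) + n2*(s2^2 + d2^2)) / (n1 + n2)), with d_i = mu_i - mu_combined
x1 = torch.randn(1000) * 2.0 + 1.0
x2 = torch.randn(500) * 0.5 - 3.0
n1, n2 = x1.numel(), x2.numel()
mu1, mu2 = x1.mean(), x2.mean()
s1, s2 = x1.std(unbiased=False), x2.std(unbiased=False)

mu = (n1 * mu1 + n2 * mu2) / (n1 + n2)
pooled = torch.sqrt((n1 * (s1**2 + (mu1 - mu) ** 2) + n2 * (s2**2 + (mu2 - mu) ** 2)) / (n1 + n2))
direct = torch.cat([x1, x2]).std(unbiased=False)
print(torch.allclose(pooled, direct, atol=1e-4))  # True: the identity matches a direct computation
```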
+import multiprocessing +import queue +import threading +from pathlib import Path + +import numpy as np +import PIL.Image +import torch + + +def safe_stop_image_writer(func): + def wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except Exception as e: + dataset = kwargs.get("dataset", None) + image_writer = getattr(dataset, "image_writer", None) if dataset else None + if image_writer is not None: + print("Waiting for image writer to terminate...") + image_writer.stop() + raise e + + return wrapper + + +def image_array_to_image(image_array: np.ndarray) -> PIL.Image.Image: + # TODO(aliberts): handle 1 channel and 4 for depth images + if image_array.ndim == 3 and image_array.shape[0] in [1, 3]: + # Transpose from pytorch convention (C, H, W) to (H, W, C) + image_array = image_array.transpose(1, 2, 0) + if image_array.dtype != np.uint8: + # Assume the image is in [0, 1] range for floating-point data + image_array = np.clip(image_array, 0, 1) + image_array = (image_array * 255).astype(np.uint8) + return PIL.Image.fromarray(image_array) + + +def write_image(image: np.ndarray | PIL.Image.Image, fpath: Path): + try: + if isinstance(image, np.ndarray): + img = image_array_to_image(image) + elif isinstance(image, PIL.Image.Image): + img = image + else: + raise TypeError(f"Unsupported image type: {type(image)}") + img.save(fpath) + except Exception as e: + print(f"Error writing image {fpath}: {e}") + + +def worker_thread_loop(queue: queue.Queue): + while True: + item = queue.get() + if item is None: + queue.task_done() + break + image_array, fpath = item + write_image(image_array, fpath) + queue.task_done() + + +def worker_process(queue: queue.Queue, num_threads: int): + threads = [] + for _ in range(num_threads): + t = threading.Thread(target=worker_thread_loop, args=(queue,)) + t.daemon = True + t.start() + threads.append(t) + for t in threads: + t.join() + + +class AsyncImageWriter: + """ + This class abstracts away the initialisation of processes and/or threads to + save images on disk asynchronously, which is critical to control a robot and record data + at a high frame rate. + + When `num_processes=0`, it creates a thread pool of size `num_threads`. + When `num_processes>0`, it creates a process pool of size `num_processes`, where each subprocess starts + its own thread pool of size `num_threads`. + + The optimal number of processes and threads depends on your computer's capabilities. + We advise using 4 threads per camera with 0 processes. If the fps is not stable, try to increase or lower + the number of threads. If it is still not stable, try to use 1 subprocess, or more.
+ """ + + def __init__(self, num_processes: int = 0, num_threads: int = 1): + self.num_processes = num_processes + self.num_threads = num_threads + self.queue = None + self.threads = [] + self.processes = [] + self._stopped = False + + if num_threads <= 0 and num_processes <= 0: + raise ValueError("Number of threads and processes must be greater than zero.") + + if self.num_processes == 0: + # Use threading + self.queue = queue.Queue() + for _ in range(self.num_threads): + t = threading.Thread(target=worker_thread_loop, args=(self.queue,)) + t.daemon = True + t.start() + self.threads.append(t) + else: + # Use multiprocessing + self.queue = multiprocessing.JoinableQueue() + for _ in range(self.num_processes): + p = multiprocessing.Process(target=worker_process, args=(self.queue, self.num_threads)) + p.daemon = True + p.start() + self.processes.append(p) + + def save_image(self, image: torch.Tensor | np.ndarray | PIL.Image.Image, fpath: Path): + if isinstance(image, torch.Tensor): + # Convert tensor to numpy array to minimize main process time + image = image.cpu().numpy() + self.queue.put((image, fpath)) + + def wait_until_done(self): + self.queue.join() + + def stop(self): + if self._stopped: + return + + if self.num_processes == 0: + for _ in self.threads: + self.queue.put(None) + for t in self.threads: + t.join() + else: + num_nones = self.num_processes * self.num_threads + for _ in range(num_nones): + self.queue.put(None) + for p in self.processes: + p.join() + if p.is_alive(): + p.terminate() + self.queue.close() + self.queue.join_thread() + + self._stopped = True diff --git a/lerobot/common/datasets/lerobot_dataset.py b/lerobot/common/datasets/lerobot_dataset.py index eb76f78d6..b32cf7095 100644 --- a/lerobot/common/datasets/lerobot_dataset.py +++ b/lerobot/common/datasets/lerobot_dataset.py @@ -15,202 +15,946 @@ # limitations under the License. 
import logging import os +import shutil +from functools import cached_property from pathlib import Path from typing import Callable import datasets +import numpy as np +import PIL.Image import torch import torch.utils +from datasets import load_dataset +from huggingface_hub import create_repo, snapshot_download, upload_folder -from lerobot.common.datasets.compute_stats import aggregate_stats +from lerobot.common.datasets.compute_stats import aggregate_stats, compute_stats +from lerobot.common.datasets.image_writer import AsyncImageWriter, write_image from lerobot.common.datasets.utils import ( - calculate_episode_data_index, - load_episode_data_index, - load_hf_dataset, + DEFAULT_FEATURES, + DEFAULT_IMAGE_PATH, + EPISODES_PATH, + INFO_PATH, + STATS_PATH, + TASKS_PATH, + append_jsonlines, + check_delta_timestamps, + check_timestamps_sync, + check_version_compatibility, + create_branch, + create_empty_dataset_info, + create_lerobot_dataset_card, + get_delta_indices, + get_episode_data_index, + get_features_from_robot, + get_hf_features_from_features, + get_hub_safe_version, + hf_transform_to_torch, + load_episodes, load_info, - load_previous_and_future_frames, load_stats, - load_videos, - reset_episode_index, + load_tasks, + serialize_dict, + write_json, + write_parquet, ) -from lerobot.common.datasets.video_utils import VideoFrame, load_from_videos +from lerobot.common.datasets.video_utils import ( + VideoFrame, + decode_video_frames_torchvision, + encode_video_frames, + get_video_info, +) +from lerobot.common.robot_devices.robots.utils import Robot # For maintainers, see lerobot/common/datasets/push_dataset_to_hub/CODEBASE_VERSION.md -CODEBASE_VERSION = "v1.6" -DATA_DIR = Path(os.environ["DATA_DIR"]) if "DATA_DIR" in os.environ else None +CODEBASE_VERSION = "v2.0" +LEROBOT_HOME = Path(os.getenv("LEROBOT_HOME", "~/.cache/huggingface/lerobot")).expanduser() + + +class LeRobotDatasetMetadata: + def __init__( + self, + repo_id: str, + root: str | Path | None = None, + local_files_only: bool = False, + ): + self.repo_id = repo_id + self.root = Path(root) if root is not None else LEROBOT_HOME / repo_id + self.local_files_only = local_files_only + + # Load metadata + (self.root / "meta").mkdir(exist_ok=True, parents=True) + self.pull_from_repo(allow_patterns="meta/") + self.info = load_info(self.root) + self.stats = load_stats(self.root) + self.tasks = load_tasks(self.root) + self.episodes = load_episodes(self.root) + + def pull_from_repo( + self, + allow_patterns: list[str] | str | None = None, + ignore_patterns: list[str] | str | None = None, + ) -> None: + snapshot_download( + self.repo_id, + repo_type="dataset", + revision=self._hub_version, + local_dir=self.root, + allow_patterns=allow_patterns, + ignore_patterns=ignore_patterns, + local_files_only=self.local_files_only, + ) + + @cached_property + def _hub_version(self) -> str | None: + return None if self.local_files_only else get_hub_safe_version(self.repo_id, CODEBASE_VERSION) + + @property + def _version(self) -> str: + """Codebase version used to create this dataset.""" + return self.info["codebase_version"] + + def get_data_file_path(self, ep_index: int) -> Path: + ep_chunk = self.get_episode_chunk(ep_index) + fpath = self.data_path.format(episode_chunk=ep_chunk, episode_index=ep_index) + return Path(fpath) + + def get_video_file_path(self, ep_index: int, vid_key: str) -> Path: + ep_chunk = self.get_episode_chunk(ep_index) + fpath = self.video_path.format(episode_chunk=ep_chunk, video_key=vid_key, episode_index=ep_index) + return 
Path(fpath) + + def get_episode_chunk(self, ep_index: int) -> int: + return ep_index // self.chunks_size + + @property + def data_path(self) -> str: + """Formattable string for the parquet files.""" + return self.info["data_path"] + + @property + def video_path(self) -> str | None: + """Formattable string for the video files.""" + return self.info["video_path"] + + @property + def robot_type(self) -> str | None: + """Robot type used in recording this dataset.""" + return self.info["robot_type"] + + @property + def fps(self) -> int: + """Frames per second used during data collection.""" + return self.info["fps"] + + @property + def features(self) -> dict[str, dict]: + """All features contained in the dataset.""" + return self.info["features"] + + @property + def image_keys(self) -> list[str]: + """Keys to access visual modalities stored as images.""" + return [key for key, ft in self.features.items() if ft["dtype"] == "image"] + + @property + def video_keys(self) -> list[str]: + """Keys to access visual modalities stored as videos.""" + return [key for key, ft in self.features.items() if ft["dtype"] == "video"] + + @property + def camera_keys(self) -> list[str]: + """Keys to access visual modalities (regardless of their storage method).""" + return [key for key, ft in self.features.items() if ft["dtype"] in ["video", "image"]] + + @property + def names(self) -> dict[str, list | dict]: + """Names of the various dimensions of vector modalities.""" + return {key: ft["names"] for key, ft in self.features.items()} + + @property + def shapes(self) -> dict: + """Shapes for the different features.""" + return {key: tuple(ft["shape"]) for key, ft in self.features.items()} + + @property + def total_episodes(self) -> int: + """Total number of episodes available.""" + return self.info["total_episodes"] + + @property + def total_frames(self) -> int: + """Total number of frames saved in this dataset.""" + return self.info["total_frames"] + + @property + def total_tasks(self) -> int: + """Total number of different tasks performed in this dataset.""" + return self.info["total_tasks"] + + @property + def total_chunks(self) -> int: + """Total number of chunks (groups of episodes).""" + return self.info["total_chunks"] + + @property + def chunks_size(self) -> int: + """Max number of episodes per chunk.""" + return self.info["chunks_size"] + + @property + def task_to_task_index(self) -> dict: + return {task: task_idx for task_idx, task in self.tasks.items()} + + def get_task_index(self, task: str) -> int: + """ + Given a task in natural language, returns its task_index if the task already exists in the dataset, + otherwise creates a new task_index. 
+ """ + task_index = self.task_to_task_index.get(task, None) + return task_index if task_index is not None else self.total_tasks + + def save_episode(self, episode_index: int, episode_length: int, task: str, task_index: int) -> None: + self.info["total_episodes"] += 1 + self.info["total_frames"] += episode_length + + if task_index not in self.tasks: + self.info["total_tasks"] += 1 + self.tasks[task_index] = task + task_dict = { + "task_index": task_index, + "task": task, + } + append_jsonlines(task_dict, self.root / TASKS_PATH) + + chunk = self.get_episode_chunk(episode_index) + if chunk >= self.total_chunks: + self.info["total_chunks"] += 1 + + self.info["splits"] = {"train": f"0:{self.info['total_episodes']}"} + self.info["total_videos"] += len(self.video_keys) + write_json(self.info, self.root / INFO_PATH) + + episode_dict = { + "episode_index": episode_index, + "tasks": [task], + "length": episode_length, + } + self.episodes.append(episode_dict) + append_jsonlines(episode_dict, self.root / EPISODES_PATH) + + # TODO(aliberts): refactor stats in save_episodes + # image_sampling = int(self.fps / 2) # sample 2 img/s for the stats + # ep_stats = compute_episode_stats(episode_buffer, self.features, episode_length, image_sampling=image_sampling) + # ep_stats = serialize_dict(ep_stats) + # append_jsonlines(ep_stats, self.root / STATS_PATH) + + def write_video_info(self) -> None: + """ + Warning: this function writes info from first episode videos, implicitly assuming that all videos have + been encoded the same way. Also, this means it assumes the first episode exists. + """ + for key in self.video_keys: + if not self.features[key].get("info", None): + video_path = self.root / self.get_video_file_path(ep_index=0, vid_key=key) + self.info["features"][key]["info"] = get_video_info(video_path) + + write_json(self.info, self.root / INFO_PATH) + + def __repr__(self): + feature_keys = list(self.features) + return ( + f"{self.__class__.__name__}({{\n" + f" Repository ID: '{self.repo_id}',\n" + f" Total episodes: '{self.total_episodes}',\n" + f" Total frames: '{self.total_frames}',\n" + f" Features: '{feature_keys}',\n" + "})',\n" + ) + + @classmethod + def create( + cls, + repo_id: str, + fps: int, + root: str | Path | None = None, + robot: Robot | None = None, + robot_type: str | None = None, + features: dict | None = None, + use_videos: bool = True, + ) -> "LeRobotDatasetMetadata": + """Creates metadata for a LeRobotDataset.""" + obj = cls.__new__(cls) + obj.repo_id = repo_id + obj.root = Path(root) if root is not None else LEROBOT_HOME / repo_id + + obj.root.mkdir(parents=True, exist_ok=False) + + if robot is not None: + features = get_features_from_robot(robot, use_videos) + robot_type = robot.robot_type + if not all(cam.fps == fps for cam in robot.cameras.values()): + logging.warning( + f"Some cameras in your {robot.robot_type} robot don't have an fps matching the fps of your dataset." + "In this case, frames from lower fps cameras will be repeated to fill in the blanks." + ) + elif features is None: + raise ValueError( + "Dataset features must either come from a Robot or explicitly passed upon creation." 
+ ) + else: + # TODO(aliberts, rcadene): implement sanity check for features + features = {**features, **DEFAULT_FEATURES} + + obj.tasks, obj.stats, obj.episodes = {}, {}, [] + obj.info = create_empty_dataset_info(CODEBASE_VERSION, fps, robot_type, features, use_videos) + if len(obj.video_keys) > 0 and not use_videos: + raise ValueError() + write_json(obj.info, obj.root / INFO_PATH) + obj.local_files_only = True + return obj class LeRobotDataset(torch.utils.data.Dataset): def __init__( self, repo_id: str, - root: Path | None = DATA_DIR, - split: str = "train", + root: str | Path | None = None, + episodes: list[int] | None = None, image_transforms: Callable | None = None, delta_timestamps: dict[list[float]] | None = None, + tolerance_s: float = 1e-4, + download_videos: bool = True, + local_files_only: bool = False, video_backend: str | None = None, ): + """ + 2 modes are available for instantiating this class, depending on 2 different use cases: + + 1. Your dataset already exists: + - On your local disk in the 'root' folder. This is typically the case when you recorded your + dataset locally and you may or may not have pushed it to the hub yet. Instantiating this class + with 'root' will load your dataset directly from disk. This can happen while you're offline (no + internet connection); in that case, use local_files_only=True. + + - On the Hugging Face Hub at the address https://huggingface.co/datasets/{repo_id} and not on + your local disk in the 'root' folder. Instantiating this class with this 'repo_id' will download + the dataset from that address and load it, provided your dataset is compliant with + codebase_version v2.0. If your dataset was created before this new format, you will be + prompted to convert it using our conversion script from v1.6 to v2.0, which you can find at + lerobot/common/datasets/v2/convert_dataset_v1_to_v2.py. + + + 2. Your dataset doesn't already exist (either on local disk or on the Hub): you can create an empty + LeRobotDataset with the 'create' classmethod. This can be used for recording a dataset or to port an + existing dataset to the LeRobotDataset format. + + + In terms of files, LeRobotDataset encapsulates 3 main things: + - metadata: + - info contains various information about the dataset like shapes, keys, fps etc. + - stats stores the dataset statistics of the different modalities for normalization + - tasks contains the prompts for each task of the dataset, which can be used for + task-conditioned training. + - hf_dataset (from datasets.Dataset), which will read any values from parquet files. + - videos (optional) from which frames are loaded to be synchronous with data from parquet files. + + A typical LeRobotDataset looks like this from its root path: + . + ├── data + │ ├── chunk-000 + │ │ ├── episode_000000.parquet + │ │ ├── episode_000001.parquet + │ │ ├── episode_000002.parquet + │ │ └── ... + │ ├── chunk-001 + │ │ ├── episode_001000.parquet + │ │ ├── episode_001001.parquet + │ │ ├── episode_001002.parquet + │ │ └── ... + │ └── ... + ├── meta + │ ├── episodes.jsonl + │ ├── info.json + │ ├── stats.json + │ └── tasks.jsonl + └── videos + ├── chunk-000 + │ ├── observation.images.laptop + │ │ ├── episode_000000.mp4 + │ │ ├── episode_000001.mp4 + │ │ ├── episode_000002.mp4 + │ │ └── ... + │ ├── observation.images.phone + │ │ ├── episode_000000.mp4 + │ │ ├── episode_000001.mp4 + │ │ ├── episode_000002.mp4 + │ │ └── ... + ├── chunk-001 + └── ... + + Note that this file-based structure is designed to be as versatile as possible.
The files are split by + episodes which allows more granular control over which episodes one wants to use and download. The + structure of the dataset is entirely described in the info.json file, which can be easily downloaded + or viewed directly on the hub before downloading any actual data. The types of files used are very + simple and do not need complex tools to be read: only .parquet, .json and .mp4 files are used (and .md + for the README). + + Args: + repo_id (str): This is the repo id that will be used to fetch the dataset. Locally, the dataset + will be stored under root/repo_id. + root (Path | None, optional): Local directory to use for downloading/writing files. You can also + set the LEROBOT_HOME environment variable to point to a different location. Defaults to + '~/.cache/huggingface/lerobot'. + episodes (list[int] | None, optional): If specified, only the episodes with these episode_index + values will be loaded. Defaults to None. + image_transforms (Callable | None, optional): You can pass standard v2 image transforms from + torchvision.transforms.v2 here which will be applied to visual modalities (whether they come + from videos or images). Defaults to None. + delta_timestamps (dict[list[float]] | None, optional): A dictionary mapping feature keys to lists + of time deltas (in seconds) relative to the current frame. For each key, the frames at the + current timestamp plus each delta are returned, together with a boolean padding indicator for + deltas that fall outside the episode. Deltas should be multiples of 1/fps. Defaults to None. + tolerance_s (float, optional): Tolerance in seconds used to ensure data timestamps are actually in + sync with the fps value. It is used at the init of the dataset to make sure that each + timestamp is separated from the next by 1/fps +/- tolerance_s. This also applies to frames + decoded from video files. It is also used to check that `delta_timestamps` (when provided) are + multiples of 1/fps. Defaults to 1e-4. + download_videos (bool, optional): Flag to download the videos. Note that when set to True but the + video files are already present on local disk, they won't be downloaded again. Defaults to + True. + local_files_only (bool, optional): Flag to use local files only. If True, no requests to the hub + will be made. Defaults to False. + video_backend (str | None, optional): Video backend to use for decoding videos. There is currently + a single option which is the pyav decoder used by Torchvision. Defaults to pyav.
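To make these arguments concrete, here is a minimal usage sketch for the first mode (loading an existing dataset). It assumes the 'lerobot/pusht' dataset is available on the Hub in the v2.0 format with an 'action' feature, and that the chosen deltas are multiples of 1/fps; adjust both to your own dataset.

```python
import torch
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset

# For every sampled frame, also fetch the actions 0.1s and 0.2s in the future
# (the deltas must be multiples of 1/fps, e.g. fps=10 here).
delta_timestamps = {"action": [0.0, 0.1, 0.2]}

dataset = LeRobotDataset("lerobot/pusht", delta_timestamps=delta_timestamps)
print(dataset.num_episodes, dataset.num_frames, dataset.fps)

loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)
batch = next(iter(loader))
print(batch["action"].shape)         # (32, 3, action_dim): one row per requested delta
print(batch["action_is_pad"].shape)  # (32, 3): True where a delta falls outside the episode
```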
+ """ super().__init__() self.repo_id = repo_id - self.root = root - self.split = split + self.root = Path(root) if root else LEROBOT_HOME / repo_id self.image_transforms = image_transforms self.delta_timestamps = delta_timestamps - # load data from hub or locally when root is provided + self.episodes = episodes + self.tolerance_s = tolerance_s + self.video_backend = video_backend if video_backend else "pyav" + self.delta_indices = None + self.local_files_only = local_files_only + + # Unused attributes + self.image_writer = None + self.episode_buffer = None + + self.root.mkdir(exist_ok=True, parents=True) + + # Load metadata + self.meta = LeRobotDatasetMetadata(self.repo_id, self.root, self.local_files_only) + + # Check version + check_version_compatibility(self.repo_id, self.meta._version, CODEBASE_VERSION) + + # Load actual data + self.download_episodes(download_videos) + self.hf_dataset = self.load_hf_dataset() + self.episode_data_index = get_episode_data_index(self.meta.episodes, self.episodes) + + # Check timestamps + check_timestamps_sync(self.hf_dataset, self.episode_data_index, self.fps, self.tolerance_s) + + # Setup delta_indices + if self.delta_timestamps is not None: + check_delta_timestamps(self.delta_timestamps, self.fps, self.tolerance_s) + self.delta_indices = get_delta_indices(self.delta_timestamps, self.fps) + + # Available stats implies all videos have been encoded and dataset is iterable + self.consolidated = self.meta.stats is not None + + def push_to_hub( + self, + tags: list | None = None, + license: str | None = "apache-2.0", + push_videos: bool = True, + private: bool = False, + **card_kwargs, + ) -> None: + if not self.consolidated: + logging.warning( + "You are trying to upload to the hub a LeRobotDataset that has not been consolidated yet. " + "Consolidating first." + ) + self.consolidate() + + ignore_patterns = ["images/"] + if not push_videos: + ignore_patterns.append("videos/") + + create_repo( + repo_id=self.repo_id, + private=private, + repo_type="dataset", + exist_ok=True, + ) + + upload_folder( + repo_id=self.repo_id, + folder_path=self.root, + repo_type="dataset", + ignore_patterns=ignore_patterns, + ) + card = create_lerobot_dataset_card( + tags=tags, dataset_info=self.meta.info, license=license, **card_kwargs + ) + card.push_to_hub(repo_id=self.repo_id, repo_type="dataset") + create_branch(repo_id=self.repo_id, branch=CODEBASE_VERSION, repo_type="dataset") + + def pull_from_repo( + self, + allow_patterns: list[str] | str | None = None, + ignore_patterns: list[str] | str | None = None, + ) -> None: + snapshot_download( + self.repo_id, + repo_type="dataset", + revision=self.meta._hub_version, + local_dir=self.root, + allow_patterns=allow_patterns, + ignore_patterns=ignore_patterns, + local_files_only=self.local_files_only, + ) + + def download_episodes(self, download_videos: bool = True) -> None: + """Downloads the dataset from the given 'repo_id' at the provided version. If 'episodes' is given, this + will only download those episodes (selected by their episode_index). If 'episodes' is None, the whole + dataset will be downloaded. Thanks to the behavior of snapshot_download, if the files are already present + in 'local_dir', they won't be downloaded again. 
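As a short illustration of the selective download described above, the following hedged sketch only pulls a subset of episodes and skips the video files; the repo id and episode indices are placeholders.

```python
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset

# Only the parquet files of episodes 0, 10 and 11 are fetched from the Hub;
# with download_videos=False the .mp4 files are skipped entirely (fine as long
# as no video frames are queried afterwards).
dataset = LeRobotDataset("lerobot/pusht", episodes=[0, 10, 11], download_videos=False)
print(dataset.num_episodes)  # 3
print(dataset.num_frames)    # total frames across the three selected episodes
```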
+ """ # TODO(rcadene, aliberts): implement faster transfer # https://huggingface.co/docs/huggingface_hub/en/guides/download#faster-downloads - self.hf_dataset = load_hf_dataset(repo_id, CODEBASE_VERSION, root, split) - if split == "train": - self.episode_data_index = load_episode_data_index(repo_id, CODEBASE_VERSION, root) + files = None + ignore_patterns = None if download_videos else "videos/" + if self.episodes is not None: + files = [str(self.meta.get_data_file_path(ep_idx)) for ep_idx in self.episodes] + if len(self.meta.video_keys) > 0 and download_videos: + video_files = [ + str(self.meta.get_video_file_path(ep_idx, vid_key)) + for vid_key in self.meta.video_keys + for ep_idx in self.episodes + ] + files += video_files + + self.pull_from_repo(allow_patterns=files, ignore_patterns=ignore_patterns) + + def load_hf_dataset(self) -> datasets.Dataset: + """hf_dataset contains all the observations, states, actions, rewards, etc.""" + if self.episodes is None: + path = str(self.root / "data") + hf_dataset = load_dataset("parquet", data_dir=path, split="train") else: - self.episode_data_index = calculate_episode_data_index(self.hf_dataset) - self.hf_dataset = reset_episode_index(self.hf_dataset) - self.stats = load_stats(repo_id, CODEBASE_VERSION, root) - self.info = load_info(repo_id, CODEBASE_VERSION, root) - if self.video: - self.videos_dir = load_videos(repo_id, CODEBASE_VERSION, root) - self.video_backend = video_backend if video_backend is not None else "pyav" + files = [str(self.root / self.meta.get_data_file_path(ep_idx)) for ep_idx in self.episodes] + hf_dataset = load_dataset("parquet", data_files=files, split="train") + + # TODO(aliberts): hf_dataset.set_format("torch") + hf_dataset.set_transform(hf_transform_to_torch) + + return hf_dataset @property def fps(self) -> int: """Frames per second used during data collection.""" - return self.info["fps"] + return self.meta.fps @property - def video(self) -> bool: - """Returns True if this dataset loads video frames from mp4 files. - Returns False if it only loads images from png files. - """ - return self.info.get("video", False) + def num_frames(self) -> int: + """Number of frames in selected episodes.""" + return len(self.hf_dataset) if self.hf_dataset is not None else self.meta.total_frames @property - def features(self) -> datasets.Features: - return self.hf_dataset.features + def num_episodes(self) -> int: + """Number of episodes selected.""" + return len(self.episodes) if self.episodes is not None else self.meta.total_episodes @property - def camera_keys(self) -> list[str]: - """Keys to access image and video stream from cameras.""" - keys = [] - for key, feats in self.hf_dataset.features.items(): - if isinstance(feats, (datasets.Image, VideoFrame)): - keys.append(key) - return keys + def features(self) -> dict[str, dict]: + return self.meta.features @property - def video_frame_keys(self) -> list[str]: - """Keys to access video frames that requires to be decoded into images. + def hf_features(self) -> datasets.Features: + """Features of the hf_dataset.""" + if self.hf_dataset is not None: + return self.hf_dataset.features + else: + return get_hf_features_from_features(self.features) - Note: It is empty if the dataset contains images only, - or equal to `self.cameras` if the dataset contains videos only, - or can even be a subset of `self.cameras` in a case of a mixed image/video dataset. 
- """ - video_frame_keys = [] - for key, feats in self.hf_dataset.features.items(): - if isinstance(feats, VideoFrame): - video_frame_keys.append(key) - return video_frame_keys + def _get_query_indices(self, idx: int, ep_idx: int) -> tuple[dict[str, list[int | bool]]]: + ep_start = self.episode_data_index["from"][ep_idx] + ep_end = self.episode_data_index["to"][ep_idx] + query_indices = { + key: [max(ep_start.item(), min(ep_end.item() - 1, idx + delta)) for delta in delta_idx] + for key, delta_idx in self.delta_indices.items() + } + padding = { # Pad values outside of current episode range + f"{key}_is_pad": torch.BoolTensor( + [(idx + delta < ep_start.item()) | (idx + delta >= ep_end.item()) for delta in delta_idx] + ) + for key, delta_idx in self.delta_indices.items() + } + return query_indices, padding - @property - def num_samples(self) -> int: - """Number of samples/frames.""" - return len(self.hf_dataset) + def _get_query_timestamps( + self, + current_ts: float, + query_indices: dict[str, list[int]] | None = None, + ) -> dict[str, list[float]]: + query_timestamps = {} + for key in self.meta.video_keys: + if query_indices is not None and key in query_indices: + timestamps = self.hf_dataset.select(query_indices[key])["timestamp"] + query_timestamps[key] = torch.stack(timestamps).tolist() + else: + query_timestamps[key] = [current_ts] - @property - def num_episodes(self) -> int: - """Number of episodes.""" - return len(self.hf_dataset.unique("episode_index")) + return query_timestamps - @property - def tolerance_s(self) -> float: - """Tolerance in seconds used to discard loaded frames when their timestamps - are not close enough from the requested frames. It is only used when `delta_timestamps` - is provided or when loading video frames from mp4 files. + def _query_hf_dataset(self, query_indices: dict[str, list[int]]) -> dict: + return { + key: torch.stack(self.hf_dataset.select(q_idx)[key]) + for key, q_idx in query_indices.items() + if key not in self.meta.video_keys + } + + def _query_videos(self, query_timestamps: dict[str, list[float]], ep_idx: int) -> dict: + """Note: When using data workers (e.g. DataLoader with num_workers>0), do not call this function + in the main process (e.g. by using a second Dataloader with num_workers=0). It will result in a + Segmentation Fault. This probably happens because a memory reference to the video loader is created in + the main process and a subprocess fails to access it. 
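To make the clamping and padding logic of `_get_query_indices` above more tangible, here is a small standalone sketch that mirrors it with made-up numbers (the fps, episode boundaries and deltas are illustrative only).

```python
import torch

fps = 10
delta_timestamps = {"action": [-0.1, 0.0, 0.1]}
# get_delta_indices() turns time deltas into frame offsets (roughly delta * fps).
delta_indices = {k: [round(d * fps) for d in v] for k, v in delta_timestamps.items()}  # {"action": [-1, 0, 1]}

ep_start, ep_end = 0, 50  # the episode spans dataset indices [0, 50)
idx = 0                   # querying around the very first frame of the episode

query_indices = {
    key: [max(ep_start, min(ep_end - 1, idx + delta)) for delta in deltas]
    for key, deltas in delta_indices.items()
}
padding = {
    f"{key}_is_pad": torch.BoolTensor(
        [(idx + delta < ep_start) or (idx + delta >= ep_end) for delta in deltas]
    )
    for key, deltas in delta_indices.items()
}
print(query_indices)  # {'action': [0, 0, 1]} -> the -0.1s query is clamped to the episode start
print(padding)        # {'action_is_pad': tensor([ True, False, False])}
```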
""" - # 1e-4 to account for possible numerical error - return 1 / self.fps - 1e-4 + item = {} + for vid_key, query_ts in query_timestamps.items(): + video_path = self.root / self.meta.get_video_file_path(ep_idx, vid_key) + frames = decode_video_frames_torchvision( + video_path, query_ts, self.tolerance_s, self.video_backend + ) + item[vid_key] = frames.squeeze(0) + + return item + + def _add_padding_keys(self, item: dict, padding: dict[str, list[bool]]) -> dict: + for key, val in padding.items(): + item[key] = torch.BoolTensor(val) + return item def __len__(self): - return self.num_samples + return self.num_frames - def __getitem__(self, idx): + def __getitem__(self, idx) -> dict: item = self.hf_dataset[idx] + ep_idx = item["episode_index"].item() - if self.delta_timestamps is not None: - item = load_previous_and_future_frames( - item, - self.hf_dataset, - self.episode_data_index, - self.delta_timestamps, - self.tolerance_s, - ) + query_indices = None + if self.delta_indices is not None: + current_ep_idx = self.episodes.index(ep_idx) if self.episodes is not None else ep_idx + query_indices, padding = self._get_query_indices(idx, current_ep_idx) + query_result = self._query_hf_dataset(query_indices) + item = {**item, **padding} + for key, val in query_result.items(): + item[key] = val - if self.video: - item = load_from_videos( - item, - self.video_frame_keys, - self.videos_dir, - self.tolerance_s, - self.video_backend, - ) + if len(self.meta.video_keys) > 0: + current_ts = item["timestamp"].item() + query_timestamps = self._get_query_timestamps(current_ts, query_indices) + video_frames = self._query_videos(query_timestamps, ep_idx) + item = {**video_frames, **item} if self.image_transforms is not None: - for cam in self.camera_keys: + image_keys = self.meta.camera_keys + for cam in image_keys: item[cam] = self.image_transforms(item[cam]) return item def __repr__(self): + feature_keys = list(self.features) return ( - f"{self.__class__.__name__}(\n" - f" Repository ID: '{self.repo_id}',\n" - f" Split: '{self.split}',\n" - f" Number of Samples: {self.num_samples},\n" - f" Number of Episodes: {self.num_episodes},\n" - f" Type: {'video (.mp4)' if self.video else 'image (.png)'},\n" - f" Recorded Frames per Second: {self.fps},\n" - f" Camera Keys: {self.camera_keys},\n" - f" Video Frame Keys: {self.video_frame_keys if self.video else 'N/A'},\n" - f" Transformations: {self.image_transforms},\n" - f" Codebase Version: {self.info.get('codebase_version', '< v1.6')},\n" - f")" + f"{self.__class__.__name__}({{\n" + f" Repository ID: '{self.repo_id}',\n" + f" Number of selected episodes: '{self.num_episodes}',\n" + f" Number of selected samples: '{self.num_frames}',\n" + f" Features: '{feature_keys}',\n" + "})',\n" + ) + + def create_episode_buffer(self, episode_index: int | None = None) -> dict: + current_ep_idx = self.meta.total_episodes if episode_index is None else episode_index + return { + "size": 0, + **{key: current_ep_idx if key == "episode_index" else [] for key in self.features}, + } + + def _get_image_file_path(self, episode_index: int, image_key: str, frame_index: int) -> Path: + fpath = DEFAULT_IMAGE_PATH.format( + image_key=image_key, episode_index=episode_index, frame_index=frame_index ) + return self.root / fpath + + def _save_image(self, image: torch.Tensor | np.ndarray | PIL.Image.Image, fpath: Path) -> None: + if self.image_writer is None: + if isinstance(image, torch.Tensor): + image = image.cpu().numpy() + write_image(image, fpath) + else: + 
self.image_writer.save_image(image=image, fpath=fpath) + + def add_frame(self, frame: dict) -> None: + """ + This function only adds the frame to the episode_buffer. Apart from images — which are written in a + temporary directory — nothing is written to disk. To save those frames, the 'save_episode()' method + then needs to be called. + """ + # TODO(aliberts, rcadene): Add sanity check for the input, check it's numpy or torch, + # check the dtype and shape matches, etc. + + if self.episode_buffer is None: + self.episode_buffer = self.create_episode_buffer() + + frame_index = self.episode_buffer["size"] + timestamp = frame.pop("timestamp") if "timestamp" in frame else frame_index / self.fps + self.episode_buffer["frame_index"].append(frame_index) + self.episode_buffer["timestamp"].append(timestamp) + + for key in frame: + if key not in self.features: + raise ValueError(key) + + if self.features[key]["dtype"] not in ["image", "video"]: + item = frame[key].numpy() if isinstance(frame[key], torch.Tensor) else frame[key] + self.episode_buffer[key].append(item) + elif self.features[key]["dtype"] in ["image", "video"]: + img_path = self._get_image_file_path( + episode_index=self.episode_buffer["episode_index"], image_key=key, frame_index=frame_index + ) + if frame_index == 0: + img_path.parent.mkdir(parents=True, exist_ok=True) + self._save_image(frame[key], img_path) + self.episode_buffer[key].append(str(img_path)) + + self.episode_buffer["size"] += 1 + + def save_episode(self, task: str, encode_videos: bool = True, episode_data: dict | None = None) -> None: + """ + This will save to disk the current episode in self.episode_buffer. Note that since it affects files on + disk, it sets self.consolidated to False to ensure proper consolidation later on before uploading to + the hub. + + Use 'encode_videos' if you want to encode videos during the saving of this episode. Otherwise, + you can do it later with dataset.consolidate(). This is to give more flexibility on when to spend + time for video encoding. + """ + if not episode_data: + episode_buffer = self.episode_buffer + + episode_length = episode_buffer.pop("size") + episode_index = episode_buffer["episode_index"] + if episode_index != self.meta.total_episodes: + # TODO(aliberts): Add option to use existing episode_index + raise NotImplementedError( + "You might have manually provided the episode_buffer with an episode_index that doesn't " + "match the total number of episodes in the dataset. This is not supported for now." + ) + + if episode_length == 0: + raise ValueError( + "You must add one or several frames with `add_frame` before calling `add_episode`." 
+ ) + + task_index = self.meta.get_task_index(task) + + if not set(episode_buffer.keys()) == set(self.features): + raise ValueError() + + for key, ft in self.features.items(): + if key == "index": + episode_buffer[key] = np.arange( + self.meta.total_frames, self.meta.total_frames + episode_length + ) + elif key == "episode_index": + episode_buffer[key] = np.full((episode_length,), episode_index) + elif key == "task_index": + episode_buffer[key] = np.full((episode_length,), task_index) + elif ft["dtype"] in ["image", "video"]: + continue + elif len(ft["shape"]) == 1 and ft["shape"][0] == 1: + episode_buffer[key] = np.array(episode_buffer[key], dtype=ft["dtype"]) + elif len(ft["shape"]) == 1 and ft["shape"][0] > 1: + episode_buffer[key] = np.stack(episode_buffer[key]) + else: + raise ValueError(key) + + self._wait_image_writer() + self._save_episode_table(episode_buffer, episode_index) + + self.meta.save_episode(episode_index, episode_length, task, task_index) + + if encode_videos and len(self.meta.video_keys) > 0: + video_paths = self.encode_episode_videos(episode_index) + for key in self.meta.video_keys: + episode_buffer[key] = video_paths[key] + + if not episode_data: # Reset the buffer + self.episode_buffer = self.create_episode_buffer() + + self.consolidated = False + + def _save_episode_table(self, episode_buffer: dict, episode_index: int) -> None: + episode_dict = {key: episode_buffer[key] for key in self.hf_features} + ep_dataset = datasets.Dataset.from_dict(episode_dict, features=self.hf_features, split="train") + ep_data_path = self.root / self.meta.get_data_file_path(ep_index=episode_index) + ep_data_path.parent.mkdir(parents=True, exist_ok=True) + write_parquet(ep_dataset, ep_data_path) + + def clear_episode_buffer(self) -> None: + episode_index = self.episode_buffer["episode_index"] + if self.image_writer is not None: + for cam_key in self.meta.camera_keys: + img_dir = self._get_image_file_path( + episode_index=episode_index, image_key=cam_key, frame_index=0 + ).parent + if img_dir.is_dir(): + shutil.rmtree(img_dir) + + # Reset the buffer + self.episode_buffer = self.create_episode_buffer() + + def start_image_writer(self, num_processes: int = 0, num_threads: int = 4) -> None: + if isinstance(self.image_writer, AsyncImageWriter): + logging.warning( + "You are starting a new AsyncImageWriter that is replacing an already existing one in the dataset." + ) + + self.image_writer = AsyncImageWriter( + num_processes=num_processes, + num_threads=num_threads, + ) + + def stop_image_writer(self) -> None: + """ + Whenever wrapping this dataset inside a parallelized DataLoader, this needs to be called first to + remove the image_write in order for the LeRobotDataset object to be pickleable and parallelized. + """ + if self.image_writer is not None: + self.image_writer.stop() + self.image_writer = None + + def _wait_image_writer(self) -> None: + """Wait for asynchronous image writer to finish.""" + if self.image_writer is not None: + self.image_writer.wait_until_done() + + def encode_videos(self) -> None: + """ + Use ffmpeg to convert frames stored as png into mp4 videos. + Note: `encode_video_frames` is a blocking call. Making it asynchronous shouldn't speedup encoding, + since video encoding with ffmpeg is already using multithreading. + """ + for ep_idx in range(self.meta.total_episodes): + self.encode_episode_videos(ep_idx) + + def encode_episode_videos(self, episode_index: int) -> dict: + """ + Use ffmpeg to convert frames stored as png into mp4 videos. 
+ Note: `encode_video_frames` is a blocking call. Making it asynchronous shouldn't speedup encoding, + since video encoding with ffmpeg is already using multithreading. + """ + video_paths = {} + for key in self.meta.video_keys: + video_path = self.root / self.meta.get_video_file_path(episode_index, key) + video_paths[key] = str(video_path) + if video_path.is_file(): + # Skip if video is already encoded. Could be the case when resuming data recording. + continue + img_dir = self._get_image_file_path( + episode_index=episode_index, image_key=key, frame_index=0 + ).parent + encode_video_frames(img_dir, video_path, self.fps, overwrite=True) + + return video_paths + + def consolidate(self, run_compute_stats: bool = True, keep_image_files: bool = False) -> None: + self.hf_dataset = self.load_hf_dataset() + self.episode_data_index = get_episode_data_index(self.meta.episodes, self.episodes) + check_timestamps_sync(self.hf_dataset, self.episode_data_index, self.fps, self.tolerance_s) + + if len(self.meta.video_keys) > 0: + self.encode_videos() + self.meta.write_video_info() + + if not keep_image_files: + img_dir = self.root / "images" + if img_dir.is_dir(): + shutil.rmtree(self.root / "images") + + video_files = list(self.root.rglob("*.mp4")) + assert len(video_files) == self.num_episodes * len(self.meta.video_keys) + + parquet_files = list(self.root.rglob("*.parquet")) + assert len(parquet_files) == self.num_episodes + + if run_compute_stats: + self.stop_image_writer() + # TODO(aliberts): refactor stats in save_episodes + self.meta.stats = compute_stats(self) + serialized_stats = serialize_dict(self.meta.stats) + write_json(serialized_stats, self.root / STATS_PATH) + self.consolidated = True + else: + logging.warning( + "Skipping computation of the dataset statistics, dataset is not fully consolidated." + ) @classmethod - def from_preloaded( + def create( cls, - repo_id: str = "from_preloaded", - root: Path | None = None, - split: str = "train", - transform: callable = None, - delta_timestamps: dict[list[float]] | None = None, - # additional preloaded attributes - hf_dataset=None, - episode_data_index=None, - stats=None, - info=None, - videos_dir=None, - video_backend=None, + repo_id: str, + fps: int, + root: str | Path | None = None, + robot: Robot | None = None, + robot_type: str | None = None, + features: dict | None = None, + use_videos: bool = True, + tolerance_s: float = 1e-4, + image_writer_processes: int = 0, + image_writer_threads: int = 0, + video_backend: str | None = None, ) -> "LeRobotDataset": - """Create a LeRobot Dataset from existing data and attributes instead of loading from the filesystem. + """Create a LeRobot Dataset from scratch in order to record data.""" + obj = cls.__new__(cls) + obj.meta = LeRobotDatasetMetadata.create( + repo_id=repo_id, + fps=fps, + root=root, + robot=robot, + robot_type=robot_type, + features=features, + use_videos=use_videos, + ) + obj.repo_id = obj.meta.repo_id + obj.root = obj.meta.root + obj.local_files_only = obj.meta.local_files_only + obj.tolerance_s = tolerance_s + obj.image_writer = None - It is especially useful when converting raw data into LeRobotDataset before saving the dataset - on the filesystem or uploading to the hub. + if image_writer_processes or image_writer_threads: + obj.start_image_writer(image_writer_processes, image_writer_threads) - Note: Meta-data attributes like `repo_id`, `version`, `root`, etc are optional and potentially - meaningless depending on the downstream usage of the return dataset. 
- """ - # create an empty object of type LeRobotDataset - obj = cls.__new__(cls) - obj.repo_id = repo_id - obj.root = root - obj.split = split - obj.image_transforms = transform - obj.delta_timestamps = delta_timestamps - obj.hf_dataset = hf_dataset - obj.episode_data_index = episode_data_index - obj.stats = stats - obj.info = info if info is not None else {} - obj.videos_dir = videos_dir + # TODO(aliberts, rcadene, alexander-soare): Merge this with OnlineBuffer/DataBuffer + obj.episode_buffer = obj.create_episode_buffer() + + # This bool indicates that the current LeRobotDataset instance is in sync with the files on disk. It + # is used to know when certain operations are need (for instance, computing dataset statistics). In + # order to be able to push the dataset to the hub, it needs to be consolidated first by calling + # self.consolidate(). + obj.consolidated = True + + obj.episodes = None + obj.hf_dataset = None + obj.image_transforms = None + obj.delta_timestamps = None + obj.delta_indices = None + obj.episode_data_index = None obj.video_backend = video_backend if video_backend is not None else "pyav" return obj @@ -225,57 +969,56 @@ class MultiLeRobotDataset(torch.utils.data.Dataset): def __init__( self, repo_ids: list[str], - root: Path | None = DATA_DIR, - split: str = "train", + root: str | Path | None = None, + episodes: dict | None = None, image_transforms: Callable | None = None, delta_timestamps: dict[list[float]] | None = None, + tolerances_s: dict | None = None, + download_videos: bool = True, + local_files_only: bool = False, video_backend: str | None = None, ): super().__init__() self.repo_ids = repo_ids + self.root = Path(root) if root else LEROBOT_HOME + self.tolerances_s = tolerances_s if tolerances_s else {repo_id: 1e-4 for repo_id in repo_ids} # Construct the underlying datasets passing everything but `transform` and `delta_timestamps` which # are handled by this class. self._datasets = [ LeRobotDataset( repo_id, - root=root, - split=split, - delta_timestamps=delta_timestamps, + root=self.root / repo_id, + episodes=episodes[repo_id] if episodes else None, image_transforms=image_transforms, + delta_timestamps=delta_timestamps, + tolerance_s=self.tolerances_s[repo_id], + download_videos=download_videos, + local_files_only=local_files_only, video_backend=video_backend, ) for repo_id in repo_ids ] - # Check that some properties are consistent across datasets. Note: We may relax some of these - # consistency requirements in future iterations of this class. - for repo_id, dataset in zip(self.repo_ids, self._datasets, strict=True): - if dataset.info != self._datasets[0].info: - raise ValueError( - f"Detected a mismatch in dataset info between {self.repo_ids[0]} and {repo_id}. This is " - "not yet supported." - ) + # Disable any data keys that are not common across all of the datasets. Note: we may relax this # restriction in future iterations of this class. For now, this is necessary at least for being able # to use PyTorch's default DataLoader collate function. 
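Stepping back briefly to tie together the recording-oriented APIs introduced above (`create`, `add_frame`, `save_episode`, `consolidate`, `push_to_hub`), here is a minimal editorial sketch. The repo id, fps, feature definition and zero-valued actions are all made up; a real recording would typically pass a `Robot` instance (and camera features) instead of a hand-written `features` dict.

```python
import numpy as np
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset

# Hypothetical action-only dataset; the default features (timestamp, frame_index, ...)
# are merged in automatically by LeRobotDatasetMetadata.create().
features = {"action": {"dtype": "float32", "shape": (6,), "names": None}}

dataset = LeRobotDataset.create(
    repo_id="my_user/my_recording",  # placeholder repo id
    fps=30,
    features=features,
    use_videos=False,  # no camera features in this toy example
)

for _ in range(2):  # record 2 episodes of 90 frames each
    for _ in range(90):
        dataset.add_frame({"action": np.zeros(6, dtype=np.float32)})
    dataset.save_episode(task="Toy task description.")

dataset.consolidate(run_compute_stats=True)
dataset.push_to_hub(tags=["toy"], private=True)
```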
- self.disabled_data_keys = set() - intersection_data_keys = set(self._datasets[0].hf_dataset.features) - for dataset in self._datasets: - intersection_data_keys.intersection_update(dataset.hf_dataset.features) - if len(intersection_data_keys) == 0: + self.disabled_features = set() + intersection_features = set(self._datasets[0].features) + for ds in self._datasets: + intersection_features.intersection_update(ds.features) + if len(intersection_features) == 0: raise RuntimeError( - "Multiple datasets were provided but they had no keys common to all of them. The " - "multi-dataset functionality currently only keeps common keys." + "Multiple datasets were provided but they had no keys common to all of them. " + "The multi-dataset functionality currently only keeps common keys." ) - for repo_id, dataset in zip(self.repo_ids, self._datasets, strict=True): - extra_keys = set(dataset.hf_dataset.features).difference(intersection_data_keys) + for repo_id, ds in zip(self.repo_ids, self._datasets, strict=True): + extra_keys = set(ds.features).difference(intersection_features) logging.warning( f"keys {extra_keys} of {repo_id} were disabled as they are not contained in all the " "other datasets." ) - self.disabled_data_keys.update(extra_keys) + self.disabled_features.update(extra_keys) - self.root = root - self.split = split self.image_transforms = image_transforms self.delta_timestamps = delta_timestamps self.stats = aggregate_stats(self._datasets) @@ -299,7 +1042,7 @@ def fps(self) -> int: NOTE: Fow now, this relies on a check in __init__ to make sure all sub-datasets have the same info. """ - return self._datasets[0].info["fps"] + return self._datasets[0].meta.info["fps"] @property def video(self) -> bool: @@ -309,13 +1052,13 @@ def video(self) -> bool: NOTE: Fow now, this relies on a check in __init__ to make sure all sub-datasets have the same info. 
""" - return self._datasets[0].info.get("video", False) + return self._datasets[0].meta.info.get("video", False) @property def features(self) -> datasets.Features: features = {} for dataset in self._datasets: - features.update({k: v for k, v in dataset.features.items() if k not in self.disabled_data_keys}) + features.update({k: v for k, v in dataset.hf_features.items() if k not in self.disabled_features}) return features @property @@ -342,9 +1085,9 @@ def video_frame_keys(self) -> list[str]: return video_frame_keys @property - def num_samples(self) -> int: + def num_frames(self) -> int: """Number of samples/frames.""" - return sum(d.num_samples for d in self._datasets) + return sum(d.num_frames for d in self._datasets) @property def num_episodes(self) -> int: @@ -361,7 +1104,7 @@ def tolerance_s(self) -> float: return 1 / self.fps - 1e-4 def __len__(self): - return self.num_samples + return self.num_frames def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: if idx >= len(self): @@ -370,8 +1113,8 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: start_idx = 0 dataset_idx = 0 for dataset in self._datasets: - if idx >= start_idx + dataset.num_samples: - start_idx += dataset.num_samples + if idx >= start_idx + dataset.num_frames: + start_idx += dataset.num_frames dataset_idx += 1 continue break @@ -379,7 +1122,7 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: raise AssertionError("We expect the loop to break out as long as the index is within bounds.") item = self._datasets[dataset_idx][idx - start_idx] item["dataset_index"] = torch.tensor(dataset_idx) - for data_key in self.disabled_data_keys: + for data_key in self.disabled_features: if data_key in item: del item[data_key] @@ -389,8 +1132,7 @@ def __repr__(self): return ( f"{self.__class__.__name__}(\n" f" Repository IDs: '{self.repo_ids}',\n" - f" Split: '{self.split}',\n" - f" Number of Samples: {self.num_samples},\n" + f" Number of Samples: {self.num_frames},\n" f" Number of Episodes: {self.num_episodes},\n" f" Type: {'video (.mp4)' if self.video else 'image (.png)'},\n" f" Recorded Frames per Second: {self.fps},\n" diff --git a/lerobot/common/datasets/online_buffer.py b/lerobot/common/datasets/online_buffer.py index 6b093cda7..d907e4687 100644 --- a/lerobot/common/datasets/online_buffer.py +++ b/lerobot/common/datasets/online_buffer.py @@ -187,7 +187,7 @@ def add_data(self, data: dict[str, np.ndarray]): assert data[OnlineBuffer.INDEX_KEY][0].item() == 0 # Shift the incoming indices if necessary. 
- if self.num_samples > 0: + if self.num_frames > 0: last_episode_index = self._data[OnlineBuffer.EPISODE_INDEX_KEY][next_index - 1] last_data_index = self._data[OnlineBuffer.INDEX_KEY][next_index - 1] data[OnlineBuffer.EPISODE_INDEX_KEY] += last_episode_index + 1 @@ -227,11 +227,11 @@ def num_episodes(self) -> int: ) @property - def num_samples(self) -> int: + def num_frames(self) -> int: return np.count_nonzero(self._data[OnlineBuffer.OCCUPANCY_MASK_KEY]) def __len__(self): - return self.num_samples + return self.num_frames def _item_to_tensors(self, item: dict) -> dict: item_ = {} diff --git a/lerobot/common/datasets/push_dataset_to_hub/aloha_hdf5_format.py b/lerobot/common/datasets/push_dataset_to_hub/aloha_hdf5_format.py index 52c4bba3d..e2973ef81 100644 --- a/lerobot/common/datasets/push_dataset_to_hub/aloha_hdf5_format.py +++ b/lerobot/common/datasets/push_dataset_to_hub/aloha_hdf5_format.py @@ -30,12 +30,12 @@ from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION from lerobot.common.datasets.push_dataset_to_hub.utils import ( + calculate_episode_data_index, concatenate_episodes, get_default_encoding, save_images_concurrently, ) from lerobot.common.datasets.utils import ( - calculate_episode_data_index, hf_transform_to_torch, ) from lerobot.common.datasets.video_utils import VideoFrame, encode_video_frames diff --git a/lerobot/common/datasets/push_dataset_to_hub/cam_png_format.py b/lerobot/common/datasets/push_dataset_to_hub/cam_png_format.py index be20c92cd..264925766 100644 --- a/lerobot/common/datasets/push_dataset_to_hub/cam_png_format.py +++ b/lerobot/common/datasets/push_dataset_to_hub/cam_png_format.py @@ -24,8 +24,11 @@ from PIL import Image as PILImage from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION -from lerobot.common.datasets.push_dataset_to_hub.utils import concatenate_episodes -from lerobot.common.datasets.utils import calculate_episode_data_index, hf_transform_to_torch +from lerobot.common.datasets.push_dataset_to_hub.utils import ( + calculate_episode_data_index, + concatenate_episodes, +) +from lerobot.common.datasets.utils import hf_transform_to_torch from lerobot.common.datasets.video_utils import VideoFrame diff --git a/lerobot/common/datasets/push_dataset_to_hub/dora_parquet_format.py b/lerobot/common/datasets/push_dataset_to_hub/dora_parquet_format.py index 72be130e3..95f9c0071 100644 --- a/lerobot/common/datasets/push_dataset_to_hub/dora_parquet_format.py +++ b/lerobot/common/datasets/push_dataset_to_hub/dora_parquet_format.py @@ -26,8 +26,8 @@ from datasets import Dataset, Features, Image, Sequence, Value from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION +from lerobot.common.datasets.push_dataset_to_hub.utils import calculate_episode_data_index from lerobot.common.datasets.utils import ( - calculate_episode_data_index, hf_transform_to_torch, ) from lerobot.common.datasets.video_utils import VideoFrame diff --git a/lerobot/common/datasets/push_dataset_to_hub/openx_rlds_format.py b/lerobot/common/datasets/push_dataset_to_hub/openx_rlds_format.py index f5744c521..cfe115034 100644 --- a/lerobot/common/datasets/push_dataset_to_hub/openx_rlds_format.py +++ b/lerobot/common/datasets/push_dataset_to_hub/openx_rlds_format.py @@ -42,12 +42,12 @@ from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION from lerobot.common.datasets.push_dataset_to_hub.openx.transforms import OPENX_STANDARDIZATION_TRANSFORMS from lerobot.common.datasets.push_dataset_to_hub.utils import ( + calculate_episode_data_index, 
concatenate_episodes, get_default_encoding, save_images_concurrently, ) from lerobot.common.datasets.utils import ( - calculate_episode_data_index, hf_transform_to_torch, ) from lerobot.common.datasets.video_utils import VideoFrame, encode_video_frames diff --git a/lerobot/common/datasets/push_dataset_to_hub/pusht_zarr_format.py b/lerobot/common/datasets/push_dataset_to_hub/pusht_zarr_format.py index 13d6c837e..27b31ba24 100644 --- a/lerobot/common/datasets/push_dataset_to_hub/pusht_zarr_format.py +++ b/lerobot/common/datasets/push_dataset_to_hub/pusht_zarr_format.py @@ -27,12 +27,12 @@ from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION from lerobot.common.datasets.push_dataset_to_hub.utils import ( + calculate_episode_data_index, concatenate_episodes, get_default_encoding, save_images_concurrently, ) from lerobot.common.datasets.utils import ( - calculate_episode_data_index, hf_transform_to_torch, ) from lerobot.common.datasets.video_utils import VideoFrame, encode_video_frames diff --git a/lerobot/common/datasets/push_dataset_to_hub/umi_zarr_format.py b/lerobot/common/datasets/push_dataset_to_hub/umi_zarr_format.py index d724cf33e..fec893a7f 100644 --- a/lerobot/common/datasets/push_dataset_to_hub/umi_zarr_format.py +++ b/lerobot/common/datasets/push_dataset_to_hub/umi_zarr_format.py @@ -28,12 +28,12 @@ from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION from lerobot.common.datasets.push_dataset_to_hub._umi_imagecodecs_numcodecs import register_codecs from lerobot.common.datasets.push_dataset_to_hub.utils import ( + calculate_episode_data_index, concatenate_episodes, get_default_encoding, save_images_concurrently, ) from lerobot.common.datasets.utils import ( - calculate_episode_data_index, hf_transform_to_torch, ) from lerobot.common.datasets.video_utils import VideoFrame, encode_video_frames diff --git a/lerobot/common/datasets/push_dataset_to_hub/utils.py b/lerobot/common/datasets/push_dataset_to_hub/utils.py index 97b54e45b..ebcf87f77 100644 --- a/lerobot/common/datasets/push_dataset_to_hub/utils.py +++ b/lerobot/common/datasets/push_dataset_to_hub/utils.py @@ -16,7 +16,9 @@ import inspect from concurrent.futures import ThreadPoolExecutor from pathlib import Path +from typing import Dict +import datasets import numpy import PIL import torch @@ -72,3 +74,58 @@ def check_repo_id(repo_id: str) -> None: f"""`repo_id` is expected to contain a community or user id `/` the name of the dataset (e.g. 'lerobot/pusht'), but contains '{repo_id}'.""" ) + + +# TODO(aliberts): remove +def calculate_episode_data_index(hf_dataset: datasets.Dataset) -> Dict[str, torch.Tensor]: + """ + Calculate episode data index for the provided HuggingFace Dataset. Relies on episode_index column of hf_dataset. + + Parameters: + - hf_dataset (datasets.Dataset): A HuggingFace dataset containing the episode index. + + Returns: + - episode_data_index: A dictionary containing the data index for each episode. The dictionary has two keys: + - "from": A tensor containing the starting index of each episode. + - "to": A tensor containing the ending index of each episode. + """ + episode_data_index = {"from": [], "to": []} + + current_episode = None + """ + The episode_index is a list of integers, each representing the episode index of the corresponding example. 
+ For instance, the following is a valid episode_index: + [0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2] + + Below, we iterate through the episode_index and populate the episode_data_index dictionary with the starting and + ending index of each episode. For the episode_index above, the episode_data_index dictionary will look like this: + { + "from": [0, 3, 7], + "to": [3, 7, 12] + } + """ + if len(hf_dataset) == 0: + episode_data_index = { + "from": torch.tensor([]), + "to": torch.tensor([]), + } + return episode_data_index + for idx, episode_idx in enumerate(hf_dataset["episode_index"]): + if episode_idx != current_episode: + # We encountered a new episode, so we append its starting location to the "from" list + episode_data_index["from"].append(idx) + # If this is not the first episode, we append the ending location of the previous episode to the "to" list + if current_episode is not None: + episode_data_index["to"].append(idx) + # Let's keep track of the current episode index + current_episode = episode_idx + else: + # We are still in the same episode, so there is nothing for us to do here + pass + # We have reached the end of the dataset, so we append the ending location of the last episode to the "to" list + episode_data_index["to"].append(idx + 1) + + for k in ["from", "to"]: + episode_data_index[k] = torch.tensor(episode_data_index[k]) + + return episode_data_index diff --git a/lerobot/common/datasets/push_dataset_to_hub/xarm_pkl_format.py b/lerobot/common/datasets/push_dataset_to_hub/xarm_pkl_format.py index ad1cb560e..0047e48c3 100644 --- a/lerobot/common/datasets/push_dataset_to_hub/xarm_pkl_format.py +++ b/lerobot/common/datasets/push_dataset_to_hub/xarm_pkl_format.py @@ -27,12 +27,12 @@ from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION from lerobot.common.datasets.push_dataset_to_hub.utils import ( + calculate_episode_data_index, concatenate_episodes, get_default_encoding, save_images_concurrently, ) from lerobot.common.datasets.utils import ( - calculate_episode_data_index, hf_transform_to_torch, ) from lerobot.common.datasets.video_utils import VideoFrame, encode_video_frames diff --git a/lerobot/common/datasets/utils.py b/lerobot/common/datasets/utils.py index d6aef15f5..5f088b118 100644 --- a/lerobot/common/datasets/utils.py +++ b/lerobot/common/datasets/utils.py @@ -14,30 +14,56 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import json -import re -import warnings -from functools import cache +import logging +import textwrap +from itertools import accumulate from pathlib import Path -from typing import Dict +from pprint import pformat +from typing import Any import datasets +import jsonlines +import numpy as np +import pyarrow.compute as pc import torch -from datasets import load_dataset, load_from_disk -from huggingface_hub import DatasetCard, HfApi, hf_hub_download, snapshot_download +from datasets.table import embed_table_storage +from huggingface_hub import DatasetCard, DatasetCardData, HfApi from PIL import Image as PILImage -from safetensors.torch import load_file from torchvision import transforms +from lerobot.common.robot_devices.robots.utils import Robot + +DEFAULT_CHUNK_SIZE = 1000 # Max number of episodes per chunk + +INFO_PATH = "meta/info.json" +EPISODES_PATH = "meta/episodes.jsonl" +STATS_PATH = "meta/stats.json" +TASKS_PATH = "meta/tasks.jsonl" + +DEFAULT_VIDEO_PATH = "videos/chunk-{episode_chunk:03d}/{video_key}/episode_{episode_index:06d}.mp4" +DEFAULT_PARQUET_PATH = "data/chunk-{episode_chunk:03d}/episode_{episode_index:06d}.parquet" +DEFAULT_IMAGE_PATH = "images/{image_key}/episode_{episode_index:06d}/frame_{frame_index:06d}.png" + DATASET_CARD_TEMPLATE = """ --- # Metadata will go there --- This dataset was created using [LeRobot](https://github.com/huggingface/lerobot). +## {} + """ +DEFAULT_FEATURES = { + "timestamp": {"dtype": "float32", "shape": (1,), "names": None}, + "frame_index": {"dtype": "int64", "shape": (1,), "names": None}, + "episode_index": {"dtype": "int64", "shape": (1,), "names": None}, + "index": {"dtype": "int64", "shape": (1,), "names": None}, + "task_index": {"dtype": "int64", "shape": (1,), "names": None}, +} + -def flatten_dict(d, parent_key="", sep="/"): +def flatten_dict(d: dict, parent_key: str = "", sep: str = "/") -> dict: """Flatten a nested dictionary structure by collapsing nested keys into one key with a separator. 
For example: @@ -56,7 +82,7 @@ def flatten_dict(d, parent_key="", sep="/"): return dict(items) -def unflatten_dict(d, sep="/"): +def unflatten_dict(d: dict, sep: str = "/") -> dict: outdict = {} for key, value in d.items(): parts = key.split(sep) @@ -69,6 +95,82 @@ def unflatten_dict(d, sep="/"): return outdict +def serialize_dict(stats: dict[str, torch.Tensor | np.ndarray | dict]) -> dict: + serialized_dict = {key: value.tolist() for key, value in flatten_dict(stats).items()} + return unflatten_dict(serialized_dict) + + +def write_parquet(dataset: datasets.Dataset, fpath: Path) -> None: + # Embed image bytes into the table before saving to parquet + format = dataset.format + dataset = dataset.with_format("arrow") + dataset = dataset.map(embed_table_storage, batched=False) + dataset = dataset.with_format(**format) + dataset.to_parquet(fpath) + + +def load_json(fpath: Path) -> Any: + with open(fpath) as f: + return json.load(f) + + +def write_json(data: dict, fpath: Path) -> None: + fpath.parent.mkdir(exist_ok=True, parents=True) + with open(fpath, "w") as f: + json.dump(data, f, indent=4, ensure_ascii=False) + + +def load_jsonlines(fpath: Path) -> list[Any]: + with jsonlines.open(fpath, "r") as reader: + return list(reader) + + +def write_jsonlines(data: dict, fpath: Path) -> None: + fpath.parent.mkdir(exist_ok=True, parents=True) + with jsonlines.open(fpath, "w") as writer: + writer.write_all(data) + + +def append_jsonlines(data: dict, fpath: Path) -> None: + fpath.parent.mkdir(exist_ok=True, parents=True) + with jsonlines.open(fpath, "a") as writer: + writer.write(data) + + +def load_info(local_dir: Path) -> dict: + info = load_json(local_dir / INFO_PATH) + for ft in info["features"].values(): + ft["shape"] = tuple(ft["shape"]) + return info + + +def load_stats(local_dir: Path) -> dict: + if not (local_dir / STATS_PATH).exists(): + return None + stats = load_json(local_dir / STATS_PATH) + stats = {key: torch.tensor(value) for key, value in flatten_dict(stats).items()} + return unflatten_dict(stats) + + +def load_tasks(local_dir: Path) -> dict: + tasks = load_jsonlines(local_dir / TASKS_PATH) + return {item["task_index"]: item["task"] for item in sorted(tasks, key=lambda x: x["task_index"])} + + +def load_episodes(local_dir: Path) -> dict: + return load_jsonlines(local_dir / EPISODES_PATH) + + +def load_image_as_numpy(fpath: str | Path, dtype="float32", channel_first: bool = True) -> np.ndarray: + img = PILImage.open(fpath).convert("RGB") + img_array = np.array(img, dtype=dtype) + if channel_first: # (H, W, C) -> (C, H, W) + img_array = np.transpose(img_array, (2, 0, 1)) + if "float" in dtype: + img_array /= 255.0 + return img_array + + def hf_transform_to_torch(items_dict: dict[torch.Tensor | None]): """Get a transform function that convert items from Hugging Face dataset (pyarrow) to torch tensors. 
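A quick hedged sketch of the new jsonlines-based metadata helpers defined above (`append_jsonlines`, `load_tasks`, `load_episodes`): the root path and records below are made up, and the record shapes follow the `tasks.jsonl` / `episodes.jsonl` entries written by `LeRobotDatasetMetadata.save_episode`.

```python
from pathlib import Path

from lerobot.common.datasets.utils import (
    EPISODES_PATH,
    TASKS_PATH,
    append_jsonlines,
    load_episodes,
    load_tasks,
)

root = Path("/tmp/toy_dataset")  # placeholder local dataset root

# meta/tasks.jsonl holds one {"task_index", "task"} record per line.
append_jsonlines({"task_index": 0, "task": "Push the T-shaped block onto the T-shaped target."}, root / TASKS_PATH)
# meta/episodes.jsonl holds one {"episode_index", "tasks", "length"} record per line.
append_jsonlines({"episode_index": 0, "tasks": ["Push the T-shaped block onto the T-shaped target."], "length": 90}, root / EPISODES_PATH)

print(load_tasks(root))     # {0: 'Push the T-shaped block onto the T-shaped target.'}
print(load_episodes(root))  # [{'episode_index': 0, 'tasks': [...], 'length': 90}]
```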
Importantly, images are converted from PIL, which corresponds to @@ -80,14 +182,6 @@ def hf_transform_to_torch(items_dict: dict[torch.Tensor | None]): if isinstance(first_item, PILImage.Image): to_tensor = transforms.ToTensor() items_dict[key] = [to_tensor(img) for img in items_dict[key]] - elif isinstance(first_item, str): - # TODO (michel-aractingi): add str2embedding via language tokenizer - # For now we leave this part up to the user to choose how to address - # language conditioned tasks - pass - elif isinstance(first_item, dict) and "path" in first_item and "timestamp" in first_item: - # video frame will be processed downstream - pass elif first_item is None: pass else: @@ -95,19 +189,67 @@ def hf_transform_to_torch(items_dict: dict[torch.Tensor | None]): return items_dict -@cache -def get_hf_dataset_safe_version(repo_id: str, version: str) -> str: +def _get_major_minor(version: str) -> tuple[int]: + split = version.strip("v").split(".") + return int(split[0]), int(split[1]) + + +class BackwardCompatibilityError(Exception): + def __init__(self, repo_id, version): + message = textwrap.dedent(f""" + BackwardCompatibilityError: The dataset you requested ({repo_id}) is in {version} format. + + We introduced a new format since v2.0 which is not backward compatible with v1.x. + Please, use our conversion script. Modify the following command with your own task description: + ``` + python lerobot/common/datasets/v2/convert_dataset_v1_to_v2.py \\ + --repo-id {repo_id} \\ + --single-task "TASK DESCRIPTION." # <---- /!\\ Replace TASK DESCRIPTION /!\\ + ``` + + A few examples to replace TASK DESCRIPTION: "Pick up the blue cube and place it into the bin.", + "Insert the peg into the socket.", "Slide open the ziploc bag.", "Take the elevator to the 1st floor.", + "Open the top cabinet, store the pot inside it then close the cabinet.", "Push the T-shaped block onto the T-shaped target.", + "Grab the spray paint on the shelf and place it in the bin on top of the robot dog.", "Fold the sweatshirt.", ... + + If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) + or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + """) + super().__init__(message) + + +def check_version_compatibility( + repo_id: str, version_to_check: str, current_version: str, enforce_breaking_major: bool = True +) -> None: + current_major, _ = _get_major_minor(current_version) + major_to_check, _ = _get_major_minor(version_to_check) + if major_to_check < current_major and enforce_breaking_major: + raise BackwardCompatibilityError(repo_id, version_to_check) + elif float(version_to_check.strip("v")) < float(current_version.strip("v")): + logging.warning( + f"""The dataset you requested ({repo_id}) was created with a previous version ({version_to_check}) of the + codebase. The current codebase version is {current_version}. You should be fine since + backward compatibility is maintained. 
If you encounter a problem, contact LeRobot maintainers on + Discord ('https://discord.com/invite/s3KuuzsPFb') or open an issue on github.""", + ) + + +def get_hub_safe_version(repo_id: str, version: str) -> str: api = HfApi() dataset_info = api.list_repo_refs(repo_id, repo_type="dataset") branches = [b.name for b in dataset_info.branches] if version not in branches: - warnings.warn( + num_version = float(version.strip("v")) + hub_num_versions = [float(v.strip("v")) for v in branches if v.startswith("v")] + if num_version >= 2.0 and all(v < 2.0 for v in hub_num_versions): + raise BackwardCompatibilityError(repo_id, version) + + logging.warning( f"""You are trying to load a dataset from {repo_id} created with a previous version of the codebase. The following versions are available: {branches}. The requested version ('{version}') is not found. You should be fine since backward compatibility is maintained. If you encounter a problem, contact LeRobot maintainers on Discord ('https://discord.com/invite/s3KuuzsPFb') or open an issue on github.""", - stacklevel=1, ) if "main" not in branches: raise ValueError(f"Version 'main' not found on {repo_id}") @@ -116,275 +258,184 @@ def get_hf_dataset_safe_version(repo_id: str, version: str) -> str: return version -def load_hf_dataset(repo_id: str, version: str, root: Path, split: str) -> datasets.Dataset: - """hf_dataset contains all the observations, states, actions, rewards, etc.""" - if root is not None: - hf_dataset = load_from_disk(str(Path(root) / repo_id / "train")) - # TODO(rcadene): clean this which enables getting a subset of dataset - if split != "train": - if "%" in split: - raise NotImplementedError(f"We dont support splitting based on percentage for now ({split}).") - match_from = re.search(r"train\[(\d+):\]", split) - match_to = re.search(r"train\[:(\d+)\]", split) - if match_from: - from_frame_index = int(match_from.group(1)) - hf_dataset = hf_dataset.select(range(from_frame_index, len(hf_dataset))) - elif match_to: - to_frame_index = int(match_to.group(1)) - hf_dataset = hf_dataset.select(range(to_frame_index)) - else: - raise ValueError( - f'`split` ({split}) should either be "train", "train[INT:]", or "train[:INT]"' - ) - else: - safe_version = get_hf_dataset_safe_version(repo_id, version) - hf_dataset = load_dataset(repo_id, revision=safe_version, split=split) - - hf_dataset.set_transform(hf_transform_to_torch) - return hf_dataset - - -def load_episode_data_index(repo_id, version, root) -> dict[str, torch.Tensor]: - """episode_data_index contains the range of indices for each episode - - Example: - ```python - from_id = episode_data_index["from"][episode_id].item() - to_id = episode_data_index["to"][episode_id].item() - episode_frames = [dataset[i] for i in range(from_id, to_id)] - ``` - """ - if root is not None: - path = Path(root) / repo_id / "meta_data" / "episode_data_index.safetensors" - else: - safe_version = get_hf_dataset_safe_version(repo_id, version) - path = hf_hub_download( - repo_id, "meta_data/episode_data_index.safetensors", repo_type="dataset", revision=safe_version - ) - - return load_file(path) - - -def load_stats(repo_id, version, root) -> dict[str, dict[str, torch.Tensor]]: - """stats contains the statistics per modality computed over the full dataset, such as max, min, mean, std - - Example: - ```python - normalized_action = (action - stats["action"]["mean"]) / stats["action"]["std"] - ``` - """ - if root is not None: - path = Path(root) / repo_id / "meta_data" / "stats.safetensors" - else: - safe_version = 
get_hf_dataset_safe_version(repo_id, version) - path = hf_hub_download( - repo_id, "meta_data/stats.safetensors", repo_type="dataset", revision=safe_version - ) +def get_hf_features_from_features(features: dict) -> datasets.Features: + hf_features = {} + for key, ft in features.items(): + if ft["dtype"] == "video": + continue + elif ft["dtype"] == "image": + hf_features[key] = datasets.Image() + elif ft["shape"] == (1,): + hf_features[key] = datasets.Value(dtype=ft["dtype"]) + else: + assert len(ft["shape"]) == 1 + hf_features[key] = datasets.Sequence( + length=ft["shape"][0], feature=datasets.Value(dtype=ft["dtype"]) + ) - stats = load_file(path) - return unflatten_dict(stats) + return datasets.Features(hf_features) -def load_info(repo_id, version, root) -> dict: - """info contains useful information regarding the dataset that are not stored elsewhere +def get_features_from_robot(robot: Robot, use_videos: bool = True) -> dict: + camera_ft = {} + if robot.cameras: + camera_ft = { + key: {"dtype": "video" if use_videos else "image", **ft} + for key, ft in robot.camera_features.items() + } + return {**robot.motor_features, **camera_ft, **DEFAULT_FEATURES} + + +def create_empty_dataset_info( + codebase_version: str, + fps: int, + robot_type: str, + features: dict, + use_videos: bool, +) -> dict: + return { + "codebase_version": codebase_version, + "robot_type": robot_type, + "total_episodes": 0, + "total_frames": 0, + "total_tasks": 0, + "total_videos": 0, + "total_chunks": 0, + "chunks_size": DEFAULT_CHUNK_SIZE, + "fps": fps, + "splits": {}, + "data_path": DEFAULT_PARQUET_PATH, + "video_path": DEFAULT_VIDEO_PATH if use_videos else None, + "features": features, + } - Example: - ```python - print("frame per second used to collect the video", info["fps"]) - ``` - """ - if root is not None: - path = Path(root) / repo_id / "meta_data" / "info.json" - else: - safe_version = get_hf_dataset_safe_version(repo_id, version) - path = hf_hub_download(repo_id, "meta_data/info.json", repo_type="dataset", revision=safe_version) - with open(path) as f: - info = json.load(f) - return info +def get_episode_data_index( + episode_dicts: list[dict], episodes: list[int] | None = None +) -> dict[str, torch.Tensor]: + episode_lengths = {ep_idx: ep_dict["length"] for ep_idx, ep_dict in enumerate(episode_dicts)} + if episodes is not None: + episode_lengths = {ep_idx: episode_lengths[ep_idx] for ep_idx in episodes} + cumulative_lenghts = list(accumulate(episode_lengths.values())) + return { + "from": torch.LongTensor([0] + cumulative_lenghts[:-1]), + "to": torch.LongTensor(cumulative_lenghts), + } -def load_videos(repo_id, version, root) -> Path: - if root is not None: - path = Path(root) / repo_id / "videos" - else: - # TODO(rcadene): we download the whole repo here. 
see if we can avoid this - safe_version = get_hf_dataset_safe_version(repo_id, version) - repo_dir = snapshot_download(repo_id, repo_type="dataset", revision=safe_version) - path = Path(repo_dir) / "videos" - return path +def calculate_total_episode( + hf_dataset: datasets.Dataset, raise_if_not_contiguous: bool = True +) -> dict[str, torch.Tensor]: + episode_indices = sorted(hf_dataset.unique("episode_index")) + total_episodes = len(episode_indices) + if raise_if_not_contiguous and episode_indices != list(range(total_episodes)): + raise ValueError("episode_index values are not sorted and contiguous.") + return total_episodes + + +def calculate_episode_data_index(hf_dataset: datasets.Dataset) -> dict[str, torch.Tensor]: + episode_lengths = [] + table = hf_dataset.data.table + total_episodes = calculate_total_episode(hf_dataset) + for ep_idx in range(total_episodes): + ep_table = table.filter(pc.equal(table["episode_index"], ep_idx)) + episode_lengths.insert(ep_idx, len(ep_table)) + + cumulative_lenghts = list(accumulate(episode_lengths)) + return { + "from": torch.LongTensor([0] + cumulative_lenghts[:-1]), + "to": torch.LongTensor(cumulative_lenghts), + } -def load_previous_and_future_frames( - item: dict[str, torch.Tensor], +def check_timestamps_sync( hf_dataset: datasets.Dataset, episode_data_index: dict[str, torch.Tensor], - delta_timestamps: dict[str, list[float]], + fps: int, tolerance_s: float, -) -> dict[torch.Tensor]: + raise_value_error: bool = True, +) -> bool: """ - Given a current item in the dataset containing a timestamp (e.g. 0.6 seconds), and a list of time differences of - some modalities (e.g. delta_timestamps={"observation.image": [-0.8, -0.2, 0, 0.2]}), this function computes for each - given modality (e.g. "observation.image") a list of query timestamps (e.g. [-0.2, 0.4, 0.6, 0.8]) and loads the closest - frames in the dataset. - - Importantly, when no frame can be found around a query timestamp within a specified tolerance window, this function - raises an AssertionError. When a timestamp is queried before the first available timestamp of the episode or after - the last available timestamp, the violation of the tolerance doesnt raise an AssertionError, and the function - populates a boolean array indicating which frames are outside of the episode range. For instance, this boolean array - is useful during batched training to not supervise actions associated to timestamps coming after the end of the - episode, or to pad the observations in a specific way. Note that by default the observation frames before the start - of the episode are the same as the first frame of the episode. - - Parameters: - - item (dict): A dictionary containing all the data related to a frame. It is the result of `dataset[idx]`. Each key - corresponds to a different modality (e.g., "timestamp", "observation.image", "action"). - - hf_dataset (datasets.Dataset): A dictionary containing the full dataset. Each key corresponds to a different - modality (e.g., "timestamp", "observation.image", "action"). - - episode_data_index (dict): A dictionary containing two keys ("from" and "to") associated to dataset indices. - They indicate the start index and end index of each episode in the dataset. - - delta_timestamps (dict): A dictionary containing lists of delta timestamps for each possible modality to be - retrieved. These deltas are added to the item timestamp to form the query timestamps. 
- - tolerance_s (float, optional): The tolerance level (in seconds) used to determine if a data point is close enough to the query - timestamp by asserting `tol > difference`. It is suggested to set `tol` to a smaller value than the - smallest expected inter-frame period, but large enough to account for jitter. - - Returns: - - The same item with the queried frames for each modality specified in delta_timestamps, with an additional key for - each modality (e.g. "observation.image_is_pad"). - - Raises: - - AssertionError: If any of the frames unexpectedly violate the tolerance level. This could indicate synchronization - issues with timestamps during data collection. + This check is to make sure that each timestamps is separated to the next by 1/fps +/- tolerance to + account for possible numerical error. """ - # get indices of the frames associated to the episode, and their timestamps - ep_id = item["episode_index"].item() - ep_data_id_from = episode_data_index["from"][ep_id].item() - ep_data_id_to = episode_data_index["to"][ep_id].item() - ep_data_ids = torch.arange(ep_data_id_from, ep_data_id_to, 1) - - # load timestamps - ep_timestamps = hf_dataset.select_columns("timestamp")[ep_data_id_from:ep_data_id_to]["timestamp"] - ep_timestamps = torch.stack(ep_timestamps) - - # we make the assumption that the timestamps are sorted - ep_first_ts = ep_timestamps[0] - ep_last_ts = ep_timestamps[-1] - current_ts = item["timestamp"].item() - - for key in delta_timestamps: - # get timestamps used as query to retrieve data of previous/future frames - delta_ts = delta_timestamps[key] - query_ts = current_ts + torch.tensor(delta_ts) - - # compute distances between each query timestamp and all timestamps of all the frames belonging to the episode - dist = torch.cdist(query_ts[:, None], ep_timestamps[:, None], p=1) - min_, argmin_ = dist.min(1) - - # TODO(rcadene): synchronize timestamps + interpolation if needed - - is_pad = min_ > tolerance_s - - # check violated query timestamps are all outside the episode range - assert ((query_ts[is_pad] < ep_first_ts) | (ep_last_ts < query_ts[is_pad])).all(), ( - f"One or several timestamps unexpectedly violate the tolerance ({min_} > {tolerance_s=}) inside episode range." - "This might be due to synchronization issues with timestamps during data collection." - ) - - # get dataset indices corresponding to frames to be loaded - data_ids = ep_data_ids[argmin_] - - # load frames modality - item[key] = hf_dataset.select_columns(key)[data_ids][key] - - if isinstance(item[key][0], dict) and "path" in item[key][0]: - # video mode where frame are expressed as dict of path and timestamp - item[key] = item[key] - else: - item[key] = torch.stack(item[key]) - - item[f"{key}_is_pad"] = is_pad - - return item - - -def calculate_episode_data_index(hf_dataset: datasets.Dataset) -> Dict[str, torch.Tensor]: + timestamps = torch.stack(hf_dataset["timestamp"]) + diffs = torch.diff(timestamps) + within_tolerance = torch.abs(diffs - 1 / fps) <= tolerance_s + + # We mask differences between the timestamp at the end of an episode + # and the one at the start of the next episode since these are expected + # to be outside tolerance. 
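+    # Illustrative example (assumed values, not enforced defaults): with fps=30 and tolerance_s=1e-4,
+    # consecutive timestamps should differ by 1/30 s (~0.0333 s); a diff of 0.0334 s passes, while a
+    # diff of 0.0500 s is flagged, unless it falls on one of the episode boundaries masked below.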
+ mask = torch.ones(len(diffs), dtype=torch.bool) + ignored_diffs = episode_data_index["to"][:-1] - 1 + mask[ignored_diffs] = False + filtered_within_tolerance = within_tolerance[mask] + + if not torch.all(filtered_within_tolerance): + # Track original indices before masking + original_indices = torch.arange(len(diffs)) + filtered_indices = original_indices[mask] + outside_tolerance_filtered_indices = torch.nonzero(~filtered_within_tolerance) # .squeeze() + outside_tolerance_indices = filtered_indices[outside_tolerance_filtered_indices] + episode_indices = torch.stack(hf_dataset["episode_index"]) + + outside_tolerances = [] + for idx in outside_tolerance_indices: + entry = { + "timestamps": [timestamps[idx], timestamps[idx + 1]], + "diff": diffs[idx], + "episode_index": episode_indices[idx].item(), + } + outside_tolerances.append(entry) + + if raise_value_error: + raise ValueError( + f"""One or several timestamps unexpectedly violate the tolerance inside episode range. + This might be due to synchronization issues with timestamps during data collection. + \n{pformat(outside_tolerances)}""" + ) + return False + + return True + + +def check_delta_timestamps( + delta_timestamps: dict[str, list[float]], fps: int, tolerance_s: float, raise_value_error: bool = True +) -> bool: + """This will check if all the values in delta_timestamps are multiples of 1/fps +/- tolerance. + This is to ensure that these delta_timestamps added to any timestamp from a dataset will themselves be + actual timestamps from the dataset. """ - Calculate episode data index for the provided HuggingFace Dataset. Relies on episode_index column of hf_dataset. + outside_tolerance = {} + for key, delta_ts in delta_timestamps.items(): + within_tolerance = [abs(ts * fps - round(ts * fps)) / fps <= tolerance_s for ts in delta_ts] + if not all(within_tolerance): + outside_tolerance[key] = [ + ts for ts, is_within in zip(delta_ts, within_tolerance, strict=True) if not is_within + ] - Parameters: - - hf_dataset (datasets.Dataset): A HuggingFace dataset containing the episode index. + if len(outside_tolerance) > 0: + if raise_value_error: + raise ValueError( + f""" + The following delta_timestamps are found outside of tolerance range. + Please make sure they are multiples of 1/{fps} +/- tolerance and adjust + their values accordingly. + \n{pformat(outside_tolerance)} + """ + ) + return False - Returns: - - episode_data_index: A dictionary containing the data index for each episode. The dictionary has two keys: - - "from": A tensor containing the starting index of each episode. - - "to": A tensor containing the ending index of each episode. - """ - episode_data_index = {"from": [], "to": []} + return True - current_episode = None - """ - The episode_index is a list of integers, each representing the episode index of the corresponding example. - For instance, the following is a valid episode_index: - [0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2] - - Below, we iterate through the episode_index and populate the episode_data_index dictionary with the starting and - ending index of each episode. 
For the episode_index above, the episode_data_index dictionary will look like this: - { - "from": [0, 3, 7], - "to": [3, 7, 12] - } - """ - if len(hf_dataset) == 0: - episode_data_index = { - "from": torch.tensor([]), - "to": torch.tensor([]), - } - return episode_data_index - for idx, episode_idx in enumerate(hf_dataset["episode_index"]): - if episode_idx != current_episode: - # We encountered a new episode, so we append its starting location to the "from" list - episode_data_index["from"].append(idx) - # If this is not the first episode, we append the ending location of the previous episode to the "to" list - if current_episode is not None: - episode_data_index["to"].append(idx) - # Let's keep track of the current episode index - current_episode = episode_idx - else: - # We are still in the same episode, so there is nothing for us to do here - pass - # We have reached the end of the dataset, so we append the ending location of the last episode to the "to" list - episode_data_index["to"].append(idx + 1) - for k in ["from", "to"]: - episode_data_index[k] = torch.tensor(episode_data_index[k]) +def get_delta_indices(delta_timestamps: dict[str, list[float]], fps: int) -> dict[str, list[int]]: + delta_indices = {} + for key, delta_ts in delta_timestamps.items(): + delta_indices[key] = (torch.tensor(delta_ts) * fps).long().tolist() - return episode_data_index - - -def reset_episode_index(hf_dataset: datasets.Dataset) -> datasets.Dataset: - """Reset the `episode_index` of the provided HuggingFace Dataset. - - `episode_data_index` (and related functionality such as `load_previous_and_future_frames`) requires the - `episode_index` to be sorted, continuous (1,1,1 and not 1,2,1) and start at 0. - - This brings the `episode_index` to the required format. - """ - if len(hf_dataset) == 0: - return hf_dataset - unique_episode_idxs = torch.stack(hf_dataset["episode_index"]).unique().tolist() - episode_idx_to_reset_idx_mapping = { - ep_id: reset_ep_id for reset_ep_id, ep_id in enumerate(unique_episode_idxs) - } - - def modify_ep_idx_func(example): - example["episode_index"] = episode_idx_to_reset_idx_mapping[example["episode_index"].item()] - return example - - hf_dataset = hf_dataset.map(modify_ep_idx_func) - - return hf_dataset + return delta_indices def cycle(iterable): @@ -400,7 +451,7 @@ def cycle(iterable): iterator = iter(iterable) -def create_branch(repo_id, *, branch: str, repo_type: str | None = None): +def create_branch(repo_id, *, branch: str, repo_type: str | None = None) -> None: """Create a branch on a existing Hugging Face repo. Delete the branch if it already exists before creating it. """ @@ -415,12 +466,35 @@ def create_branch(repo_id, *, branch: str, repo_type: str | None = None): api.create_branch(repo_id, repo_type=repo_type, branch=branch) -def create_lerobot_dataset_card(tags: list | None = None, text: str | None = None) -> DatasetCard: - card = DatasetCard(DATASET_CARD_TEMPLATE) - card.data.task_categories = ["robotics"] - card.data.tags = ["LeRobot"] - if tags is not None: - card.data.tags += tags - if text is not None: - card.text += text - return card +def create_lerobot_dataset_card( + tags: list | None = None, + dataset_info: dict | None = None, + **kwargs, +) -> DatasetCard: + """ + Keyword arguments will be used to replace values in ./lerobot/common/datasets/card_template.md. + Note: If specified, license must be one of https://huggingface.co/docs/hub/repositories-licenses. 
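+
+    Example (illustrative sketch, assuming `info` holds the dict returned by `create_empty_dataset_info`
+    and "lerobot/my_dataset" is a placeholder repo id):
+        card = create_lerobot_dataset_card(
+            tags=["aloha", "tutorial"],
+            dataset_info=info,
+            license="apache-2.0",
+        )
+        card.push_to_hub("lerobot/my_dataset", repo_type="dataset")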
+ """ + card_tags = ["LeRobot"] + if tags: + card_tags += tags + if dataset_info: + dataset_structure = "[meta/info.json](meta/info.json):\n" + dataset_structure += f"```json\n{json.dumps(dataset_info, indent=4)}\n```\n" + kwargs = {**kwargs, "dataset_structure": dataset_structure} + card_data = DatasetCardData( + license=kwargs.get("license"), + tags=card_tags, + task_categories=["robotics"], + configs=[ + { + "config_name": "default", + "data_files": "data/*/*.parquet", + } + ], + ) + return DatasetCard.from_template( + card_data=card_data, + template_path="./lerobot/common/datasets/card_template.md", + **kwargs, + ) diff --git a/lerobot/common/datasets/v2/batch_convert_dataset_v1_to_v2.py b/lerobot/common/datasets/v2/batch_convert_dataset_v1_to_v2.py new file mode 100644 index 000000000..c8da2fe14 --- /dev/null +++ b/lerobot/common/datasets/v2/batch_convert_dataset_v1_to_v2.py @@ -0,0 +1,882 @@ +#!/usr/bin/env python + +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This script is for internal use to convert all datasets under the 'lerobot' hub user account to v2. + +Note: Since the original Aloha datasets don't use shadow motors, you need to comment those out in +lerobot/configs/robot/aloha.yaml before running this script. +""" + +import traceback +from pathlib import Path +from textwrap import dedent + +from lerobot import available_datasets +from lerobot.common.datasets.v2.convert_dataset_v1_to_v2 import convert_dataset, parse_robot_config + +LOCAL_DIR = Path("data/") + +ALOHA_CONFIG = Path("lerobot/configs/robot/aloha.yaml") +ALOHA_MOBILE_INFO = { + "robot_config": parse_robot_config(ALOHA_CONFIG), + "license": "mit", + "url": "https://mobile-aloha.github.io/", + "paper": "https://arxiv.org/abs/2401.02117", + "citation_bibtex": dedent(r""" + @inproceedings{fu2024mobile, + author = {Fu, Zipeng and Zhao, Tony Z. 
and Finn, Chelsea}, + title = {Mobile ALOHA: Learning Bimanual Mobile Manipulation with Low-Cost Whole-Body Teleoperation}, + booktitle = {arXiv}, + year = {2024}, + }""").lstrip(), +} +ALOHA_STATIC_INFO = { + "robot_config": parse_robot_config(ALOHA_CONFIG), + "license": "mit", + "url": "https://tonyzhaozh.github.io/aloha/", + "paper": "https://arxiv.org/abs/2304.13705", + "citation_bibtex": dedent(r""" + @article{Zhao2023LearningFB, + title={Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware}, + author={Tony Zhao and Vikash Kumar and Sergey Levine and Chelsea Finn}, + journal={RSS}, + year={2023}, + volume={abs/2304.13705}, + url={https://arxiv.org/abs/2304.13705} + }""").lstrip(), +} +PUSHT_INFO = { + "license": "mit", + "url": "https://diffusion-policy.cs.columbia.edu/", + "paper": "https://arxiv.org/abs/2303.04137v5", + "citation_bibtex": dedent(r""" + @article{chi2024diffusionpolicy, + author = {Cheng Chi and Zhenjia Xu and Siyuan Feng and Eric Cousineau and Yilun Du and Benjamin Burchfiel and Russ Tedrake and Shuran Song}, + title ={Diffusion Policy: Visuomotor Policy Learning via Action Diffusion}, + journal = {The International Journal of Robotics Research}, + year = {2024}, + }""").lstrip(), +} +XARM_INFO = { + "license": "mit", + "url": "https://www.nicklashansen.com/td-mpc/", + "paper": "https://arxiv.org/abs/2203.04955", + "citation_bibtex": dedent(r""" + @inproceedings{Hansen2022tdmpc, + title={Temporal Difference Learning for Model Predictive Control}, + author={Nicklas Hansen and Xiaolong Wang and Hao Su}, + booktitle={ICML}, + year={2022} + } + """), +} +UNITREEH_INFO = { + "license": "apache-2.0", +} + +DATASETS = { + "aloha_mobile_cabinet": { + "single_task": "Open the top cabinet, store the pot inside it then close the cabinet.", + **ALOHA_MOBILE_INFO, + }, + "aloha_mobile_chair": { + "single_task": "Push the chairs in front of the desk to place them against it.", + **ALOHA_MOBILE_INFO, + }, + "aloha_mobile_elevator": { + "single_task": "Take the elevator to the 1st floor.", + **ALOHA_MOBILE_INFO, + }, + "aloha_mobile_shrimp": { + "single_task": "Sauté the raw shrimp on both sides, then serve it in the bowl.", + **ALOHA_MOBILE_INFO, + }, + "aloha_mobile_wash_pan": { + "single_task": "Pick up the pan, rinse it in the sink and then place it in the drying rack.", + **ALOHA_MOBILE_INFO, + }, + "aloha_mobile_wipe_wine": { + "single_task": "Pick up the wet cloth on the faucet and use it to clean the spilled wine on the table and underneath the glass.", + **ALOHA_MOBILE_INFO, + }, + "aloha_static_battery": { + "single_task": "Place the battery into the slot of the remote controller.", + **ALOHA_STATIC_INFO, + }, + "aloha_static_candy": {"single_task": "Pick up the candy and unwrap it.", **ALOHA_STATIC_INFO}, + "aloha_static_coffee": { + "single_task": "Place the coffee capsule inside the capsule container, then place the cup onto the center of the cup tray, then push the 'Hot Water' and 'Travel Mug' buttons.", + **ALOHA_STATIC_INFO, + }, + "aloha_static_coffee_new": { + "single_task": "Place the coffee capsule inside the capsule container, then place the cup onto the center of the cup tray.", + **ALOHA_STATIC_INFO, + }, + "aloha_static_cups_open": { + "single_task": "Pick up the plastic cup and open its lid.", + **ALOHA_STATIC_INFO, + }, + "aloha_static_fork_pick_up": { + "single_task": "Pick up the fork and place it on the plate.", + **ALOHA_STATIC_INFO, + }, + "aloha_static_pingpong_test": { + "single_task": "Transfer one of the two balls in the right glass 
into the left glass, then transfer it back to the right glass.", + **ALOHA_STATIC_INFO, + }, + "aloha_static_pro_pencil": { + "single_task": "Pick up the pencil with the right arm, hand it over to the left arm then place it back onto the table.", + **ALOHA_STATIC_INFO, + }, + "aloha_static_screw_driver": { + "single_task": "Pick up the screwdriver with the right arm, hand it over to the left arm then place it into the cup.", + **ALOHA_STATIC_INFO, + }, + "aloha_static_tape": { + "single_task": "Cut a small piece of tape from the tape dispenser then place it on the cardboard box's edge.", + **ALOHA_STATIC_INFO, + }, + "aloha_static_thread_velcro": { + "single_task": "Pick up the velcro cable tie with the left arm, then insert the end of the velcro tie into the other end's loop with the right arm.", + **ALOHA_STATIC_INFO, + }, + "aloha_static_towel": { + "single_task": "Pick up a piece of paper towel and place it on the spilled liquid.", + **ALOHA_STATIC_INFO, + }, + "aloha_static_vinh_cup": { + "single_task": "Pick up the platic cup with the right arm, then pop its lid open with the left arm.", + **ALOHA_STATIC_INFO, + }, + "aloha_static_vinh_cup_left": { + "single_task": "Pick up the platic cup with the left arm, then pop its lid open with the right arm.", + **ALOHA_STATIC_INFO, + }, + "aloha_static_ziploc_slide": {"single_task": "Slide open the ziploc bag.", **ALOHA_STATIC_INFO}, + "aloha_sim_insertion_scripted": {"single_task": "Insert the peg into the socket.", **ALOHA_STATIC_INFO}, + "aloha_sim_insertion_scripted_image": { + "single_task": "Insert the peg into the socket.", + **ALOHA_STATIC_INFO, + }, + "aloha_sim_insertion_human": {"single_task": "Insert the peg into the socket.", **ALOHA_STATIC_INFO}, + "aloha_sim_insertion_human_image": { + "single_task": "Insert the peg into the socket.", + **ALOHA_STATIC_INFO, + }, + "aloha_sim_transfer_cube_scripted": { + "single_task": "Pick up the cube with the right arm and transfer it to the left arm.", + **ALOHA_STATIC_INFO, + }, + "aloha_sim_transfer_cube_scripted_image": { + "single_task": "Pick up the cube with the right arm and transfer it to the left arm.", + **ALOHA_STATIC_INFO, + }, + "aloha_sim_transfer_cube_human": { + "single_task": "Pick up the cube with the right arm and transfer it to the left arm.", + **ALOHA_STATIC_INFO, + }, + "aloha_sim_transfer_cube_human_image": { + "single_task": "Pick up the cube with the right arm and transfer it to the left arm.", + **ALOHA_STATIC_INFO, + }, + "pusht": {"single_task": "Push the T-shaped block onto the T-shaped target.", **PUSHT_INFO}, + "pusht_image": {"single_task": "Push the T-shaped block onto the T-shaped target.", **PUSHT_INFO}, + "unitreeh1_fold_clothes": {"single_task": "Fold the sweatshirt.", **UNITREEH_INFO}, + "unitreeh1_rearrange_objects": {"single_task": "Put the object into the bin.", **UNITREEH_INFO}, + "unitreeh1_two_robot_greeting": { + "single_task": "Greet the other robot with a high five.", + **UNITREEH_INFO, + }, + "unitreeh1_warehouse": { + "single_task": "Grab the spray paint on the shelf and place it in the bin on top of the robot dog.", + **UNITREEH_INFO, + }, + "xarm_lift_medium": {"single_task": "Pick up the cube and lift it.", **XARM_INFO}, + "xarm_lift_medium_image": {"single_task": "Pick up the cube and lift it.", **XARM_INFO}, + "xarm_lift_medium_replay": {"single_task": "Pick up the cube and lift it.", **XARM_INFO}, + "xarm_lift_medium_replay_image": {"single_task": "Pick up the cube and lift it.", **XARM_INFO}, + "xarm_push_medium": {"single_task": "Push 
the cube onto the target.", **XARM_INFO}, + "xarm_push_medium_image": {"single_task": "Push the cube onto the target.", **XARM_INFO}, + "xarm_push_medium_replay": {"single_task": "Push the cube onto the target.", **XARM_INFO}, + "xarm_push_medium_replay_image": {"single_task": "Push the cube onto the target.", **XARM_INFO}, + "umi_cup_in_the_wild": { + "single_task": "Put the cup on the plate.", + "license": "apache-2.0", + }, + "asu_table_top": { + "tasks_col": "language_instruction", + "license": "mit", + "paper": "https://link.springer.com/article/10.1007/s10514-023-10129-1", + "citation_bibtex": dedent(r""" + @inproceedings{zhou2023modularity, + title={Modularity through Attention: Efficient Training and Transfer of Language-Conditioned Policies for Robot Manipulation}, + author={Zhou, Yifan and Sonawani, Shubham and Phielipp, Mariano and Stepputtis, Simon and Amor, Heni}, + booktitle={Conference on Robot Learning}, + pages={1684--1695}, + year={2023}, + organization={PMLR} + } + @article{zhou2023learning, + title={Learning modular language-conditioned robot policies through attention}, + author={Zhou, Yifan and Sonawani, Shubham and Phielipp, Mariano and Ben Amor, Heni and Stepputtis, Simon}, + journal={Autonomous Robots}, + pages={1--21}, + year={2023}, + publisher={Springer} + }""").lstrip(), + }, + "austin_buds_dataset": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://ut-austin-rpl.github.io/BUDS-website/", + "paper": "https://arxiv.org/abs/2109.13841", + "citation_bibtex": dedent(r""" + @article{zhu2022bottom, + title={Bottom-Up Skill Discovery From Unsegmented Demonstrations for Long-Horizon Robot Manipulation}, + author={Zhu, Yifeng and Stone, Peter and Zhu, Yuke}, + journal={IEEE Robotics and Automation Letters}, + volume={7}, + number={2}, + pages={4126--4133}, + year={2022}, + publisher={IEEE} + }""").lstrip(), + }, + "austin_sailor_dataset": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://ut-austin-rpl.github.io/sailor/", + "paper": "https://arxiv.org/abs/2210.11435", + "citation_bibtex": dedent(r""" + @inproceedings{nasiriany2022sailor, + title={Learning and Retrieval from Prior Data for Skill-based Imitation Learning}, + author={Soroush Nasiriany and Tian Gao and Ajay Mandlekar and Yuke Zhu}, + booktitle={Conference on Robot Learning (CoRL)}, + year={2022} + }""").lstrip(), + }, + "austin_sirius_dataset": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://ut-austin-rpl.github.io/sirius/", + "paper": "https://arxiv.org/abs/2211.08416", + "citation_bibtex": dedent(r""" + @inproceedings{liu2022robot, + title = {Robot Learning on the Job: Human-in-the-Loop Autonomy and Learning During Deployment}, + author = {Huihan Liu and Soroush Nasiriany and Lance Zhang and Zhiyao Bao and Yuke Zhu}, + booktitle = {Robotics: Science and Systems (RSS)}, + year = {2023} + }""").lstrip(), + }, + "berkeley_autolab_ur5": { + "tasks_col": "language_instruction", + "license": "cc-by-4.0", + "url": "https://sites.google.com/view/berkeley-ur5/home", + "citation_bibtex": dedent(r""" + @misc{BerkeleyUR5Website, + title = {Berkeley {UR5} Demonstration Dataset}, + author = {Lawrence Yunliang Chen and Simeon Adebola and Ken Goldberg}, + howpublished = {https://sites.google.com/view/berkeley-ur5/home}, + }""").lstrip(), + }, + "berkeley_cable_routing": { + "tasks_col": "language_instruction", + "license": "cc-by-4.0", + "url": "https://sites.google.com/view/cablerouting/home", + "paper": 
"https://arxiv.org/abs/2307.08927", + "citation_bibtex": dedent(r""" + @article{luo2023multistage, + author = {Jianlan Luo and Charles Xu and Xinyang Geng and Gilbert Feng and Kuan Fang and Liam Tan and Stefan Schaal and Sergey Levine}, + title = {Multi-Stage Cable Routing through Hierarchical Imitation Learning}, + journal = {arXiv pre-print}, + year = {2023}, + url = {https://arxiv.org/abs/2307.08927}, + }""").lstrip(), + }, + "berkeley_fanuc_manipulation": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://sites.google.com/berkeley.edu/fanuc-manipulation", + "citation_bibtex": dedent(r""" + @article{fanuc_manipulation2023, + title={Fanuc Manipulation: A Dataset for Learning-based Manipulation with FANUC Mate 200iD Robot}, + author={Zhu, Xinghao and Tian, Ran and Xu, Chenfeng and Ding, Mingyu and Zhan, Wei and Tomizuka, Masayoshi}, + year={2023}, + }""").lstrip(), + }, + "berkeley_gnm_cory_hall": { + "tasks_col": "language_instruction", + "license": "mit", + "paper": "https://arxiv.org/abs/1709.10489", + "citation_bibtex": dedent(r""" + @inproceedings{kahn2018self, + title={Self-supervised deep reinforcement learning with generalized computation graphs for robot navigation}, + author={Kahn, Gregory and Villaflor, Adam and Ding, Bosen and Abbeel, Pieter and Levine, Sergey}, + booktitle={2018 IEEE international conference on robotics and automation (ICRA)}, + pages={5129--5136}, + year={2018}, + organization={IEEE} + }""").lstrip(), + }, + "berkeley_gnm_recon": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://sites.google.com/view/recon-robot", + "paper": "https://arxiv.org/abs/2104.05859", + "citation_bibtex": dedent(r""" + @inproceedings{shah2021rapid, + title={Rapid Exploration for Open-World Navigation with Latent Goal Models}, + author={Dhruv Shah and Benjamin Eysenbach and Nicholas Rhinehart and Sergey Levine}, + booktitle={5th Annual Conference on Robot Learning }, + year={2021}, + url={https://openreview.net/forum?id=d_SWJhyKfVw} + }""").lstrip(), + }, + "berkeley_gnm_sac_son": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://sites.google.com/view/SACSoN-review", + "paper": "https://arxiv.org/abs/2306.01874", + "citation_bibtex": dedent(r""" + @article{hirose2023sacson, + title={SACSoN: Scalable Autonomous Data Collection for Social Navigation}, + author={Hirose, Noriaki and Shah, Dhruv and Sridhar, Ajay and Levine, Sergey}, + journal={arXiv preprint arXiv:2306.01874}, + year={2023} + }""").lstrip(), + }, + "berkeley_mvp": { + "tasks_col": "language_instruction", + "license": "mit", + "paper": "https://arxiv.org/abs/2203.06173", + "citation_bibtex": dedent(r""" + @InProceedings{Radosavovic2022, + title = {Real-World Robot Learning with Masked Visual Pre-training}, + author = {Ilija Radosavovic and Tete Xiao and Stephen James and Pieter Abbeel and Jitendra Malik and Trevor Darrell}, + booktitle = {CoRL}, + year = {2022} + }""").lstrip(), + }, + "berkeley_rpt": { + "tasks_col": "language_instruction", + "license": "mit", + "paper": "https://arxiv.org/abs/2306.10007", + "citation_bibtex": dedent(r""" + @article{Radosavovic2023, + title={Robot Learning with Sensorimotor Pre-training}, + author={Ilija Radosavovic and Baifeng Shi and Letian Fu and Ken Goldberg and Trevor Darrell and Jitendra Malik}, + year={2023}, + journal={arXiv:2306.10007} + }""").lstrip(), + }, + "cmu_franka_exploration_dataset": { + "tasks_col": "language_instruction", + "license": "mit", + "url": 
"https://human-world-model.github.io/", + "paper": "https://arxiv.org/abs/2308.10901", + "citation_bibtex": dedent(r""" + @inproceedings{mendonca2023structured, + title={Structured World Models from Human Videos}, + author={Mendonca, Russell and Bahl, Shikhar and Pathak, Deepak}, + journal={RSS}, + year={2023} + }""").lstrip(), + }, + "cmu_play_fusion": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://play-fusion.github.io/", + "paper": "https://arxiv.org/abs/2312.04549", + "citation_bibtex": dedent(r""" + @inproceedings{chen2023playfusion, + title={PlayFusion: Skill Acquisition via Diffusion from Language-Annotated Play}, + author={Chen, Lili and Bahl, Shikhar and Pathak, Deepak}, + booktitle={CoRL}, + year={2023} + }""").lstrip(), + }, + "cmu_stretch": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://robo-affordances.github.io/", + "paper": "https://arxiv.org/abs/2304.08488", + "citation_bibtex": dedent(r""" + @inproceedings{bahl2023affordances, + title={Affordances from Human Videos as a Versatile Representation for Robotics}, + author={Bahl, Shikhar and Mendonca, Russell and Chen, Lili and Jain, Unnat and Pathak, Deepak}, + booktitle={CVPR}, + year={2023} + } + @article{mendonca2023structured, + title={Structured World Models from Human Videos}, + author={Mendonca, Russell and Bahl, Shikhar and Pathak, Deepak}, + journal={CoRL}, + year={2023} + }""").lstrip(), + }, + "columbia_cairlab_pusht_real": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://diffusion-policy.cs.columbia.edu/", + "paper": "https://arxiv.org/abs/2303.04137v5", + "citation_bibtex": dedent(r""" + @inproceedings{chi2023diffusionpolicy, + title={Diffusion Policy: Visuomotor Policy Learning via Action Diffusion}, + author={Chi, Cheng and Feng, Siyuan and Du, Yilun and Xu, Zhenjia and Cousineau, Eric and Burchfiel, Benjamin and Song, Shuran}, + booktitle={Proceedings of Robotics: Science and Systems (RSS)}, + year={2023} + }""").lstrip(), + }, + "conq_hose_manipulation": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://sites.google.com/view/conq-hose-manipulation-dataset/home", + "citation_bibtex": dedent(r""" + @misc{ConqHoseManipData, + author={Peter Mitrano and Dmitry Berenson}, + title={Conq Hose Manipulation Dataset, v1.15.0}, + year={2024}, + howpublished={https://sites.google.com/view/conq-hose-manipulation-dataset} + }""").lstrip(), + }, + "dlr_edan_shared_control": { + "tasks_col": "language_instruction", + "license": "mit", + "paper": "https://ieeexplore.ieee.org/document/9341156", + "citation_bibtex": dedent(r""" + @inproceedings{vogel_edan_2020, + title = {EDAN - an EMG-Controlled Daily Assistant to Help People with Physical Disabilities}, + language = {en}, + booktitle = {2020 {IEEE}/{RSJ} {International} {Conference} on {Intelligent} {Robots} and {Systems} ({IROS})}, + author = {Vogel, Jörn and Hagengruber, Annette and Iskandar, Maged and Quere, Gabriel and Leipscher, Ulrike and Bustamante, Samuel and Dietrich, Alexander and Hoeppner, Hannes and Leidner, Daniel and Albu-Schäffer, Alin}, + year = {2020} + } + @inproceedings{quere_shared_2020, + address = {Paris, France}, + title = {Shared {Control} {Templates} for {Assistive} {Robotics}}, + language = {en}, + booktitle = {2020 {IEEE} {International} {Conference} on {Robotics} and {Automation} ({ICRA})}, + author = {Quere, Gabriel and Hagengruber, Annette and Iskandar, Maged and Bustamante, Samuel and Leidner, Daniel and Stulp, Freek and 
Vogel, Joern}, + year = {2020}, + pages = {7}, + }""").lstrip(), + }, + "dlr_sara_grid_clamp": { + "tasks_col": "language_instruction", + "license": "mit", + "paper": "https://www.researchsquare.com/article/rs-3289569/v1", + "citation_bibtex": dedent(r""" + @article{padalkar2023guided, + title={A guided reinforcement learning approach using shared control templates for learning manipulation skills in the real world}, + author={Padalkar, Abhishek and Quere, Gabriel and Raffin, Antonin and Silv{\'e}rio, Jo{\~a}o and Stulp, Freek}, + journal={Research square preprint rs-3289569/v1}, + year={2023} + }""").lstrip(), + }, + "dlr_sara_pour": { + "tasks_col": "language_instruction", + "license": "mit", + "paper": "https://elib.dlr.de/193739/1/padalkar2023rlsct.pdf", + "citation_bibtex": dedent(r""" + @inproceedings{padalkar2023guiding, + title={Guiding Reinforcement Learning with Shared Control Templates}, + author={Padalkar, Abhishek and Quere, Gabriel and Steinmetz, Franz and Raffin, Antonin and Nieuwenhuisen, Matthias and Silv{\'e}rio, Jo{\~a}o and Stulp, Freek}, + booktitle={40th IEEE International Conference on Robotics and Automation, ICRA 2023}, + year={2023}, + organization={IEEE} + }""").lstrip(), + }, + "droid_100": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://droid-dataset.github.io/", + "paper": "https://arxiv.org/abs/2403.12945", + "citation_bibtex": dedent(r""" + @article{khazatsky2024droid, + title = {DROID: A Large-Scale In-The-Wild Robot Manipulation Dataset}, + author = {Alexander Khazatsky and Karl Pertsch and Suraj Nair and Ashwin Balakrishna and Sudeep Dasari and Siddharth Karamcheti and Soroush Nasiriany and Mohan Kumar Srirama and Lawrence Yunliang Chen and Kirsty Ellis and Peter David Fagan and Joey Hejna and Masha Itkina and Marion Lepert and Yecheng Jason Ma and Patrick Tree Miller and Jimmy Wu and Suneel Belkhale and Shivin Dass and Huy Ha and Arhan Jain and Abraham Lee and Youngwoon Lee and Marius Memmel and Sungjae Park and Ilija Radosavovic and Kaiyuan Wang and Albert Zhan and Kevin Black and Cheng Chi and Kyle Beltran Hatch and Shan Lin and Jingpei Lu and Jean Mercat and Abdul Rehman and Pannag R Sanketi and Archit Sharma and Cody Simpson and Quan Vuong and Homer Rich Walke and Blake Wulfe and Ted Xiao and Jonathan Heewon Yang and Arefeh Yavary and Tony Z. Zhao and Christopher Agia and Rohan Baijal and Mateo Guaman Castro and Daphne Chen and Qiuyu Chen and Trinity Chung and Jaimyn Drake and Ethan Paul Foster and Jensen Gao and David Antonio Herrera and Minho Heo and Kyle Hsu and Jiaheng Hu and Donovon Jackson and Charlotte Le and Yunshuang Li and Kevin Lin and Roy Lin and Zehan Ma and Abhiram Maddukuri and Suvir Mirchandani and Daniel Morton and Tony Nguyen and Abigail O'Neill and Rosario Scalise and Derick Seale and Victor Son and Stephen Tian and Emi Tran and Andrew E. Wang and Yilin Wu and Annie Xie and Jingyun Yang and Patrick Yin and Yunchu Zhang and Osbert Bastani and Glen Berseth and Jeannette Bohg and Ken Goldberg and Abhinav Gupta and Abhishek Gupta and Dinesh Jayaraman and Joseph J Lim and Jitendra Malik and Roberto Martín-Martín and Subramanian Ramamoorthy and Dorsa Sadigh and Shuran Song and Jiajun Wu and Michael C. 
Yip and Yuke Zhu and Thomas Kollar and Sergey Levine and Chelsea Finn}, + year = {2024}, + }""").lstrip(), + }, + "fmb": { + "tasks_col": "language_instruction", + "license": "cc-by-4.0", + "url": "https://functional-manipulation-benchmark.github.io/", + "paper": "https://arxiv.org/abs/2401.08553", + "citation_bibtex": dedent(r""" + @article{luo2024fmb, + title={FMB: a Functional Manipulation Benchmark for Generalizable Robotic Learning}, + author={Luo, Jianlan and Xu, Charles and Liu, Fangchen and Tan, Liam and Lin, Zipeng and Wu, Jeffrey and Abbeel, Pieter and Levine, Sergey}, + journal={arXiv preprint arXiv:2401.08553}, + year={2024} + }""").lstrip(), + }, + "iamlab_cmu_pickup_insert": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://openreview.net/forum?id=WuBv9-IGDUA", + "paper": "https://arxiv.org/abs/2401.14502", + "citation_bibtex": dedent(r""" + @inproceedings{saxena2023multiresolution, + title={Multi-Resolution Sensing for Real-Time Control with Vision-Language Models}, + author={Saumya Saxena and Mohit Sharma and Oliver Kroemer}, + booktitle={7th Annual Conference on Robot Learning}, + year={2023}, + url={https://openreview.net/forum?id=WuBv9-IGDUA} + }""").lstrip(), + }, + "imperialcollege_sawyer_wrist_cam": { + "tasks_col": "language_instruction", + "license": "mit", + }, + "jaco_play": { + "tasks_col": "language_instruction", + "license": "cc-by-4.0", + "url": "https://github.com/clvrai/clvr_jaco_play_dataset", + "citation_bibtex": dedent(r""" + @software{dass2023jacoplay, + author = {Dass, Shivin and Yapeter, Jullian and Zhang, Jesse and Zhang, Jiahui + and Pertsch, Karl and Nikolaidis, Stefanos and Lim, Joseph J.}, + title = {CLVR Jaco Play Dataset}, + url = {https://github.com/clvrai/clvr_jaco_play_dataset}, + version = {1.0.0}, + year = {2023} + }""").lstrip(), + }, + "kaist_nonprehensile": { + "tasks_col": "language_instruction", + "license": "cc-by-4.0", + "url": "https://github.com/JaeHyung-Kim/rlds_dataset_builder", + "citation_bibtex": dedent(r""" + @article{kimpre, + title={Pre-and post-contact policy decomposition for non-prehensile manipulation with zero-shot sim-to-real transfer}, + author={Kim, Minchan and Han, Junhyek and Kim, Jaehyung and Kim, Beomjoon}, + booktitle={2023 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)}, + year={2023}, + organization={IEEE} + }""").lstrip(), + }, + "nyu_door_opening_surprising_effectiveness": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://jyopari.github.io/VINN/", + "paper": "https://arxiv.org/abs/2112.01511", + "citation_bibtex": dedent(r""" + @misc{pari2021surprising, + title={The Surprising Effectiveness of Representation Learning for Visual Imitation}, + author={Jyothish Pari and Nur Muhammad Shafiullah and Sridhar Pandian Arunachalam and Lerrel Pinto}, + year={2021}, + eprint={2112.01511}, + archivePrefix={arXiv}, + primaryClass={cs.RO} + }""").lstrip(), + }, + "nyu_franka_play_dataset": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://play-to-policy.github.io/", + "paper": "https://arxiv.org/abs/2210.10047", + "citation_bibtex": dedent(r""" + @article{cui2022play, + title = {From Play to Policy: Conditional Behavior Generation from Uncurated Robot Data}, + author = {Cui, Zichen Jeff and Wang, Yibin and Shafiullah, Nur Muhammad Mahi and Pinto, Lerrel}, + journal = {arXiv preprint arXiv:2210.10047}, + year = {2022} + }""").lstrip(), + }, + "nyu_rot_dataset": { + "tasks_col": "language_instruction", + 
"license": "mit", + "url": "https://rot-robot.github.io/", + "paper": "https://arxiv.org/abs/2206.15469", + "citation_bibtex": dedent(r""" + @inproceedings{haldar2023watch, + title={Watch and match: Supercharging imitation with regularized optimal transport}, + author={Haldar, Siddhant and Mathur, Vaibhav and Yarats, Denis and Pinto, Lerrel}, + booktitle={Conference on Robot Learning}, + pages={32--43}, + year={2023}, + organization={PMLR} + }""").lstrip(), + }, + "roboturk": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://roboturk.stanford.edu/dataset_real.html", + "paper": "PAPER", + "citation_bibtex": dedent(r""" + @inproceedings{mandlekar2019scaling, + title={Scaling robot supervision to hundreds of hours with roboturk: Robotic manipulation dataset through human reasoning and dexterity}, + author={Mandlekar, Ajay and Booher, Jonathan and Spero, Max and Tung, Albert and Gupta, Anchit and Zhu, Yuke and Garg, Animesh and Savarese, Silvio and Fei-Fei, Li}, + booktitle={2019 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)}, + pages={1048--1055}, + year={2019}, + organization={IEEE} + }""").lstrip(), + }, + "stanford_hydra_dataset": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://sites.google.com/view/hydra-il-2023", + "paper": "https://arxiv.org/abs/2306.17237", + "citation_bibtex": dedent(r""" + @article{belkhale2023hydra, + title={HYDRA: Hybrid Robot Actions for Imitation Learning}, + author={Belkhale, Suneel and Cui, Yuchen and Sadigh, Dorsa}, + journal={arxiv}, + year={2023} + }""").lstrip(), + }, + "stanford_kuka_multimodal_dataset": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://sites.google.com/view/visionandtouch", + "paper": "https://arxiv.org/abs/1810.10191", + "citation_bibtex": dedent(r""" + @inproceedings{lee2019icra, + title={Making sense of vision and touch: Self-supervised learning of multimodal representations for contact-rich tasks}, + author={Lee, Michelle A and Zhu, Yuke and Srinivasan, Krishnan and Shah, Parth and Savarese, Silvio and Fei-Fei, Li and Garg, Animesh and Bohg, Jeannette}, + booktitle={2019 IEEE International Conference on Robotics and Automation (ICRA)}, + year={2019}, + url={https://arxiv.org/abs/1810.10191} + }""").lstrip(), + }, + "stanford_robocook": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://hshi74.github.io/robocook/", + "paper": "https://arxiv.org/abs/2306.14447", + "citation_bibtex": dedent(r""" + @article{shi2023robocook, + title={RoboCook: Long-Horizon Elasto-Plastic Object Manipulation with Diverse Tools}, + author={Shi, Haochen and Xu, Huazhe and Clarke, Samuel and Li, Yunzhu and Wu, Jiajun}, + journal={arXiv preprint arXiv:2306.14447}, + year={2023} + }""").lstrip(), + }, + "taco_play": { + "tasks_col": "language_instruction", + "license": "cc-by-4.0", + "url": "https://www.kaggle.com/datasets/oiermees/taco-robot", + "paper": "https://arxiv.org/abs/2209.08959, https://arxiv.org/abs/2210.01911", + "citation_bibtex": dedent(r""" + @inproceedings{rosete2022tacorl, + author = {Erick Rosete-Beas and Oier Mees and Gabriel Kalweit and Joschka Boedecker and Wolfram Burgard}, + title = {Latent Plans for Task Agnostic Offline Reinforcement Learning}, + journal = {Proceedings of the 6th Conference on Robot Learning (CoRL)}, + year = {2022} + } + @inproceedings{mees23hulc2, + title={Grounding Language with Visual Affordances over Unstructured Data}, + author={Oier Mees and Jessica Borja-Diaz and 
Wolfram Burgard}, + booktitle = {Proceedings of the IEEE International Conference on Robotics and Automation (ICRA)}, + year={2023}, + address = {London, UK} + }""").lstrip(), + }, + "tokyo_u_lsmo": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "URL", + "paper": "https://arxiv.org/abs/2107.05842", + "citation_bibtex": dedent(r""" + @Article{Osa22, + author = {Takayuki Osa}, + journal = {The International Journal of Robotics Research}, + title = {Motion Planning by Learning the Solution Manifold in Trajectory Optimization}, + year = {2022}, + number = {3}, + pages = {291--311}, + volume = {41}, + }""").lstrip(), + }, + "toto": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://toto-benchmark.org/", + "paper": "https://arxiv.org/abs/2306.00942", + "citation_bibtex": dedent(r""" + @inproceedings{zhou2023train, + author={Zhou, Gaoyue and Dean, Victoria and Srirama, Mohan Kumar and Rajeswaran, Aravind and Pari, Jyothish and Hatch, Kyle and Jain, Aryan and Yu, Tianhe and Abbeel, Pieter and Pinto, Lerrel and Finn, Chelsea and Gupta, Abhinav}, + booktitle={2023 IEEE International Conference on Robotics and Automation (ICRA)}, + title={Train Offline, Test Online: A Real Robot Learning Benchmark}, + year={2023}, + }""").lstrip(), + }, + "ucsd_kitchen_dataset": { + "tasks_col": "language_instruction", + "license": "mit", + "citation_bibtex": dedent(r""" + @ARTICLE{ucsd_kitchens, + author = {Ge Yan, Kris Wu, and Xiaolong Wang}, + title = {{ucsd kitchens Dataset}}, + year = {2023}, + month = {August} + }""").lstrip(), + }, + "ucsd_pick_and_place_dataset": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://owmcorl.github.io/#", + "paper": "https://arxiv.org/abs/2310.16029", + "citation_bibtex": dedent(r""" + @preprint{Feng2023Finetuning, + title={Finetuning Offline World Models in the Real World}, + author={Yunhai Feng, Nicklas Hansen, Ziyan Xiong, Chandramouli Rajagopalan, Xiaolong Wang}, + year={2023} + }""").lstrip(), + }, + "uiuc_d3field": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://robopil.github.io/d3fields/", + "paper": "https://arxiv.org/abs/2309.16118", + "citation_bibtex": dedent(r""" + @article{wang2023d3field, + title={D^3Field: Dynamic 3D Descriptor Fields for Generalizable Robotic Manipulation}, + author={Wang, Yixuan and Li, Zhuoran and Zhang, Mingtong and Driggs-Campbell, Katherine and Wu, Jiajun and Fei-Fei, Li and Li, Yunzhu}, + journal={arXiv preprint arXiv:}, + year={2023}, + }""").lstrip(), + }, + "usc_cloth_sim": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://uscresl.github.io/dmfd/", + "paper": "https://arxiv.org/abs/2207.10148", + "citation_bibtex": dedent(r""" + @article{salhotra2022dmfd, + author={Salhotra, Gautam and Liu, I-Chun Arthur and Dominguez-Kuhne, Marcus and Sukhatme, Gaurav S.}, + journal={IEEE Robotics and Automation Letters}, + title={Learning Deformable Object Manipulation From Expert Demonstrations}, + year={2022}, + volume={7}, + number={4}, + pages={8775-8782}, + doi={10.1109/LRA.2022.3187843} + }""").lstrip(), + }, + "utaustin_mutex": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://ut-austin-rpl.github.io/MUTEX/", + "paper": "https://arxiv.org/abs/2309.14320", + "citation_bibtex": dedent(r""" + @inproceedings{shah2023mutex, + title={{MUTEX}: Learning Unified Policies from Multimodal Task Specifications}, + author={Rutav Shah and Roberto Mart{\'\i}n-Mart{\'\i}n and Yuke Zhu}, + 
booktitle={7th Annual Conference on Robot Learning}, + year={2023}, + url={https://openreview.net/forum?id=PwqiqaaEzJ} + }""").lstrip(), + }, + "utokyo_pr2_opening_fridge": { + "tasks_col": "language_instruction", + "license": "mit", + "citation_bibtex": dedent(r""" + @misc{oh2023pr2utokyodatasets, + author={Jihoon Oh and Naoaki Kanazawa and Kento Kawaharazuka}, + title={X-Embodiment U-Tokyo PR2 Datasets}, + year={2023}, + url={https://github.com/ojh6404/rlds_dataset_builder}, + }""").lstrip(), + }, + "utokyo_pr2_tabletop_manipulation": { + "tasks_col": "language_instruction", + "license": "mit", + "citation_bibtex": dedent(r""" + @misc{oh2023pr2utokyodatasets, + author={Jihoon Oh and Naoaki Kanazawa and Kento Kawaharazuka}, + title={X-Embodiment U-Tokyo PR2 Datasets}, + year={2023}, + url={https://github.com/ojh6404/rlds_dataset_builder}, + }""").lstrip(), + }, + "utokyo_saytap": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://saytap.github.io/", + "paper": "https://arxiv.org/abs/2306.07580", + "citation_bibtex": dedent(r""" + @article{saytap2023, + author = {Yujin Tang and Wenhao Yu and Jie Tan and Heiga Zen and Aleksandra Faust and + Tatsuya Harada}, + title = {SayTap: Language to Quadrupedal Locomotion}, + eprint = {arXiv:2306.07580}, + url = {https://saytap.github.io}, + note = {https://saytap.github.io}, + year = {2023} + }""").lstrip(), + }, + "utokyo_xarm_bimanual": { + "tasks_col": "language_instruction", + "license": "cc-by-4.0", + "citation_bibtex": dedent(r""" + @misc{matsushima2023weblab, + title={Weblab xArm Dataset}, + author={Tatsuya Matsushima and Hiroki Furuta and Yusuke Iwasawa and Yutaka Matsuo}, + year={2023}, + }""").lstrip(), + }, + "utokyo_xarm_pick_and_place": { + "tasks_col": "language_instruction", + "license": "cc-by-4.0", + "citation_bibtex": dedent(r""" + @misc{matsushima2023weblab, + title={Weblab xArm Dataset}, + author={Tatsuya Matsushima and Hiroki Furuta and Yusuke Iwasawa and Yutaka Matsuo}, + year={2023}, + }""").lstrip(), + }, + "viola": { + "tasks_col": "language_instruction", + "license": "mit", + "url": "https://ut-austin-rpl.github.io/VIOLA/", + "paper": "https://arxiv.org/abs/2210.11339", + "citation_bibtex": dedent(r""" + @article{zhu2022viola, + title={VIOLA: Imitation Learning for Vision-Based Manipulation with Object Proposal Priors}, + author={Zhu, Yifeng and Joshi, Abhishek and Stone, Peter and Zhu, Yuke}, + journal={6th Annual Conference on Robot Learning (CoRL)}, + year={2022} + }""").lstrip(), + }, +} + + +def batch_convert(): + status = {} + logfile = LOCAL_DIR / "conversion_log.txt" + assert set(DATASETS) == {id_.split("/")[1] for id_ in available_datasets} + for num, (name, kwargs) in enumerate(DATASETS.items()): + repo_id = f"lerobot/{name}" + print(f"\nConverting {repo_id} ({num}/{len(DATASETS)})") + print("---------------------------------------------------------") + try: + convert_dataset(repo_id, LOCAL_DIR, **kwargs) + status = f"{repo_id}: success." 
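+            # Append each dataset's outcome to the log as we go, so a long batch run can still be
+            # inspected if a later conversion fails (failures are logged in the `except` branch below).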
+            with open(logfile, "a") as file:
+                file.write(status + "\n")
+        except Exception:
+            status = f"{repo_id}: failed\n    {traceback.format_exc()}"
+            with open(logfile, "a") as file:
+                file.write(status + "\n")
+            continue
+
+
+if __name__ == "__main__":
+    batch_convert()
diff --git a/lerobot/common/datasets/v2/convert_dataset_v1_to_v2.py b/lerobot/common/datasets/v2/convert_dataset_v1_to_v2.py
new file mode 100644
index 000000000..bf135043b
--- /dev/null
+++ b/lerobot/common/datasets/v2/convert_dataset_v1_to_v2.py
@@ -0,0 +1,665 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script will help you convert any LeRobot dataset already pushed to the hub from codebase version 1.6 to
+2.0. You will be required to provide the 'tasks', which is a short but accurate description in plain English
+for each of the tasks performed in the dataset. This will allow you to easily train models with task-conditioning.
+
+We support 3 different scenarios for these tasks (see instructions below):
+    1. Single task dataset: all episodes of your dataset have the same single task.
+    2. Single task episodes: the episodes of your dataset each contain a single task but they can differ from
+       one episode to the next.
+    3. Multi task episodes: episodes of your dataset may each contain several different tasks.
+
+
+You can also provide a robot config .yaml file (not mandatory) to this script via the option
+'--robot-config' so that it writes information about the robot (robot type, motor names) this dataset was
+recorded with. For now, only Aloha/Koch type robots are supported with this option.
+
+
+# 1. Single task dataset
+If your dataset contains a single task, you can simply provide it directly via the CLI with the
+'--single-task' option.
+
+Examples:
+
+```bash
+python lerobot/common/datasets/v2/convert_dataset_v1_to_v2.py \
+    --repo-id lerobot/aloha_sim_insertion_human_image \
+    --single-task "Insert the peg into the socket." \
+    --robot-config lerobot/configs/robot/aloha.yaml \
+    --local-dir data
+```
+
+```bash
+python lerobot/common/datasets/v2/convert_dataset_v1_to_v2.py \
+    --repo-id aliberts/koch_tutorial \
+    --single-task "Pick the Lego block and drop it in the box on the right." \
+    --robot-config lerobot/configs/robot/koch.yaml \
+    --local-dir data
+```
+
+
+# 2. Single task episodes
+If your dataset is a multi-task dataset, you have two options to provide the tasks to this script:
+
+- If your dataset already contains a language instruction column in its parquet file, you can simply provide
+  this column's name with the '--tasks-col' arg.
+
+    Example:
+
+    ```bash
+    python lerobot/common/datasets/v2/convert_dataset_v1_to_v2.py \
+        --repo-id lerobot/stanford_kuka_multimodal_dataset \
+        --tasks-col "language_instruction" \
+        --local-dir data
+    ```
+
+- If your dataset doesn't contain a language instruction, you should provide the path to a .json file with the
+  '--tasks-path' arg.
This file should have the following structure where keys correspond to each + episode_index in the dataset, and values are the language instruction for that episode. + + Example: + + ```json + { + "0": "Do something", + "1": "Do something else", + "2": "Do something", + "3": "Go there", + ... + } + ``` + +# 3. Multi task episodes +If you have multiple tasks per episodes, your dataset should contain a language instruction column in its +parquet file, and you must provide this column's name with the '--tasks-col' arg. + +Example: + +```bash +python lerobot/common/datasets/v2/convert_dataset_v1_to_v2.py \ + --repo-id lerobot/stanford_kuka_multimodal_dataset \ + --tasks-col "language_instruction" \ + --local-dir data +``` +""" + +import argparse +import contextlib +import filecmp +import json +import logging +import math +import shutil +import subprocess +import tempfile +from pathlib import Path + +import datasets +import pyarrow.compute as pc +import pyarrow.parquet as pq +import torch +from datasets import Dataset +from huggingface_hub import HfApi +from huggingface_hub.errors import EntryNotFoundError, HfHubHTTPError +from safetensors.torch import load_file + +from lerobot.common.datasets.utils import ( + DEFAULT_CHUNK_SIZE, + DEFAULT_PARQUET_PATH, + DEFAULT_VIDEO_PATH, + EPISODES_PATH, + INFO_PATH, + STATS_PATH, + TASKS_PATH, + create_branch, + create_lerobot_dataset_card, + flatten_dict, + get_hub_safe_version, + load_json, + unflatten_dict, + write_json, + write_jsonlines, +) +from lerobot.common.datasets.video_utils import ( + VideoFrame, # noqa: F401 + get_image_pixel_channels, + get_video_info, +) +from lerobot.common.utils.utils import init_hydra_config + +V16 = "v1.6" +V20 = "v2.0" + +GITATTRIBUTES_REF = "aliberts/gitattributes_reference" +V1_VIDEO_FILE = "{video_key}_episode_{episode_index:06d}.mp4" +V1_INFO_PATH = "meta_data/info.json" +V1_STATS_PATH = "meta_data/stats.safetensors" + + +def parse_robot_config(config_path: Path, config_overrides: list[str] | None = None) -> tuple[str, dict]: + robot_cfg = init_hydra_config(config_path, config_overrides) + if robot_cfg["robot_type"] in ["aloha", "koch"]: + state_names = [ + f"{arm}_{motor}" if len(robot_cfg["follower_arms"]) > 1 else motor + for arm in robot_cfg["follower_arms"] + for motor in robot_cfg["follower_arms"][arm]["motors"] + ] + action_names = [ + # f"{arm}_{motor}" for arm in ["left", "right"] for motor in robot_cfg["leader_arms"][arm]["motors"] + f"{arm}_{motor}" if len(robot_cfg["leader_arms"]) > 1 else motor + for arm in robot_cfg["leader_arms"] + for motor in robot_cfg["leader_arms"][arm]["motors"] + ] + # elif robot_cfg["robot_type"] == "stretch3": TODO + else: + raise NotImplementedError( + "Please provide robot_config={'robot_type': ..., 'names': ...} directly to convert_dataset()." 
+ ) + + return { + "robot_type": robot_cfg["robot_type"], + "names": { + "observation.state": state_names, + "observation.effort": state_names, + "action": action_names, + }, + } + + +def convert_stats_to_json(v1_dir: Path, v2_dir: Path) -> None: + safetensor_path = v1_dir / V1_STATS_PATH + stats = load_file(safetensor_path) + serialized_stats = {key: value.tolist() for key, value in stats.items()} + serialized_stats = unflatten_dict(serialized_stats) + + json_path = v2_dir / STATS_PATH + json_path.parent.mkdir(exist_ok=True, parents=True) + with open(json_path, "w") as f: + json.dump(serialized_stats, f, indent=4) + + # Sanity check + with open(json_path) as f: + stats_json = json.load(f) + + stats_json = flatten_dict(stats_json) + stats_json = {key: torch.tensor(value) for key, value in stats_json.items()} + for key in stats: + torch.testing.assert_close(stats_json[key], stats[key]) + + +def get_features_from_hf_dataset(dataset: Dataset, robot_config: dict | None = None) -> dict[str, list]: + features = {} + for key, ft in dataset.features.items(): + if isinstance(ft, datasets.Value): + dtype = ft.dtype + shape = (1,) + names = None + if isinstance(ft, datasets.Sequence): + assert isinstance(ft.feature, datasets.Value) + dtype = ft.feature.dtype + shape = (ft.length,) + motor_names = ( + robot_config["names"][key] if robot_config else [f"motor_{i}" for i in range(ft.length)] + ) + assert len(motor_names) == shape[0] + names = {"motors": motor_names} + elif isinstance(ft, datasets.Image): + dtype = "image" + image = dataset[0][key] # Assuming first row + channels = get_image_pixel_channels(image) + shape = (image.height, image.width, channels) + names = ["height", "width", "channel"] + elif ft._type == "VideoFrame": + dtype = "video" + shape = None # Add shape later + names = ["height", "width", "channel"] + + features[key] = { + "dtype": dtype, + "shape": shape, + "names": names, + } + + return features + + +def add_task_index_by_episodes(dataset: Dataset, tasks_by_episodes: dict) -> tuple[Dataset, list[str]]: + df = dataset.to_pandas() + tasks = list(set(tasks_by_episodes.values())) + tasks_to_task_index = {task: task_idx for task_idx, task in enumerate(tasks)} + episodes_to_task_index = {ep_idx: tasks_to_task_index[task] for ep_idx, task in tasks_by_episodes.items()} + df["task_index"] = df["episode_index"].map(episodes_to_task_index).astype(int) + + features = dataset.features + features["task_index"] = datasets.Value(dtype="int64") + dataset = Dataset.from_pandas(df, features=features, split="train") + return dataset, tasks + + +def add_task_index_from_tasks_col( + dataset: Dataset, tasks_col: str +) -> tuple[Dataset, dict[str, list[str]], list[str]]: + df = dataset.to_pandas() + + # HACK: This is to clean some of the instructions in our version of Open X datasets + prefix_to_clean = "tf.Tensor(b'" + suffix_to_clean = "', shape=(), dtype=string)" + df[tasks_col] = df[tasks_col].str.removeprefix(prefix_to_clean).str.removesuffix(suffix_to_clean) + + # Create task_index col + tasks_by_episode = df.groupby("episode_index")[tasks_col].unique().apply(lambda x: x.tolist()).to_dict() + tasks = df[tasks_col].unique().tolist() + tasks_to_task_index = {task: idx for idx, task in enumerate(tasks)} + df["task_index"] = df[tasks_col].map(tasks_to_task_index).astype(int) + + # Build the dataset back from df + features = dataset.features + features["task_index"] = datasets.Value(dtype="int64") + dataset = Dataset.from_pandas(df, features=features, split="train") + dataset = 
dataset.remove_columns(tasks_col) + + return dataset, tasks, tasks_by_episode + + +def split_parquet_by_episodes( + dataset: Dataset, + total_episodes: int, + total_chunks: int, + output_dir: Path, +) -> list: + table = dataset.data.table + episode_lengths = [] + for ep_chunk in range(total_chunks): + ep_chunk_start = DEFAULT_CHUNK_SIZE * ep_chunk + ep_chunk_end = min(DEFAULT_CHUNK_SIZE * (ep_chunk + 1), total_episodes) + chunk_dir = "/".join(DEFAULT_PARQUET_PATH.split("/")[:-1]).format(episode_chunk=ep_chunk) + (output_dir / chunk_dir).mkdir(parents=True, exist_ok=True) + for ep_idx in range(ep_chunk_start, ep_chunk_end): + ep_table = table.filter(pc.equal(table["episode_index"], ep_idx)) + episode_lengths.insert(ep_idx, len(ep_table)) + output_file = output_dir / DEFAULT_PARQUET_PATH.format( + episode_chunk=ep_chunk, episode_index=ep_idx + ) + pq.write_table(ep_table, output_file) + + return episode_lengths + + +def move_videos( + repo_id: str, + video_keys: list[str], + total_episodes: int, + total_chunks: int, + work_dir: Path, + clean_gittatributes: Path, + branch: str = "main", +) -> None: + """ + HACK: Since HfApi() doesn't provide a way to move files directly in a repo, this function will run git + commands to fetch git lfs video files references to move them into subdirectories without having to + actually download them. + """ + _lfs_clone(repo_id, work_dir, branch) + + videos_moved = False + video_files = [str(f.relative_to(work_dir)) for f in work_dir.glob("videos*/*.mp4")] + if len(video_files) == 0: + video_files = [str(f.relative_to(work_dir)) for f in work_dir.glob("videos*/*/*/*.mp4")] + videos_moved = True # Videos have already been moved + + assert len(video_files) == total_episodes * len(video_keys) + + lfs_untracked_videos = _get_lfs_untracked_videos(work_dir, video_files) + + current_gittatributes = work_dir / ".gitattributes" + if not filecmp.cmp(current_gittatributes, clean_gittatributes, shallow=False): + fix_gitattributes(work_dir, current_gittatributes, clean_gittatributes) + + if lfs_untracked_videos: + fix_lfs_video_files_tracking(work_dir, video_files) + + if videos_moved: + return + + video_dirs = sorted(work_dir.glob("videos*/")) + for ep_chunk in range(total_chunks): + ep_chunk_start = DEFAULT_CHUNK_SIZE * ep_chunk + ep_chunk_end = min(DEFAULT_CHUNK_SIZE * (ep_chunk + 1), total_episodes) + for vid_key in video_keys: + chunk_dir = "/".join(DEFAULT_VIDEO_PATH.split("/")[:-1]).format( + episode_chunk=ep_chunk, video_key=vid_key + ) + (work_dir / chunk_dir).mkdir(parents=True, exist_ok=True) + + for ep_idx in range(ep_chunk_start, ep_chunk_end): + target_path = DEFAULT_VIDEO_PATH.format( + episode_chunk=ep_chunk, video_key=vid_key, episode_index=ep_idx + ) + video_file = V1_VIDEO_FILE.format(video_key=vid_key, episode_index=ep_idx) + if len(video_dirs) == 1: + video_path = video_dirs[0] / video_file + else: + for dir in video_dirs: + if (dir / video_file).is_file(): + video_path = dir / video_file + break + + video_path.rename(work_dir / target_path) + + commit_message = "Move video files into chunk subdirectories" + subprocess.run(["git", "add", "."], cwd=work_dir, check=True) + subprocess.run(["git", "commit", "-m", commit_message], cwd=work_dir, check=True) + subprocess.run(["git", "push"], cwd=work_dir, check=True) + + +def fix_lfs_video_files_tracking(work_dir: Path, lfs_untracked_videos: list[str]) -> None: + """ + HACK: This function fixes the tracking by git lfs which was not properly set on some repos. 
In that case, + there's no other option than to download the actual files and reupload them with lfs tracking. + """ + for i in range(0, len(lfs_untracked_videos), 100): + files = lfs_untracked_videos[i : i + 100] + try: + subprocess.run(["git", "rm", "--cached", *files], cwd=work_dir, capture_output=True, check=True) + except subprocess.CalledProcessError as e: + print("git rm --cached ERROR:") + print(e.stderr) + subprocess.run(["git", "add", *files], cwd=work_dir, check=True) + + commit_message = "Track video files with git lfs" + subprocess.run(["git", "commit", "-m", commit_message], cwd=work_dir, check=True) + subprocess.run(["git", "push"], cwd=work_dir, check=True) + + +def fix_gitattributes(work_dir: Path, current_gittatributes: Path, clean_gittatributes: Path) -> None: + shutil.copyfile(clean_gittatributes, current_gittatributes) + subprocess.run(["git", "add", ".gitattributes"], cwd=work_dir, check=True) + subprocess.run(["git", "commit", "-m", "Fix .gitattributes"], cwd=work_dir, check=True) + subprocess.run(["git", "push"], cwd=work_dir, check=True) + + +def _lfs_clone(repo_id: str, work_dir: Path, branch: str) -> None: + subprocess.run(["git", "lfs", "install"], cwd=work_dir, check=True) + repo_url = f"https://huggingface.co/datasets/{repo_id}" + env = {"GIT_LFS_SKIP_SMUDGE": "1"} # Prevent downloading LFS files + subprocess.run( + ["git", "clone", "--branch", branch, "--single-branch", "--depth", "1", repo_url, str(work_dir)], + check=True, + env=env, + ) + + +def _get_lfs_untracked_videos(work_dir: Path, video_files: list[str]) -> list[str]: + lfs_tracked_files = subprocess.run( + ["git", "lfs", "ls-files", "-n"], cwd=work_dir, capture_output=True, text=True, check=True + ) + lfs_tracked_files = set(lfs_tracked_files.stdout.splitlines()) + return [f for f in video_files if f not in lfs_tracked_files] + + +def get_videos_info(repo_id: str, local_dir: Path, video_keys: list[str], branch: str) -> dict: + # Assumes first episode + video_files = [ + DEFAULT_VIDEO_PATH.format(episode_chunk=0, video_key=vid_key, episode_index=0) + for vid_key in video_keys + ] + hub_api = HfApi() + hub_api.snapshot_download( + repo_id=repo_id, repo_type="dataset", local_dir=local_dir, revision=branch, allow_patterns=video_files + ) + videos_info_dict = {} + for vid_key, vid_path in zip(video_keys, video_files, strict=True): + videos_info_dict[vid_key] = get_video_info(local_dir / vid_path) + + return videos_info_dict + + +def convert_dataset( + repo_id: str, + local_dir: Path, + single_task: str | None = None, + tasks_path: Path | None = None, + tasks_col: Path | None = None, + robot_config: dict | None = None, + test_branch: str | None = None, + **card_kwargs, +): + v1 = get_hub_safe_version(repo_id, V16) + v1x_dir = local_dir / V16 / repo_id + v20_dir = local_dir / V20 / repo_id + v1x_dir.mkdir(parents=True, exist_ok=True) + v20_dir.mkdir(parents=True, exist_ok=True) + + hub_api = HfApi() + hub_api.snapshot_download( + repo_id=repo_id, repo_type="dataset", revision=v1, local_dir=v1x_dir, ignore_patterns="videos*/" + ) + branch = "main" + if test_branch: + branch = test_branch + create_branch(repo_id=repo_id, branch=test_branch, repo_type="dataset") + + metadata_v1 = load_json(v1x_dir / V1_INFO_PATH) + dataset = datasets.load_dataset("parquet", data_dir=v1x_dir / "data", split="train") + features = get_features_from_hf_dataset(dataset, robot_config) + video_keys = [key for key, ft in features.items() if ft["dtype"] == "video"] + + if single_task and "language_instruction" in dataset.column_names: 
+ logging.warning( + "'single_task' provided but 'language_instruction' tasks_col found. Using 'language_instruction'.", + ) + single_task = None + tasks_col = "language_instruction" + + # Episodes & chunks + episode_indices = sorted(dataset.unique("episode_index")) + total_episodes = len(episode_indices) + assert episode_indices == list(range(total_episodes)) + total_videos = total_episodes * len(video_keys) + total_chunks = total_episodes // DEFAULT_CHUNK_SIZE + if total_episodes % DEFAULT_CHUNK_SIZE != 0: + total_chunks += 1 + + # Tasks + if single_task: + tasks_by_episodes = {ep_idx: single_task for ep_idx in episode_indices} + dataset, tasks = add_task_index_by_episodes(dataset, tasks_by_episodes) + tasks_by_episodes = {ep_idx: [task] for ep_idx, task in tasks_by_episodes.items()} + elif tasks_path: + tasks_by_episodes = load_json(tasks_path) + tasks_by_episodes = {int(ep_idx): task for ep_idx, task in tasks_by_episodes.items()} + dataset, tasks = add_task_index_by_episodes(dataset, tasks_by_episodes) + tasks_by_episodes = {ep_idx: [task] for ep_idx, task in tasks_by_episodes.items()} + elif tasks_col: + dataset, tasks, tasks_by_episodes = add_task_index_from_tasks_col(dataset, tasks_col) + else: + raise ValueError + + assert set(tasks) == {task for ep_tasks in tasks_by_episodes.values() for task in ep_tasks} + tasks = [{"task_index": task_idx, "task": task} for task_idx, task in enumerate(tasks)] + write_jsonlines(tasks, v20_dir / TASKS_PATH) + features["task_index"] = { + "dtype": "int64", + "shape": (1,), + "names": None, + } + + # Videos + if video_keys: + assert metadata_v1.get("video", False) + dataset = dataset.remove_columns(video_keys) + clean_gitattr = Path( + hub_api.hf_hub_download( + repo_id=GITATTRIBUTES_REF, repo_type="dataset", local_dir=local_dir, filename=".gitattributes" + ) + ).absolute() + with tempfile.TemporaryDirectory() as tmp_video_dir: + move_videos( + repo_id, video_keys, total_episodes, total_chunks, Path(tmp_video_dir), clean_gitattr, branch + ) + videos_info = get_videos_info(repo_id, v1x_dir, video_keys=video_keys, branch=branch) + for key in video_keys: + features[key]["shape"] = ( + videos_info[key].pop("video.height"), + videos_info[key].pop("video.width"), + videos_info[key].pop("video.channels"), + ) + features[key]["video_info"] = videos_info[key] + assert math.isclose(videos_info[key]["video.fps"], metadata_v1["fps"], rel_tol=1e-3) + if "encoding" in metadata_v1: + assert videos_info[key]["video.pix_fmt"] == metadata_v1["encoding"]["pix_fmt"] + else: + assert metadata_v1.get("video", 0) == 0 + videos_info = None + + # Split data into 1 parquet file by episode + episode_lengths = split_parquet_by_episodes(dataset, total_episodes, total_chunks, v20_dir) + + if robot_config is not None: + robot_type = robot_config["robot_type"] + repo_tags = [robot_type] + else: + robot_type = "unknown" + repo_tags = None + + # Episodes + episodes = [ + {"episode_index": ep_idx, "tasks": tasks_by_episodes[ep_idx], "length": episode_lengths[ep_idx]} + for ep_idx in episode_indices + ] + write_jsonlines(episodes, v20_dir / EPISODES_PATH) + + # Assemble metadata v2.0 + metadata_v2_0 = { + "codebase_version": V20, + "robot_type": robot_type, + "total_episodes": total_episodes, + "total_frames": len(dataset), + "total_tasks": len(tasks), + "total_videos": total_videos, + "total_chunks": total_chunks, + "chunks_size": DEFAULT_CHUNK_SIZE, + "fps": metadata_v1["fps"], + "splits": {"train": f"0:{total_episodes}"}, + "data_path": DEFAULT_PARQUET_PATH, + "video_path": 
DEFAULT_VIDEO_PATH if video_keys else None,
+        "features": features,
+    }
+    write_json(metadata_v2_0, v20_dir / INFO_PATH)
+    convert_stats_to_json(v1x_dir, v20_dir)
+    card = create_lerobot_dataset_card(tags=repo_tags, dataset_info=metadata_v2_0, **card_kwargs)
+
+    with contextlib.suppress(EntryNotFoundError, HfHubHTTPError):
+        hub_api.delete_folder(repo_id=repo_id, path_in_repo="data", repo_type="dataset", revision=branch)
+
+    with contextlib.suppress(EntryNotFoundError, HfHubHTTPError):
+        hub_api.delete_folder(repo_id=repo_id, path_in_repo="meta_data", repo_type="dataset", revision=branch)
+
+    with contextlib.suppress(EntryNotFoundError, HfHubHTTPError):
+        hub_api.delete_folder(repo_id=repo_id, path_in_repo="meta", repo_type="dataset", revision=branch)
+
+    hub_api.upload_folder(
+        repo_id=repo_id,
+        path_in_repo="data",
+        folder_path=v20_dir / "data",
+        repo_type="dataset",
+        revision=branch,
+    )
+    hub_api.upload_folder(
+        repo_id=repo_id,
+        path_in_repo="meta",
+        folder_path=v20_dir / "meta",
+        repo_type="dataset",
+        revision=branch,
+    )
+
+    card.push_to_hub(repo_id=repo_id, repo_type="dataset", revision=branch)
+
+    if not test_branch:
+        create_branch(repo_id=repo_id, branch=V20, repo_type="dataset")
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    task_args = parser.add_mutually_exclusive_group(required=True)
+
+    parser.add_argument(
+        "--repo-id",
+        type=str,
+        required=True,
+        help="Repository identifier on Hugging Face: a community or a user name `/` the name of the dataset (e.g. `lerobot/pusht`, `cadene/aloha_sim_insertion_human`).",
+    )
+    task_args.add_argument(
+        "--single-task",
+        type=str,
+        help="A short but accurate description of the single task performed in the dataset.",
+    )
+    task_args.add_argument(
+        "--tasks-col",
+        type=str,
+        help="The name of the column containing language instructions.",
+    )
+    task_args.add_argument(
+        "--tasks-path",
+        type=Path,
+        help="The path to a .json file containing one language instruction for each episode_index.",
+    )
+    parser.add_argument(
+        "--robot-config",
+        type=Path,
+        default=None,
+        help="Path to the yaml config of the robot used to record the dataset (used to write robot type and motor names during conversion).",
+    )
+    parser.add_argument(
+        "--robot-overrides",
+        type=str,
+        nargs="*",
+        help="Any key=value arguments to override the robot config values (use dots for nested overrides).",
+    )
+    parser.add_argument(
+        "--local-dir",
+        type=Path,
+        default=None,
+        help="Local directory to store the dataset during conversion. Defaults to /tmp/lerobot_dataset_v2",
+    )
+    parser.add_argument(
+        "--license",
+        type=str,
+        default="apache-2.0",
+        help="Repo license. Must be one of https://huggingface.co/docs/hub/repositories-licenses. Defaults to apache-2.0.",
+    )
+    parser.add_argument(
+        "--test-branch",
+        type=str,
+        default=None,
+        help="Repo branch to test your conversion first (e.g. 'v2.0.test')",
+    )
+
+    args = parser.parse_args()
+    if not args.local_dir:
+        args.local_dir = Path("/tmp/lerobot_dataset_v2")
+
+    robot_config = parse_robot_config(args.robot_config, args.robot_overrides) if args.robot_config else None
+    del args.robot_config, args.robot_overrides
+
+    convert_dataset(**vars(args), robot_config=robot_config)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lerobot/common/datasets/video_utils.py b/lerobot/common/datasets/video_utils.py
index 4d4ac6b0a..8ed3318dd 100644
--- a/lerobot/common/datasets/video_utils.py
+++ b/lerobot/common/datasets/video_utils.py
@@ -13,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # limitations under the License. +import json import logging import subprocess import warnings @@ -25,47 +26,11 @@ import torch import torchvision from datasets.features.features import register_feature - - -def load_from_videos( - item: dict[str, torch.Tensor], - video_frame_keys: list[str], - videos_dir: Path, - tolerance_s: float, - backend: str = "pyav", -): - """Note: When using data workers (e.g. DataLoader with num_workers>0), do not call this function - in the main process (e.g. by using a second Dataloader with num_workers=0). It will result in a Segmentation Fault. - This probably happens because a memory reference to the video loader is created in the main process and a - subprocess fails to access it. - """ - # since video path already contains "videos" (e.g. videos_dir="data/videos", path="videos/episode_0.mp4") - data_dir = videos_dir.parent - - for key in video_frame_keys: - if isinstance(item[key], list): - # load multiple frames at once (expected when delta_timestamps is not None) - timestamps = [frame["timestamp"] for frame in item[key]] - paths = [frame["path"] for frame in item[key]] - if len(set(paths)) > 1: - raise NotImplementedError("All video paths are expected to be the same for now.") - video_path = data_dir / paths[0] - - frames = decode_video_frames_torchvision(video_path, timestamps, tolerance_s, backend) - item[key] = frames - else: - # load one frame - timestamps = [item[key]["timestamp"]] - video_path = data_dir / item[key]["path"] - - frames = decode_video_frames_torchvision(video_path, timestamps, tolerance_s, backend) - item[key] = frames[0] - - return item +from PIL import Image def decode_video_frames_torchvision( - video_path: str, + video_path: Path | str, timestamps: list[float], tolerance_s: float, backend: str = "pyav", @@ -163,8 +128,8 @@ def decode_video_frames_torchvision( def encode_video_frames( - imgs_dir: Path, - video_path: Path, + imgs_dir: Path | str, + video_path: Path | str, fps: int, vcodec: str = "libsvtav1", pix_fmt: str = "yuv420p", @@ -247,3 +212,104 @@ def __call__(self): ) # to make VideoFrame available in HuggingFace `datasets` register_feature(VideoFrame, "VideoFrame") + + +def get_audio_info(video_path: Path | str) -> dict: + ffprobe_audio_cmd = [ + "ffprobe", + "-v", + "error", + "-select_streams", + "a:0", + "-show_entries", + "stream=channels,codec_name,bit_rate,sample_rate,bit_depth,channel_layout,duration", + "-of", + "json", + str(video_path), + ] + result = subprocess.run(ffprobe_audio_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if result.returncode != 0: + raise RuntimeError(f"Error running ffprobe: {result.stderr}") + + info = json.loads(result.stdout) + audio_stream_info = info["streams"][0] if info.get("streams") else None + if audio_stream_info is None: + return {"has_audio": False} + + # Return the information, defaulting to None if no audio stream is present + return { + "has_audio": True, + "audio.channels": audio_stream_info.get("channels", None), + "audio.codec": audio_stream_info.get("codec_name", None), + "audio.bit_rate": int(audio_stream_info["bit_rate"]) if audio_stream_info.get("bit_rate") else None, + "audio.sample_rate": int(audio_stream_info["sample_rate"]) + if audio_stream_info.get("sample_rate") + else None, + "audio.bit_depth": audio_stream_info.get("bit_depth", None), + "audio.channel_layout": audio_stream_info.get("channel_layout", None), + } + + +def get_video_info(video_path: Path | str) -> dict: + 
ffprobe_video_cmd = [ + "ffprobe", + "-v", + "error", + "-select_streams", + "v:0", + "-show_entries", + "stream=r_frame_rate,width,height,codec_name,nb_frames,duration,pix_fmt", + "-of", + "json", + str(video_path), + ] + result = subprocess.run(ffprobe_video_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if result.returncode != 0: + raise RuntimeError(f"Error running ffprobe: {result.stderr}") + + info = json.loads(result.stdout) + video_stream_info = info["streams"][0] + + # Calculate fps from r_frame_rate + r_frame_rate = video_stream_info["r_frame_rate"] + num, denom = map(int, r_frame_rate.split("/")) + fps = num / denom + + pixel_channels = get_video_pixel_channels(video_stream_info["pix_fmt"]) + + video_info = { + "video.fps": fps, + "video.height": video_stream_info["height"], + "video.width": video_stream_info["width"], + "video.channels": pixel_channels, + "video.codec": video_stream_info["codec_name"], + "video.pix_fmt": video_stream_info["pix_fmt"], + "video.is_depth_map": False, + **get_audio_info(video_path), + } + + return video_info + + +def get_video_pixel_channels(pix_fmt: str) -> int: + if "gray" in pix_fmt or "depth" in pix_fmt or "monochrome" in pix_fmt: + return 1 + elif "rgba" in pix_fmt or "yuva" in pix_fmt: + return 4 + elif "rgb" in pix_fmt or "yuv" in pix_fmt: + return 3 + else: + raise ValueError("Unknown format") + + +def get_image_pixel_channels(image: Image): + if image.mode == "L": + return 1 # Grayscale + elif image.mode == "LA": + return 2 # Grayscale + Alpha + elif image.mode == "RGB": + return 3 # RGB + elif image.mode == "RGBA": + return 4 # RGBA + else: + raise ValueError("Unknown format") diff --git a/lerobot/common/robot_devices/cameras/intelrealsense.py b/lerobot/common/robot_devices/cameras/intelrealsense.py index 479694277..84ac540f2 100644 --- a/lerobot/common/robot_devices/cameras/intelrealsense.py +++ b/lerobot/common/robot_devices/cameras/intelrealsense.py @@ -168,6 +168,7 @@ class IntelRealSenseCameraConfig: width: int | None = None height: int | None = None color_mode: str = "rgb" + channels: int | None = None use_depth: bool = False force_hardware_reset: bool = True rotation: int | None = None @@ -179,6 +180,8 @@ def __post_init__(self): f"`color_mode` is expected to be 'rgb' or 'bgr', but {self.color_mode} is provided." ) + self.channels = 3 + at_least_one_is_not_none = self.fps is not None or self.width is not None or self.height is not None at_least_one_is_none = self.fps is None or self.width is None or self.height is None if at_least_one_is_not_none and at_least_one_is_none: @@ -254,6 +257,7 @@ def __init__( self.fps = config.fps self.width = config.width self.height = config.height + self.channels = config.channels self.color_mode = config.color_mode self.use_depth = config.use_depth self.force_hardware_reset = config.force_hardware_reset diff --git a/lerobot/common/robot_devices/cameras/opencv.py b/lerobot/common/robot_devices/cameras/opencv.py index 11a050696..3b46c8f55 100644 --- a/lerobot/common/robot_devices/cameras/opencv.py +++ b/lerobot/common/robot_devices/cameras/opencv.py @@ -198,6 +198,7 @@ class OpenCVCameraConfig: width: int | None = None height: int | None = None color_mode: str = "rgb" + channels: int | None = None rotation: int | None = None mock: bool = False @@ -207,6 +208,8 @@ def __post_init__(self): f"`color_mode` is expected to be 'rgb' or 'bgr', but {self.color_mode} is provided." 
) + self.channels = 3 + if self.rotation not in [-90, None, 90, 180]: raise ValueError( f"`rotation` must be in [-90, None, 90, 180] (got {self.rotation})" @@ -285,6 +288,7 @@ def __init__( self.fps = config.fps self.width = config.width self.height = config.height + self.channels = config.channels self.color_mode = config.color_mode self.mock = config.mock diff --git a/lerobot/common/robot_devices/control_utils.py b/lerobot/common/robot_devices/control_utils.py index 4b074b4f2..33f69cc68 100644 --- a/lerobot/common/robot_devices/control_utils.py +++ b/lerobot/common/robot_devices/control_utils.py @@ -14,10 +14,12 @@ import cv2 import torch import tqdm +from deepdiff import DeepDiff from termcolor import colored +from lerobot.common.datasets.image_writer import safe_stop_image_writer from lerobot.common.datasets.lerobot_dataset import LeRobotDataset -from lerobot.common.datasets.populate_dataset import add_frame, safe_stop_image_writer +from lerobot.common.datasets.utils import get_features_from_robot from lerobot.common.policies.factory import make_policy from lerobot.common.robot_devices.robots.utils import Robot from lerobot.common.robot_devices.utils import busy_wait @@ -245,7 +247,7 @@ def control_loop( control_time_s: float | None = None, teleoperate: bool = False, display_cameras: bool = False, - dataset: Dict[str, Any] | None = None, + dataset: LeRobotDataset | None = None, events: Dict[str, Any] | None = None, policy=None, device=None, @@ -265,7 +267,7 @@ def control_loop( if teleoperate and policy is not None: raise ValueError("When `teleoperate` is True, `policy` should be None.") - if dataset is not None and fps is not None and dataset["fps"] != fps: + if dataset is not None and fps is not None and dataset.fps != fps: raise ValueError( f"The dataset fps should be equal to requested fps ({dataset['fps']} != {fps})." ) @@ -290,7 +292,8 @@ def control_loop( action = {"action": action} if dataset is not None: - add_frame(dataset, observation, action) + frame = {**observation, **action} + dataset.add_frame(frame) if display_cameras and not is_headless(): image_keys = [key for key in observation if "image" in key] @@ -360,3 +363,27 @@ def sanity_check_dataset_name(repo_id, policy): raise ValueError( f"Your dataset name does not begin with 'eval_' ({dataset_name}), but a policy is provided ({policy})." 
) + + +def sanity_check_dataset_robot_compatibility( + dataset: LeRobotDataset, robot: Robot, fps: int, use_videos: bool +) -> None: + fields = [ + ("robot_type", dataset.meta.robot_type, robot.robot_type), + ("fps", dataset.fps, fps), + ("features", dataset.features, get_features_from_robot(robot, use_videos)), + ] + + mismatches = [] + for field, dataset_value, present_value in fields: + diff = DeepDiff( + dataset_value, present_value, exclude_regex_paths=[r".*\['info'\]$"] + ) + if diff: + mismatches.append(f"{field}: expected {present_value}, got {dataset_value}") + + if mismatches: + raise ValueError( + "Dataset metadata compatibility check failed with mismatches:\n" + + "\n".join(mismatches) + ) diff --git a/lerobot/common/robot_devices/robots/manipulator.py b/lerobot/common/robot_devices/robots/manipulator.py index 9d63d09a5..a1725c2b8 100644 --- a/lerobot/common/robot_devices/robots/manipulator.py +++ b/lerobot/common/robot_devices/robots/manipulator.py @@ -245,6 +245,42 @@ def __init__( self.is_connected = False self.logs = {} + def get_motor_names(self, arm: dict[str, MotorsBus]) -> list: + return [f"{arm}_{motor}" for arm, bus in arm.items() for motor in bus.motors] + + @property + def camera_features(self) -> dict: + cam_ft = {} + for cam_key, cam in self.cameras.items(): + key = f"observation.images.{cam_key}" + cam_ft[key] = { + "shape": (cam.height, cam.width, cam.channels), + "names": ["height", "width", "channels"], + "info": None, + } + return cam_ft + + @property + def motor_features(self) -> dict: + action_names = self.get_motor_names(self.leader_arms) + state_names = self.get_motor_names(self.leader_arms) + return { + "action": { + "dtype": "float32", + "shape": (len(action_names),), + "names": action_names, + }, + "observation.state": { + "dtype": "float32", + "shape": (len(state_names),), + "names": state_names, + }, + } + + @property + def features(self): + return {**self.motor_features, **self.camera_features} + @property def has_camera(self): return len(self.cameras) > 0 diff --git a/lerobot/common/robot_devices/robots/utils.py b/lerobot/common/robot_devices/robots/utils.py index 1d2e0a5b4..5791adb6b 100644 --- a/lerobot/common/robot_devices/robots/utils.py +++ b/lerobot/common/robot_devices/robots/utils.py @@ -14,6 +14,7 @@ class Robot(Protocol): num_cameras: int is_connected: bool has_camera: bool + features: dict def connect(self): ... def run_calibration(self): ... 
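For context, the `features` property added above is what ties a robot to the new dataset API: in the recording flow below, `LeRobotDataset.create()` is given the robot so the dataset schema can be derived from these features, and `sanity_check_dataset_robot_compatibility()` compares them against an existing dataset's metadata when recording is resumed. The following is a minimal sketch of that DeepDiff comparison, not part of the patch, assuming a hypothetical one-camera robot with three motors (all key names, shapes, and motor names are illustrative placeholders):

```python
# Illustrative sketch of the DeepDiff-based compatibility check used in control_utils.py.
# All feature keys and values below are placeholders, not taken from a real robot config.
from deepdiff import DeepDiff

robot_features = {
    "action": {"dtype": "float32", "shape": (3,), "names": ["motor_0", "motor_1", "motor_2"]},
    "observation.state": {"dtype": "float32", "shape": (3,), "names": ["motor_0", "motor_1", "motor_2"]},
    "observation.images.cam": {
        "shape": (480, 640, 3),
        "names": ["height", "width", "channels"],
        "info": None,
    },
}

# Features read back from a hypothetical existing dataset that is being resumed.
dataset_features = {key: dict(value) for key, value in robot_features.items()}

# 'info' entries are excluded from the comparison, as in the patch above.
diff = DeepDiff(dataset_features, robot_features, exclude_regex_paths=[r".*\['info'\]$"])
if diff:
    raise ValueError(f"Dataset and robot features mismatch:\n{diff}")
```

If the robot's cameras or motors change between sessions, this check fails early instead of appending incompatible episodes to an existing dataset.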
diff --git a/lerobot/scripts/control_robot.py b/lerobot/scripts/control_robot.py index 031e608e7..563023f48 100644 --- a/lerobot/scripts/control_robot.py +++ b/lerobot/scripts/control_robot.py @@ -29,7 +29,6 @@ ```bash python lerobot/scripts/control_robot.py record \ --fps 30 \ - --root tmp/data \ --repo-id $USER/koch_test \ --num-episodes 1 \ --run-compute-stats 0 @@ -38,7 +37,6 @@ - Visualize dataset: ```bash python lerobot/scripts/visualize_dataset.py \ - --root tmp/data \ --repo-id $USER/koch_test \ --episode-index 0 ``` @@ -47,7 +45,6 @@ ```bash python lerobot/scripts/control_robot.py replay \ --fps 30 \ - --root tmp/data \ --repo-id $USER/koch_test \ --episode 0 ``` @@ -57,7 +54,6 @@ ```bash python lerobot/scripts/control_robot.py record \ --fps 30 \ - --root data \ --repo-id $USER/koch_pick_place_lego \ --num-episodes 50 \ --warmup-time-s 2 \ @@ -77,7 +73,7 @@ - Train on this dataset with the ACT policy: ```bash -DATA_DIR=data python lerobot/scripts/train.py \ +python lerobot/scripts/train.py \ policy=act_koch_real \ env=koch_real \ dataset_repo_id=$USER/koch_pick_place_lego \ @@ -88,7 +84,6 @@ ```bash python lerobot/scripts/control_robot.py record \ --fps 30 \ - --root data \ --repo-id $USER/eval_act_koch_real \ --num-episodes 10 \ --warmup-time-s 2 \ @@ -106,12 +101,6 @@ # from safetensors.torch import load_file, save_file from lerobot.common.datasets.lerobot_dataset import LeRobotDataset -from lerobot.common.datasets.populate_dataset import ( - create_lerobot_dataset, - delete_current_episode, - init_dataset, - save_current_episode, -) from lerobot.common.robot_devices.control_utils import ( control_loop, has_method, @@ -121,6 +110,7 @@ record_episode, reset_environment, sanity_check_dataset_name, + sanity_check_dataset_robot_compatibility, stop_recording, warmup_record, ) @@ -196,25 +186,28 @@ def teleoperate( @safe_disconnect def record( robot: Robot, - root: str, + root: Path, repo_id: str, + single_task: str, pretrained_policy_name_or_path: str | None = None, policy_overrides: List[str] | None = None, fps: int | None = None, - warmup_time_s=2, - episode_time_s=10, - reset_time_s=5, - num_episodes=50, - video=True, - run_compute_stats=True, - push_to_hub=True, - tags=None, - num_image_writer_processes=0, - num_image_writer_threads_per_camera=4, - force_override=False, - display_cameras=True, - play_sounds=True, -): + warmup_time_s: int | float = 2, + episode_time_s: int | float = 10, + reset_time_s: int | float = 5, + num_episodes: int = 50, + video: bool = True, + run_compute_stats: bool = True, + push_to_hub: bool = True, + tags: list[str] | None = None, + num_image_writer_processes: int = 0, + num_image_writer_threads_per_camera: int = 4, + display_cameras: bool = True, + play_sounds: bool = True, + resume: bool = False, + # TODO(rcadene, aliberts): remove local_files_only when refactor with dataset as argument + local_files_only: bool = False, +) -> LeRobotDataset: # TODO(rcadene): Add option to record logs listener = None events = None @@ -222,6 +215,11 @@ def record( device = None use_amp = None + if single_task: + task = single_task + else: + raise NotImplementedError("Only single-task recording is supported for now") + # Load pretrained policy if pretrained_policy_name_or_path is not None: policy, policy_fps, device, use_amp = init_policy(pretrained_policy_name_or_path, policy_overrides) @@ -234,18 +232,29 @@ def record( f"There is a mismatch between the provided fps ({fps}) and the one from policy config ({policy_fps})." 
) - # Create empty dataset or load existing saved episodes - sanity_check_dataset_name(repo_id, policy) - dataset = init_dataset( - repo_id, - root, - force_override, - fps, - video, - write_images=robot.has_camera, - num_image_writer_processes=num_image_writer_processes, - num_image_writer_threads=num_image_writer_threads_per_camera * robot.num_cameras, - ) + if resume: + dataset = LeRobotDataset( + repo_id, + root=root, + local_files_only=local_files_only, + ) + dataset.start_image_writer( + num_processes=num_image_writer_processes, + num_threads=num_image_writer_threads_per_camera * len(robot.cameras), + ) + sanity_check_dataset_robot_compatibility(dataset, robot, fps, video) + else: + # Create empty dataset or load existing saved episodes + sanity_check_dataset_name(repo_id, policy) + dataset = LeRobotDataset.create( + repo_id, + fps, + root=root, + robot=robot, + use_videos=video, + image_writer_processes=num_image_writer_processes, + image_writer_threads=num_image_writer_threads_per_camera * len(robot.cameras), + ) if not robot.is_connected: robot.connect() @@ -263,12 +272,17 @@ def record( if has_method(robot, "teleop_safety_stop"): robot.teleop_safety_stop() + recorded_episodes = 0 while True: - if dataset["num_episodes"] >= num_episodes: + if recorded_episodes >= num_episodes: break - episode_index = dataset["num_episodes"] - log_say(f"Recording episode {episode_index}", play_sounds) + # TODO(aliberts): add task prompt for multitask here. Might need to temporarily disable event if + # input() messes with them. + # if multi_task: + # task = input("Enter your task description: ") + + log_say(f"Recording episode {dataset.num_episodes}", play_sounds) record_episode( dataset=dataset, robot=robot, @@ -286,7 +300,7 @@ def record( # TODO(rcadene): add an option to enable teleoperation during reset # Skip reset for the last episode to be recorded if not events["stop_recording"] and ( - (episode_index < num_episodes - 1) or events["rerecord_episode"] + (dataset.num_episodes < num_episodes - 1) or events["rerecord_episode"] ): log_say("Reset the environment", play_sounds) reset_environment(robot, events, reset_time_s) @@ -295,11 +309,11 @@ def record( log_say("Re-record episode", play_sounds) events["rerecord_episode"] = False events["exit_early"] = False - delete_current_episode(dataset) + dataset.clear_episode_buffer() continue - # Increment by one dataset["current_episode_index"] - save_current_episode(dataset) + dataset.save_episode(task) + recorded_episodes += 1 if events["stop_recording"]: break @@ -307,35 +321,42 @@ def record( log_say("Stop recording", play_sounds, blocking=True) stop_recording(robot, listener, display_cameras) - lerobot_dataset = create_lerobot_dataset(dataset, run_compute_stats, push_to_hub, tags, play_sounds) + if run_compute_stats: + logging.info("Computing dataset statistics") + + dataset.consolidate(run_compute_stats) + + if push_to_hub: + dataset.push_to_hub(tags=tags) log_say("Exiting", play_sounds) - return lerobot_dataset + return dataset @safe_disconnect def replay( - robot: Robot, episode: int, fps: int | None = None, root="data", repo_id="lerobot/debug", play_sounds=True + robot: Robot, + root: Path, + repo_id: str, + episode: int, + fps: int | None = None, + play_sounds: bool = True, + local_files_only: bool = True, ): # TODO(rcadene, aliberts): refactor with control_loop, once `dataset` is an instance of LeRobotDataset # TODO(rcadene): Add option to record logs - local_dir = Path(root) / repo_id - if not local_dir.exists(): - raise 
ValueError(local_dir) - dataset = LeRobotDataset(repo_id, root=root) - items = dataset.hf_dataset.select_columns("action") - from_idx = dataset.episode_data_index["from"][episode].item() - to_idx = dataset.episode_data_index["to"][episode].item() + dataset = LeRobotDataset(repo_id, root=root, episodes=[episode], local_files_only=local_files_only) + actions = dataset.hf_dataset.select_columns("action") if not robot.is_connected: robot.connect() log_say("Replaying episode", play_sounds, blocking=True) - for idx in range(from_idx, to_idx): + for idx in range(dataset.num_frames): start_episode_t = time.perf_counter() - action = items[idx]["action"] + action = actions[idx]["action"] robot.send_action(action) dt_s = time.perf_counter() - start_episode_t @@ -384,13 +405,25 @@ def replay( ) parser_record = subparsers.add_parser("record", parents=[base_parser]) + task_args = parser_record.add_mutually_exclusive_group(required=True) parser_record.add_argument( "--fps", type=none_or_int, default=None, help="Frames per second (set to None to disable)" ) + task_args.add_argument( + "--single-task", + type=str, + help="A short but accurate description of the task performed during the recording.", + ) + # TODO(aliberts): add multi-task support + # task_args.add_argument( + # "--multi-task", + # type=int, + # help="You will need to enter the task performed at the start of each episode.", + # ) parser_record.add_argument( "--root", type=Path, - default="data", + default=None, help="Root directory where the dataset will be stored locally at '{root}/{repo_id}' (e.g. 'data/hf_username/dataset_name').", ) parser_record.add_argument( @@ -458,10 +491,10 @@ def replay( ), ) parser_record.add_argument( - "--force-override", + "--resume", type=int, default=0, - help="By default, data recording is resumed. When set to 1, delete the local directory and start data recording from scratch.", + help="Resume recording on an existing dataset.", ) parser_record.add_argument( "-p", @@ -486,7 +519,7 @@ def replay( parser_replay.add_argument( "--root", type=Path, - default="data", + default=None, help="Root directory where the dataset will be stored locally at '{root}/{repo_id}' (e.g. 'data/hf_username/dataset_name').", ) parser_replay.add_argument( diff --git a/lerobot/scripts/eval.py b/lerobot/scripts/eval.py index 0aec84720..040f92d96 100644 --- a/lerobot/scripts/eval.py +++ b/lerobot/scripts/eval.py @@ -484,7 +484,7 @@ def main( policy = make_policy(hydra_cfg=hydra_cfg, pretrained_policy_name_or_path=str(pretrained_policy_path)) else: # Note: We need the dataset stats to pass to the policy's normalization modules. 
- policy = make_policy(hydra_cfg=hydra_cfg, dataset_stats=make_dataset(hydra_cfg).stats) + policy = make_policy(hydra_cfg=hydra_cfg, dataset_stats=make_dataset(hydra_cfg).meta.stats) assert isinstance(policy, nn.Module) policy.eval() diff --git a/lerobot/scripts/push_dataset_to_hub.py b/lerobot/scripts/push_dataset_to_hub.py index adc4c72ad..2bb641a4d 100644 --- a/lerobot/scripts/push_dataset_to_hub.py +++ b/lerobot/scripts/push_dataset_to_hub.py @@ -117,10 +117,14 @@ def push_meta_data_to_hub(repo_id: str, meta_data_dir: str | Path, revision: str def push_dataset_card_to_hub( - repo_id: str, revision: str | None, tags: list | None = None, text: str | None = None + repo_id: str, + revision: str | None, + tags: list | None = None, + license: str = "apache-2.0", + **card_kwargs, ): """Creates and pushes a LeRobotDataset Card with appropriate tags to easily find it on the hub.""" - card = create_lerobot_dataset_card(tags=tags, text=text) + card = create_lerobot_dataset_card(tags=tags, license=license, **card_kwargs) card.push_to_hub(repo_id=repo_id, repo_type="dataset", revision=revision) @@ -260,7 +264,7 @@ def push_dataset_to_hub( episode_index = 0 tests_videos_dir = tests_data_dir / repo_id / "videos" tests_videos_dir.mkdir(parents=True, exist_ok=True) - for key in lerobot_dataset.video_frame_keys: + for key in lerobot_dataset.camera_keys: fname = f"{key}_episode_{episode_index:06d}.mp4" shutil.copy(videos_dir / fname, tests_videos_dir / fname) diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py index f60f904eb..9a0b7e4cb 100644 --- a/lerobot/scripts/train.py +++ b/lerobot/scripts/train.py @@ -171,9 +171,9 @@ def log_train_info(logger: Logger, info, step, cfg, dataset, is_online): # A sample is an (observation,action) pair, where observation and action # can be on multiple timestamps. In a batch, we have `batch_size`` number of samples. num_samples = (step + 1) * cfg.training.batch_size - avg_samples_per_ep = dataset.num_samples / dataset.num_episodes + avg_samples_per_ep = dataset.num_frames / dataset.num_episodes num_episodes = num_samples / avg_samples_per_ep - num_epochs = num_samples / dataset.num_samples + num_epochs = num_samples / dataset.num_frames log_items = [ f"step:{format_big_number(step)}", # number of samples seen during training @@ -208,9 +208,9 @@ def log_eval_info(logger, info, step, cfg, dataset, is_online): # A sample is an (observation,action) pair, where observation and action # can be on multiple timestamps. In a batch, we have `batch_size`` number of samples. 
num_samples = (step + 1) * cfg.training.batch_size - avg_samples_per_ep = dataset.num_samples / dataset.num_episodes + avg_samples_per_ep = dataset.num_frames / dataset.num_episodes num_episodes = num_samples / avg_samples_per_ep - num_epochs = num_samples / dataset.num_samples + num_epochs = num_samples / dataset.num_frames log_items = [ f"step:{format_big_number(step)}", # number of samples seen during training @@ -328,7 +328,7 @@ def train(cfg: DictConfig, out_dir: str | None = None, job_name: str | None = No logging.info("make_policy") policy = make_policy( hydra_cfg=cfg, - dataset_stats=offline_dataset.stats if not cfg.resume else None, + dataset_stats=offline_dataset.meta.stats if not cfg.resume else None, pretrained_policy_name_or_path=str(logger.last_pretrained_model_dir) if cfg.resume else None, ) assert isinstance(policy, nn.Module) @@ -349,7 +349,7 @@ def train(cfg: DictConfig, out_dir: str | None = None, job_name: str | None = No logging.info(f"{cfg.env.task=}") logging.info(f"{cfg.training.offline_steps=} ({format_big_number(cfg.training.offline_steps)})") logging.info(f"{cfg.training.online_steps=}") - logging.info(f"{offline_dataset.num_samples=} ({format_big_number(offline_dataset.num_samples)})") + logging.info(f"{offline_dataset.num_frames=} ({format_big_number(offline_dataset.num_frames)})") logging.info(f"{offline_dataset.num_episodes=}") logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})") logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})") @@ -573,7 +573,7 @@ def sample_trajectory_and_update_buffer(): online_drop_n_last_frames=cfg.training.get("drop_n_last_frames", 0) + 1, online_sampling_ratio=cfg.training.online_sampling_ratio, ) - sampler.num_samples = len(concat_dataset) + sampler.num_frames = len(concat_dataset) update_online_buffer_s = time.perf_counter() - start_update_buffer_time diff --git a/lerobot/scripts/visualize_dataset.py b/lerobot/scripts/visualize_dataset.py index 6cff5752a..cdd5ce605 100644 --- a/lerobot/scripts/visualize_dataset.py +++ b/lerobot/scripts/visualize_dataset.py @@ -100,7 +100,7 @@ def to_hwc_uint8_numpy(chw_float32_torch: torch.Tensor) -> np.ndarray: def visualize_dataset( - repo_id: str, + dataset: LeRobotDataset, episode_index: int, batch_size: int = 32, num_workers: int = 0, @@ -108,7 +108,6 @@ def visualize_dataset( web_port: int = 9090, ws_port: int = 9087, save: bool = False, - root: Path | None = None, output_dir: Path | None = None, ) -> Path | None: if save: @@ -116,8 +115,7 @@ def visualize_dataset( output_dir is not None ), "Set an output directory where to write .rrd files with `--output-dir path/to/directory`." - logging.info("Loading dataset") - dataset = LeRobotDataset(repo_id, root=root) + repo_id = dataset.repo_id logging.info("Loading dataloader") episode_sampler = EpisodeSampler(dataset, episode_index) @@ -153,7 +151,7 @@ def visualize_dataset( rr.set_time_seconds("timestamp", batch["timestamp"][i].item()) # display each camera image - for key in dataset.camera_keys: + for key in dataset.meta.camera_keys: # TODO(rcadene): add `.compress()`? is it lossless? rr.log(key, rr.Image(to_hwc_uint8_numpy(batch[key][i]))) @@ -209,11 +207,17 @@ def main(): required=True, help="Episode to visualize.", ) + parser.add_argument( + "--local-files-only", + type=int, + default=0, + help="Use local files only. 
By default, this script will try to fetch the dataset from the hub if it exists.", + ) parser.add_argument( "--root", type=Path, default=None, - help="Root directory for a dataset stored locally (e.g. `--root data`). By default, the dataset will be loaded from hugging face cache folder, or downloaded from the hub if available.", + help="Root directory for the dataset stored locally (e.g. `--root data`). By default, the dataset will be loaded from hugging face cache folder, or downloaded from the hub if available.", ) parser.add_argument( "--output-dir", @@ -268,7 +272,15 @@ def main(): ) args = parser.parse_args() - visualize_dataset(**vars(args)) + kwargs = vars(args) + repo_id = kwargs.pop("repo_id") + root = kwargs.pop("root") + local_files_only = kwargs.pop("local_files_only") + + logging.info("Loading dataset") + dataset = LeRobotDataset(repo_id, root=root, local_files_only=local_files_only) + + visualize_dataset(dataset, **vars(args)) if __name__ == "__main__": diff --git a/lerobot/scripts/visualize_dataset_html.py b/lerobot/scripts/visualize_dataset_html.py index c035e5626..2c81fbfc5 100644 --- a/lerobot/scripts/visualize_dataset_html.py +++ b/lerobot/scripts/visualize_dataset_html.py @@ -93,18 +93,17 @@ def index(): def show_episode(dataset_namespace, dataset_name, episode_id): dataset_info = { "repo_id": dataset.repo_id, - "num_samples": dataset.num_samples, + "num_samples": dataset.num_frames, "num_episodes": dataset.num_episodes, "fps": dataset.fps, } - video_paths = get_episode_video_paths(dataset, episode_id) - language_instruction = get_episode_language_instruction(dataset, episode_id) + video_paths = [dataset.meta.get_video_file_path(episode_id, key) for key in dataset.meta.video_keys] + tasks = dataset.meta.episodes[episode_id]["tasks"] videos_info = [ - {"url": url_for("static", filename=video_path), "filename": Path(video_path).name} + {"url": url_for("static", filename=video_path), "filename": video_path.name} for video_path in video_paths ] - if language_instruction: - videos_info[0]["language_instruction"] = language_instruction + videos_info[0]["language_instruction"] = tasks ep_csv_url = url_for("static", filename=get_ep_csv_fname(episode_id)) return render_template( @@ -131,16 +130,16 @@ def write_episode_data_csv(output_dir, file_name, episode_index, dataset): from_idx = dataset.episode_data_index["from"][episode_index] to_idx = dataset.episode_data_index["to"][episode_index] - has_state = "observation.state" in dataset.hf_dataset.features - has_action = "action" in dataset.hf_dataset.features + has_state = "observation.state" in dataset.features + has_action = "action" in dataset.features # init header of csv with state and action names header = ["timestamp"] if has_state: - dim_state = len(dataset.hf_dataset["observation.state"][0]) + dim_state = dataset.meta.shapes["observation.state"][0] header += [f"state_{i}" for i in range(dim_state)] if has_action: - dim_action = len(dataset.hf_dataset["action"][0]) + dim_action = dataset.meta.shapes["action"][0] header += [f"action_{i}" for i in range(dim_action)] columns = ["timestamp"] @@ -172,27 +171,12 @@ def get_episode_video_paths(dataset: LeRobotDataset, ep_index: int) -> list[str] first_frame_idx = dataset.episode_data_index["from"][ep_index].item() return [ dataset.hf_dataset.select_columns(key)[first_frame_idx][key]["path"] - for key in dataset.video_frame_keys + for key in dataset.meta.video_keys ] -def get_episode_language_instruction(dataset: LeRobotDataset, ep_index: int) -> list[str]: - # check if the 
dataset has language instructions - if "language_instruction" not in dataset.hf_dataset.features: - return None - - # get first frame index - first_frame_idx = dataset.episode_data_index["from"][ep_index].item() - - language_instruction = dataset.hf_dataset[first_frame_idx]["language_instruction"] - # TODO (michel-aractingi) hack to get the sentence, some strings in openx are badly stored - # with the tf.tensor appearing in the string - return language_instruction.removeprefix("tf.Tensor(b'").removesuffix("', shape=(), dtype=string)") - - def visualize_dataset_html( - repo_id: str, - root: Path | None = None, + dataset: LeRobotDataset, episodes: list[int] = None, output_dir: Path | None = None, serve: bool = True, @@ -202,13 +186,11 @@ def visualize_dataset_html( ) -> Path | None: init_logging() - dataset = LeRobotDataset(repo_id, root=root) - - if not dataset.video: - raise NotImplementedError(f"Image datasets ({dataset.video=}) are currently not supported.") + if len(dataset.meta.image_keys) > 0: + raise NotImplementedError(f"Image keys ({dataset.meta.image_keys=}) are currently not supported.") if output_dir is None: - output_dir = f"outputs/visualize_dataset_html/{repo_id}" + output_dir = f"outputs/visualize_dataset_html/{dataset.repo_id}" output_dir = Path(output_dir) if output_dir.exists(): @@ -225,7 +207,7 @@ def visualize_dataset_html( static_dir.mkdir(parents=True, exist_ok=True) ln_videos_dir = static_dir / "videos" if not ln_videos_dir.exists(): - ln_videos_dir.symlink_to(dataset.videos_dir.resolve()) + ln_videos_dir.symlink_to((dataset.root / "videos").resolve()) template_dir = Path(__file__).resolve().parent.parent / "templates" @@ -252,6 +234,12 @@ def main(): required=True, help="Name of hugging face repositery containing a LeRobotDataset dataset (e.g. `lerobot/pusht` for https://huggingface.co/datasets/lerobot/pusht).", ) + parser.add_argument( + "--local-files-only", + type=int, + default=0, + help="Use local files only. By default, this script will try to fetch the dataset from the hub if it exists.", + ) parser.add_argument( "--root", type=Path, @@ -297,7 +285,13 @@ def main(): ) args = parser.parse_args() - visualize_dataset_html(**vars(args)) + kwargs = vars(args) + repo_id = kwargs.pop("repo_id") + root = kwargs.pop("root") + local_files_only = kwargs.pop("local_files_only") + + dataset = LeRobotDataset(repo_id, root=root, local_files_only=local_files_only) + visualize_dataset_html(dataset, **kwargs) if __name__ == "__main__": diff --git a/lerobot/scripts/visualize_image_transforms.py b/lerobot/scripts/visualize_image_transforms.py index e7cd35827..f9fb5c08a 100644 --- a/lerobot/scripts/visualize_image_transforms.py +++ b/lerobot/scripts/visualize_image_transforms.py @@ -157,7 +157,7 @@ def visualize_transforms(cfg, output_dir: Path, n_examples: int = 5): output_dir.mkdir(parents=True, exist_ok=True) # Get 1st frame from 1st camera of 1st episode - original_frame = dataset[0][dataset.camera_keys[0]] + original_frame = dataset[0][dataset.meta.camera_keys[0]] to_pil(original_frame).save(output_dir / "original_frame.png", quality=100) print("\nOriginal frame saved to:") print(f" {output_dir / 'original_frame.png'}.") diff --git a/lerobot/templates/visualize_dataset_template.html b/lerobot/templates/visualize_dataset_template.html index 658d6ba6c..0fa1e713e 100644 --- a/lerobot/templates/visualize_dataset_template.html +++ b/lerobot/templates/visualize_dataset_template.html @@ -35,7 +35,7 @@

{{ dataset_info.repo_id }}