From a2c181992a131c92e1930f3ebc3014112fe03625 Mon Sep 17 00:00:00 2001 From: Michel Aractingi Date: Tue, 3 Dec 2024 00:51:55 +0100 Subject: [PATCH 1/3] Refactor OpenX (#505) --- .../push_dataset_to_hub/openx/configs.yaml | 639 ------------- .../push_dataset_to_hub/openx/data_utils.py | 106 --- .../push_dataset_to_hub/openx/droid_utils.py | 200 ---- .../push_dataset_to_hub/openx/transforms.py | 859 ------------------ .../push_dataset_to_hub/openx_rlds_format.py | 143 +-- lerobot/scripts/push_dataset_to_hub.py | 30 +- 6 files changed, 58 insertions(+), 1919 deletions(-) delete mode 100644 lerobot/common/datasets/push_dataset_to_hub/openx/configs.yaml delete mode 100644 lerobot/common/datasets/push_dataset_to_hub/openx/data_utils.py delete mode 100644 lerobot/common/datasets/push_dataset_to_hub/openx/droid_utils.py delete mode 100644 lerobot/common/datasets/push_dataset_to_hub/openx/transforms.py diff --git a/lerobot/common/datasets/push_dataset_to_hub/openx/configs.yaml b/lerobot/common/datasets/push_dataset_to_hub/openx/configs.yaml deleted file mode 100644 index f706270a2..000000000 --- a/lerobot/common/datasets/push_dataset_to_hub/openx/configs.yaml +++ /dev/null @@ -1,639 +0,0 @@ -OPENX_DATASET_CONFIGS: - fractal20220817_data: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - base_pose_tool_reached - - gripper_closed - fps: 3 - - kuka: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - clip_function_input/base_pose_tool_reached - - gripper_closed - fps: 10 - - bridge_openx: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - EEF_state - - gripper_state - fps: 5 - - taco_play: - image_obs_keys: - - rgb_static - - rgb_gripper - depth_obs_keys: - - depth_static - - depth_gripper - state_obs_keys: - - state_eef - - state_gripper - fps: 15 - - jaco_play: - image_obs_keys: - - image - - image_wrist - depth_obs_keys: - - null - state_obs_keys: - - state_eef - - state_gripper - fps: 10 - - berkeley_cable_routing: - image_obs_keys: - - image - - top_image - - wrist45_image - - wrist225_image - depth_obs_keys: - - null - state_obs_keys: - - robot_state - fps: 10 - - roboturk: - image_obs_keys: - - front_rgb - depth_obs_keys: - - null - state_obs_keys: - - null - fps: 10 - - nyu_door_opening_surprising_effectiveness: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - null - fps: 3 - - viola: - image_obs_keys: - - agentview_rgb - - eye_in_hand_rgb - depth_obs_keys: - - null - state_obs_keys: - - joint_states - - gripper_states - fps: 20 - - berkeley_autolab_ur5: - image_obs_keys: - - image - - hand_image - depth_obs_keys: - - image_with_depth - state_obs_keys: - - state - fps: 5 - - toto: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 30 - - language_table: - image_obs_keys: - - rgb - depth_obs_keys: - - null - state_obs_keys: - - effector_translation - fps: 10 - - columbia_cairlab_pusht_real: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - robot_state - fps: 10 - - stanford_kuka_multimodal_dataset_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - depth_image - state_obs_keys: - - ee_position - - ee_orientation - fps: 20 - - nyu_rot_dataset_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - eef_state - - gripper_state - fps: 3 - - io_ai_tech: - image_obs_keys: - - image - - image_fisheye - - image_left_side - - 
image_right_side - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 3 - - stanford_hydra_dataset_converted_externally_to_rlds: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - eef_state - - gripper_state - fps: 10 - - austin_buds_dataset_converted_externally_to_rlds: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 20 - - nyu_franka_play_dataset_converted_externally_to_rlds: - image_obs_keys: - - image - - image_additional_view - depth_obs_keys: - - depth - - depth_additional_view - state_obs_keys: - - eef_state - fps: 3 - - maniskill_dataset_converted_externally_to_rlds: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - depth - - wrist_depth - state_obs_keys: - - tcp_pose - - gripper_state - fps: 20 - - furniture_bench_dataset_converted_externally_to_rlds: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 10 - - cmu_franka_exploration_dataset_converted_externally_to_rlds: - image_obs_keys: - - highres_image - depth_obs_keys: - - null - state_obs_keys: - - null - fps: 10 - - ucsd_kitchen_dataset_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - joint_state - fps: 2 - - ucsd_pick_and_place_dataset_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - eef_state - - gripper_state - fps: 3 - - spoc: - image_obs_keys: - - image - - image_manipulation - depth_obs_keys: - - null - state_obs_keys: - - null - fps: 3 - - austin_sailor_dataset_converted_externally_to_rlds: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 20 - - austin_sirius_dataset_converted_externally_to_rlds: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 20 - - bc_z: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - present/xyz - - present/axis_angle - - present/sensed_close - fps: 10 - - utokyo_pr2_opening_fridge_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - eef_state - - gripper_state - fps: 10 - - utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - eef_state - - gripper_state - fps: 10 - - utokyo_xarm_pick_and_place_converted_externally_to_rlds: - image_obs_keys: - - image - - image2 - - hand_image - depth_obs_keys: - - null - state_obs_keys: - - end_effector_pose - fps: 10 - - utokyo_xarm_bimanual_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - pose_r - fps: 10 - - robo_net: - image_obs_keys: - - image - - image1 - depth_obs_keys: - - null - state_obs_keys: - - eef_state - - gripper_state - fps: 1 - - robo_set: - image_obs_keys: - - image_left - - image_right - - image_wrist - depth_obs_keys: - - null - state_obs_keys: - - state - - state_velocity - fps: 5 - - berkeley_mvp_converted_externally_to_rlds: - image_obs_keys: - - hand_image - depth_obs_keys: - - null - state_obs_keys: - - gripper - - pose - - joint_pos - fps: 5 - - berkeley_rpt_converted_externally_to_rlds: - image_obs_keys: - - hand_image - depth_obs_keys: - - null - state_obs_keys: - - joint_pos - - gripper - fps: 30 - - kaist_nonprehensile_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - 
state_obs_keys: - - state - fps: 10 - - stanford_mask_vit_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - eef_state - - gripper_state - - tokyo_u_lsmo_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - eef_state - - gripper_state - fps: 10 - - dlr_sara_pour_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 10 - - dlr_sara_grid_clamp_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 10 - - dlr_edan_shared_control_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 5 - - asu_table_top_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - eef_state - - gripper_state - fps: 12.5 - - stanford_robocook_converted_externally_to_rlds: - image_obs_keys: - - image_1 - - image_2 - depth_obs_keys: - - depth_1 - - depth_2 - state_obs_keys: - - eef_state - - gripper_state - fps: 5 - - imperialcollege_sawyer_wrist_cam: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 10 - - iamlab_cmu_pickup_insert_converted_externally_to_rlds: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - joint_state - - gripper_state - fps: 20 - - uiuc_d3field: - image_obs_keys: - - image_1 - - image_2 - depth_obs_keys: - - depth_1 - - depth_2 - state_obs_keys: - - null - fps: 1 - - utaustin_mutex: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 20 - - berkeley_fanuc_manipulation: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - joint_state - - gripper_state - fps: 10 - - cmu_playing_with_food: - image_obs_keys: - - image - - finger_vision_1 - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 10 - - cmu_play_fusion: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 5 - - cmu_stretch: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - eef_state - - gripper_state - fps: 10 - - berkeley_gnm_recon: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - state - - position - - yaw - fps: 3 - - berkeley_gnm_cory_hall: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - state - - position - - yaw - fps: 5 - - berkeley_gnm_sac_son: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - state - - position - - yaw - fps: 10 - - droid: - image_obs_keys: - - exterior_image_1_left - - exterior_image_2_left - - wrist_image_left - depth_obs_keys: - - null - state_obs_keys: - - proprio - fps: 15 - - droid_100: - image_obs_keys: - - exterior_image_1_left - - exterior_image_2_left - - wrist_image_left - depth_obs_keys: - - null - state_obs_keys: - - proprio - fps: 15 - - fmb: - image_obs_keys: - - image_side_1 - - image_side_2 - - image_wrist_1 - - image_wrist_2 - depth_obs_keys: - - image_side_1_depth - - image_side_2_depth - - image_wrist_1_depth - - image_wrist_2_depth - state_obs_keys: - - proprio - fps: 10 - - dobbe: - image_obs_keys: - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - proprio - fps: 3.75 - - usc_cloth_sim_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: 
- - null - fps: 10 - - plex_robosuite: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 20 - - conq_hose_manipulation: - image_obs_keys: - - frontleft_fisheye_image - - frontright_fisheye_image - - hand_color_image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 30 diff --git a/lerobot/common/datasets/push_dataset_to_hub/openx/data_utils.py b/lerobot/common/datasets/push_dataset_to_hub/openx/data_utils.py deleted file mode 100644 index 1582c67c2..000000000 --- a/lerobot/common/datasets/push_dataset_to_hub/openx/data_utils.py +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the Licens e. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -NOTE(YL): Adapted from: - Octo: https://github.com/octo-models/octo/blob/main/octo/data/utils/data_utils.py - -data_utils.py - -Additional utils for data processing. -""" - -from typing import Any, Dict, List - -import tensorflow as tf - - -def binarize_gripper_actions(actions: tf.Tensor) -> tf.Tensor: - """ - Converts gripper actions from continuous to binary values (0 and 1). - - We exploit that fact that most of the time, the gripper is fully open (near 1.0) or fully closed (near 0.0). As it - transitions between the two, it sometimes passes through a few intermediate values. We relabel those intermediate - values based on the state that is reached _after_ those intermediate values. - - In the edge case that the trajectory ends with an intermediate value, we give up on binarizing and relabel that - chunk of intermediate values as the last action in the trajectory. - - The `scan_fn` implements the following logic: - new_actions = np.empty_like(actions) - carry = actions[-1] - for i in reversed(range(actions.shape[0])): - if in_between_mask[i]: - carry = carry - else: - carry = float(open_mask[i]) - new_actions[i] = carry - """ - open_mask, closed_mask = actions > 0.95, actions < 0.05 - in_between_mask = tf.logical_not(tf.logical_or(open_mask, closed_mask)) - is_open_float = tf.cast(open_mask, tf.float32) - - def scan_fn(carry, i): - return tf.cond(in_between_mask[i], lambda: tf.cast(carry, tf.float32), lambda: is_open_float[i]) - - return tf.scan(scan_fn, tf.range(tf.shape(actions)[0]), actions[-1], reverse=True) - - -def invert_gripper_actions(actions: tf.Tensor) -> tf.Tensor: - return 1 - actions - - -def rel2abs_gripper_actions(actions: tf.Tensor) -> tf.Tensor: - """ - Converts relative gripper actions (+1 for closing, -1 for opening) to absolute actions (0 = closed; 1 = open). - - Assumes that the first relative gripper is not redundant (i.e. close when already closed)! 
- """ - # Note =>> -1 for closing, 1 for opening, 0 for no change - opening_mask, closing_mask = actions < -0.1, actions > 0.1 - thresholded_actions = tf.where(opening_mask, 1, tf.where(closing_mask, -1, 0)) - - def scan_fn(carry, i): - return tf.cond(thresholded_actions[i] == 0, lambda: carry, lambda: thresholded_actions[i]) - - # If no relative grasp, assumes open for whole trajectory - start = -1 * thresholded_actions[tf.argmax(thresholded_actions != 0, axis=0)] - start = tf.cond(start == 0, lambda: 1, lambda: start) - - # Note =>> -1 for closed, 1 for open - new_actions = tf.scan(scan_fn, tf.range(tf.shape(actions)[0]), start) - new_actions = tf.cast(new_actions, tf.float32) / 2 + 0.5 - - return new_actions - - -# === Bridge-V2 =>> Dataset-Specific Transform === -def relabel_bridge_actions(traj: Dict[str, Any]) -> Dict[str, Any]: - """Relabels actions to use reached proprioceptive state; discards last timestep (no-action).""" - movement_actions = traj["observation"]["state"][1:, :6] - traj["observation"]["state"][:-1, :6] - traj_truncated = tf.nest.map_structure(lambda x: x[:-1], traj) - traj_truncated["action"] = tf.concat([movement_actions, traj["action"][:-1, -1:]], axis=1) - - return traj_truncated - - -# === RLDS Dataset Initialization Utilities === -def pprint_data_mixture(dataset_kwargs_list: List[Dict[str, Any]], dataset_weights: List[int]) -> None: - print("\n######################################################################################") - print(f"# Loading the following {len(dataset_kwargs_list)} datasets (incl. sampling weight):{'': >24} #") - for dataset_kwargs, weight in zip(dataset_kwargs_list, dataset_weights, strict=False): - pad = 80 - len(dataset_kwargs["name"]) - print(f"# {dataset_kwargs['name']}: {weight:=>{pad}f} #") - print("######################################################################################\n") diff --git a/lerobot/common/datasets/push_dataset_to_hub/openx/droid_utils.py b/lerobot/common/datasets/push_dataset_to_hub/openx/droid_utils.py deleted file mode 100644 index 22ac4d9e3..000000000 --- a/lerobot/common/datasets/push_dataset_to_hub/openx/droid_utils.py +++ /dev/null @@ -1,200 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -NOTE(YL): Adapted from: - OpenVLA: https://github.com/openvla/openvla - -Episode transforms for DROID dataset. -""" - -from typing import Any, Dict - -import tensorflow as tf -import tensorflow_graphics.geometry.transformation as tfg - - -def rmat_to_euler(rot_mat): - return tfg.euler.from_rotation_matrix(rot_mat) - - -def euler_to_rmat(euler): - return tfg.rotation_matrix_3d.from_euler(euler) - - -def invert_rmat(rot_mat): - return tfg.rotation_matrix_3d.inverse(rot_mat) - - -def rotmat_to_rot6d(mat): - """ - Converts rotation matrix to R6 rotation representation (first two rows in rotation matrix). 
- Args: - mat: rotation matrix - - Returns: 6d vector (first two rows of rotation matrix) - - """ - r6 = mat[..., :2, :] - r6_0, r6_1 = r6[..., 0, :], r6[..., 1, :] - r6_flat = tf.concat([r6_0, r6_1], axis=-1) - return r6_flat - - -def velocity_act_to_wrist_frame(velocity, wrist_in_robot_frame): - """ - Translates velocity actions (translation + rotation) from base frame of the robot to wrist frame. - Args: - velocity: 6d velocity action (3 x translation, 3 x rotation) - wrist_in_robot_frame: 6d pose of the end-effector in robot base frame - - Returns: 9d velocity action in robot wrist frame (3 x translation, 6 x rotation as R6) - - """ - r_frame = euler_to_rmat(wrist_in_robot_frame[:, 3:6]) - r_frame_inv = invert_rmat(r_frame) - - # world to wrist: dT_pi = R^-1 dT_rbt - vel_t = (r_frame_inv @ velocity[:, :3][..., None])[..., 0] - - # world to wrist: dR_pi = R^-1 dR_rbt R - dr_ = euler_to_rmat(velocity[:, 3:6]) - dr_ = r_frame_inv @ (dr_ @ r_frame) - dr_r6 = rotmat_to_rot6d(dr_) - return tf.concat([vel_t, dr_r6], axis=-1) - - -def rand_swap_exterior_images(img1, img2): - """ - Randomly swaps the two exterior images (for training with single exterior input). - """ - return tf.cond(tf.random.uniform(shape=[]) > 0.5, lambda: (img1, img2), lambda: (img2, img1)) - - -def droid_baseact_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - """ - DROID dataset transformation for actions expressed in *base* frame of the robot. - """ - dt = trajectory["action_dict"]["cartesian_velocity"][:, :3] - dr_ = trajectory["action_dict"]["cartesian_velocity"][:, 3:6] - - trajectory["action"] = tf.concat( - ( - dt, - dr_, - 1 - trajectory["action_dict"]["gripper_position"], - ), - axis=-1, - ) - trajectory["observation"]["exterior_image_1_left"], trajectory["observation"]["exterior_image_2_left"] = ( - rand_swap_exterior_images( - trajectory["observation"]["exterior_image_1_left"], - trajectory["observation"]["exterior_image_2_left"], - ) - ) - trajectory["observation"]["proprio"] = tf.concat( - ( - trajectory["observation"]["cartesian_position"], - trajectory["observation"]["gripper_position"], - ), - axis=-1, - ) - return trajectory - - -def droid_wristact_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - """ - DROID dataset transformation for actions expressed in *wrist* frame of the robot. - """ - wrist_act = velocity_act_to_wrist_frame( - trajectory["action_dict"]["cartesian_velocity"], trajectory["observation"]["cartesian_position"] - ) - trajectory["action"] = tf.concat( - ( - wrist_act, - trajectory["action_dict"]["gripper_position"], - ), - axis=-1, - ) - trajectory["observation"]["exterior_image_1_left"], trajectory["observation"]["exterior_image_2_left"] = ( - rand_swap_exterior_images( - trajectory["observation"]["exterior_image_1_left"], - trajectory["observation"]["exterior_image_2_left"], - ) - ) - trajectory["observation"]["proprio"] = tf.concat( - ( - trajectory["observation"]["cartesian_position"], - trajectory["observation"]["gripper_position"], - ), - axis=-1, - ) - return trajectory - - -def droid_finetuning_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - """ - DROID dataset transformation for actions expressed in *base* frame of the robot. 
- """ - dt = trajectory["action_dict"]["cartesian_velocity"][:, :3] - dr_ = trajectory["action_dict"]["cartesian_velocity"][:, 3:6] - trajectory["action"] = tf.concat( - ( - dt, - dr_, - 1 - trajectory["action_dict"]["gripper_position"], - ), - axis=-1, - ) - trajectory["observation"]["proprio"] = tf.concat( - ( - trajectory["observation"]["cartesian_position"], - trajectory["observation"]["gripper_position"], - ), - axis=-1, - ) - return trajectory - - -def zero_action_filter(traj: Dict) -> bool: - """ - Filters transitions whose actions are all-0 (only relative actions, no gripper action). - Note: this filter is applied *after* action normalization, so need to compare to "normalized 0". - """ - droid_q01 = tf.convert_to_tensor( - [ - -0.7776297926902771, - -0.5803514122962952, - -0.5795090794563293, - -0.6464047729969025, - -0.7041108310222626, - -0.8895104378461838, - ] - ) - droid_q99 = tf.convert_to_tensor( - [ - 0.7597932070493698, - 0.5726242214441299, - 0.7351000607013702, - 0.6705610305070877, - 0.6464948207139969, - 0.8897542208433151, - ] - ) - droid_norm_0_act = ( - 2 * (tf.zeros_like(traj["action"][:, :6]) - droid_q01) / (droid_q99 - droid_q01 + 1e-8) - 1 - ) - - return tf.reduce_any(tf.math.abs(traj["action"][:, :6] - droid_norm_0_act) > 1e-5) diff --git a/lerobot/common/datasets/push_dataset_to_hub/openx/transforms.py b/lerobot/common/datasets/push_dataset_to_hub/openx/transforms.py deleted file mode 100644 index a0c1e30f6..000000000 --- a/lerobot/common/datasets/push_dataset_to_hub/openx/transforms.py +++ /dev/null @@ -1,859 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -NOTE(YL): Adapted from: - OpenVLA: https://github.com/openvla/openvla - Octo: https://github.com/octo-models/octo - -transforms.py - -Defines a registry of per-dataset standardization transforms for each dataset in Open-X Embodiment. - -Transforms adopt the following structure: - Input: Dictionary of *batched* features (i.e., has leading time dimension) - Output: Dictionary `step` =>> { - "observation": { - - State (in chosen state representation) - }, - "action": Action (in chosen action representation), - "language_instruction": str - } -""" - -from typing import Any, Dict - -import tensorflow as tf - -from lerobot.common.datasets.push_dataset_to_hub.openx.data_utils import ( - binarize_gripper_actions, - invert_gripper_actions, - rel2abs_gripper_actions, - relabel_bridge_actions, -) - - -def droid_baseact_transform_fn(): - from lerobot.common.datasets.push_dataset_to_hub.openx.droid_utils import droid_baseact_transform - - return droid_baseact_transform - - -def bridge_openx_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - """ - Applies to version of Bridge V2 in Open X-Embodiment mixture. - - Note =>> In original Bridge V2 dataset, the first timestep has an all-zero action, so we remove it! 
- """ - for key in trajectory: - if key == "traj_metadata": - continue - elif key in ["observation", "action"]: - for key2 in trajectory[key]: - trajectory[key][key2] = trajectory[key][key2][1:] - else: - trajectory[key] = trajectory[key][1:] - - trajectory["action"] = tf.concat( - ( - trajectory["action"]["world_vector"], - trajectory["action"]["rotation_delta"], - tf.cast(trajectory["action"]["open_gripper"][:, None], tf.float32), - ), - axis=-1, - ) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - trajectory = relabel_bridge_actions(trajectory) - trajectory["observation"]["EEF_state"] = trajectory["observation"]["state"][:, :6] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:] - return trajectory - - -def bridge_orig_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - """ - Applies to original version of Bridge V2 from the official project website. - - Note =>> In original Bridge V2 dataset, the first timestep has an all-zero action, so we remove it! - """ - for key in trajectory: - if key == "traj_metadata": - continue - elif key == "observation": - for key2 in trajectory[key]: - trajectory[key][key2] = trajectory[key][key2][1:] - else: - trajectory[key] = trajectory[key][1:] - - trajectory["action"] = tf.concat( - [ - trajectory["action"][:, :6], - binarize_gripper_actions(trajectory["action"][:, -1])[:, None], - ], - axis=1, - ) - trajectory = relabel_bridge_actions(trajectory) - trajectory["observation"]["EEF_state"] = trajectory["observation"]["state"][:, :6] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:] - return trajectory - - -def ppgm_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["action"] = tf.concat( - [ - trajectory["action"][:, :6], - binarize_gripper_actions(trajectory["action"][:, -1])[:, None], - ], - axis=1, - ) - trajectory["observation"]["EEF_state"] = trajectory["observation"]["cartesian_position"][:, :6] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["gripper_position"][:, -1:] - return trajectory - - -def rt1_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # make gripper action absolute action, +1 = open, 0 = close - gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0] - gripper_action = rel2abs_gripper_actions(gripper_action) - - trajectory["action"] = tf.concat( - ( - trajectory["action"]["world_vector"], - trajectory["action"]["rotation_delta"], - gripper_action[:, None], - ), - axis=-1, - ) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - return trajectory - - -def kuka_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # make gripper action absolute action, +1 = open, 0 = close - gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0] - gripper_action = rel2abs_gripper_actions(gripper_action) - - trajectory["action"] = tf.concat( - ( - trajectory["action"]["world_vector"], - trajectory["action"]["rotation_delta"], - gripper_action[:, None], - ), - axis=-1, - ) - # decode compressed state - eef_value = tf.io.decode_compressed( - trajectory["observation"]["clip_function_input/base_pose_tool_reached"], - compression_type="ZLIB", - ) - eef_value = tf.io.decode_raw(eef_value, tf.float32) - trajectory["observation"]["clip_function_input/base_pose_tool_reached"] = tf.reshape(eef_value, (-1, 7)) - gripper_value = tf.io.decode_compressed( - 
trajectory["observation"]["gripper_closed"], compression_type="ZLIB" - ) - gripper_value = tf.io.decode_raw(gripper_value, tf.float32) - trajectory["observation"]["gripper_closed"] = tf.reshape(gripper_value, (-1, 1)) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - return trajectory - - -def taco_play_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["state_eef"] = trajectory["observation"]["robot_obs"][:, :6] - trajectory["observation"]["state_gripper"] = trajectory["observation"]["robot_obs"][:, 7:8] - trajectory["action"] = trajectory["action"]["rel_actions_world"] - - # invert gripper action + clip, +1 = open, 0 = close - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :6], - tf.clip_by_value(trajectory["action"][:, -1:], 0, 1), - ), - axis=-1, - ) - - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - return trajectory - - -def jaco_play_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["state_eef"] = trajectory["observation"]["end_effector_cartesian_pos"][:, :6] - trajectory["observation"]["state_gripper"] = trajectory["observation"]["end_effector_cartesian_pos"][ - :, -1: - ] - - # make gripper action absolute action, +1 = open, 0 = close - gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0] - gripper_action = rel2abs_gripper_actions(gripper_action) - - trajectory["action"] = tf.concat( - ( - trajectory["action"]["world_vector"], - tf.zeros_like(trajectory["action"]["world_vector"]), - gripper_action[:, None], - ), - axis=-1, - ) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - return trajectory - - -def berkeley_cable_routing_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["action"] = tf.concat( - ( - trajectory["action"]["world_vector"], - trajectory["action"]["rotation_delta"], - tf.zeros_like(trajectory["action"]["world_vector"][:, :1]), - ), - axis=-1, - ) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - return trajectory - - -def roboturk_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # invert absolute gripper action, +1 = open, 0 = close - gripper_action = invert_gripper_actions( - tf.clip_by_value(trajectory["action"]["gripper_closedness_action"], 0, 1) - ) - - trajectory["action"] = tf.concat( - ( - trajectory["action"]["world_vector"], - trajectory["action"]["rotation_delta"], - gripper_action, - ), - axis=-1, - ) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - trajectory["language_embedding"] = trajectory["observation"]["natural_language_embedding"] - return trajectory - - -def nyu_door_opening_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # make gripper action absolute action, +1 = open, 0 = close - gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0] - gripper_action = rel2abs_gripper_actions(gripper_action) - - trajectory["action"] = tf.concat( - ( - trajectory["action"]["world_vector"], - trajectory["action"]["rotation_delta"], - gripper_action[:, None], - ), - axis=-1, - ) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - return trajectory - - -def viola_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # make gripper action, +1 = open, 0 = close - 
gripper_action = trajectory["action"]["gripper_closedness_action"][:, None] - gripper_action = tf.clip_by_value(gripper_action, 0, 1) - gripper_action = invert_gripper_actions(gripper_action) - - trajectory["action"] = tf.concat( - ( - trajectory["action"]["world_vector"], - trajectory["action"]["rotation_delta"], - gripper_action, - ), - axis=-1, - ) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - return trajectory - - -def berkeley_autolab_ur5_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["state"] = trajectory["observation"]["robot_state"][:, 6:14] - - # make gripper action absolute action, +1 = open, 0 = close - gripper_action = trajectory["action"]["gripper_closedness_action"] - gripper_action = rel2abs_gripper_actions(gripper_action) - - trajectory["action"] = tf.concat( - ( - trajectory["action"]["world_vector"], - trajectory["action"]["rotation_delta"], - gripper_action[:, None], - ), - axis=-1, - ) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - return trajectory - - -def toto_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["action"] = tf.concat( - ( - trajectory["action"]["world_vector"], - trajectory["action"]["rotation_delta"], - tf.cast(trajectory["action"]["open_gripper"][:, None], tf.float32), - ), - axis=-1, - ) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - return trajectory - - -def language_table_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # default to "open" gripper - trajectory["action"] = tf.concat( - ( - trajectory["action"], - tf.zeros_like(trajectory["action"]), - tf.zeros_like(trajectory["action"]), - tf.ones_like(trajectory["action"][:, :1]), - ), - axis=-1, - ) - - # decode language instruction - instruction_bytes = trajectory["observation"]["instruction"] - instruction_encoded = tf.strings.unicode_encode(instruction_bytes, output_encoding="UTF-8") - # Remove trailing padding --> convert RaggedTensor to regular Tensor. 
- trajectory["language_instruction"] = tf.strings.split(instruction_encoded, "\x00")[:, :1].to_tensor()[ - :, 0 - ] - return trajectory - - -def pusht_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["action"] = tf.concat( - ( - trajectory["action"]["world_vector"], - trajectory["action"]["rotation_delta"], - trajectory["action"]["gripper_closedness_action"][:, None], - ), - axis=-1, - ) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - return trajectory - - -def stanford_kuka_multimodal_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["depth_image"] = trajectory["observation"]["depth_image"][..., 0] - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :3], - tf.zeros_like(trajectory["action"][:, :3]), - trajectory["action"][:, -1:], - ), - axis=-1, - ) - return trajectory - - -def nyu_rot_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][..., :6] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][..., -1:] - trajectory["action"] = trajectory["action"][..., :7] - return trajectory - - -def stanford_hydra_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # invert gripper action, +1 = open, 0 = close - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :6], - invert_gripper_actions(trajectory["action"][:, -1:]), - ), - axis=-1, - ) - - trajectory["observation"]["eef_state"] = tf.concat( - ( - trajectory["observation"]["state"][:, :3], - trajectory["observation"]["state"][:, 7:10], - ), - axis=-1, - ) - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -3:-2] - return trajectory - - -def austin_buds_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # invert gripper action + clip, +1 = open, 0 = close - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :6], - invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)), - ), - axis=-1, - ) - - trajectory["observation"]["state"] = trajectory["observation"]["state"][:, :8] - return trajectory - - -def nyu_franka_play_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["depth"] = tf.cast(trajectory["observation"]["depth"][..., 0], tf.float32) - trajectory["observation"]["depth_additional_view"] = tf.cast( - trajectory["observation"]["depth_additional_view"][..., 0], tf.float32 - ) - trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, -6:] - - # clip gripper action, +1 = open, 0 = close - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, -8:-2], - tf.clip_by_value(trajectory["action"][:, -2:-1], 0, 1), - ), - axis=-1, - ) - return trajectory - - -def maniskill_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][..., 7:8] - return trajectory - - -def furniture_bench_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - import tensorflow_graphics.geometry.transformation as tft - - trajectory["observation"]["state"] = tf.concat( - ( - trajectory["observation"]["state"][:, :7], - trajectory["observation"]["state"][:, -1:], - ), - axis=-1, - ) - - # invert gripper action + clip, +1 = open, 0 = close - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :3], - 
tft.euler.from_quaternion(trajectory["action"][:, 3:7]), - invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)), - ), - axis=-1, - ) - return trajectory - - -def cmu_franka_exploration_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["action"] = trajectory["action"][..., :-1] - return trajectory - - -def ucsd_kitchen_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["joint_state"] = trajectory["observation"]["state"][:, :7] - trajectory["action"] = trajectory["action"][..., :-1] - return trajectory - - -def ucsd_pick_place_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:] - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :3], - tf.zeros_like(trajectory["action"][:, :3]), - trajectory["action"][:, -1:], - ), - axis=-1, - ) - return trajectory - - -def austin_sailor_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # invert gripper action + clip, +1 = open, 0 = close - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :6], - invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)), - ), - axis=-1, - ) - return trajectory - - -def austin_sirius_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # invert gripper action + clip, +1 = open, 0 = close - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :6], - invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)), - ), - axis=-1, - ) - return trajectory - - -def bc_z_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["action"] = tf.concat( - ( - trajectory["action"]["future/xyz_residual"][:, :3], - trajectory["action"]["future/axis_angle_residual"][:, :3], - invert_gripper_actions(tf.cast(trajectory["action"]["future/target_close"][:, :1], tf.float32)), - ), - axis=-1, - ) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - return trajectory - - -def tokyo_pr2_opening_fridge_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:] - trajectory["action"] = trajectory["action"][..., :-1] - return trajectory - - -def tokyo_pr2_tabletop_manipulation_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:] - trajectory["action"] = trajectory["action"][..., :-1] - return trajectory - - -def utokyo_xarm_bimanual_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["action"] = trajectory["action"][..., -7:] - return trajectory - - -def robo_net_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["eef_state"] = tf.concat( - ( - trajectory["observation"]["state"][:, :4], - tf.zeros_like(trajectory["observation"]["state"][:, :2]), - ), - axis=-1, - ) - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:] - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :4], - tf.zeros_like(trajectory["action"][:, :2]), - 
trajectory["action"][:, -1:], - ), - axis=-1, - ) - return trajectory - - -def berkeley_mvp_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - """ - trajectory["observation"]["state"] = tf.concat(( - tf.cast(trajectory["observation"]["gripper"][:, None], tf.float32), - trajectory["observation"]["pose"], - trajectory["observation"]["joint_pos"],), - axis=-1,) - """ - trajectory["observation"]["gripper"] = tf.cast(trajectory["observation"]["gripper"][:, None], tf.float32) - return trajectory - - -def berkeley_rpt_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["gripper"] = tf.cast(trajectory["observation"]["gripper"][:, None], tf.float32) - return trajectory - - -def kaist_nonprehensible_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["state"] = trajectory["observation"]["state"][:, -7:] - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :6], - tf.zeros_like(trajectory["action"][:, :1]), - ), - axis=-1, - ) - return trajectory - - -def stanford_mask_vit_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["eef_state"] = tf.concat( - ( - trajectory["observation"]["end_effector_pose"][:, :4], - tf.zeros_like(trajectory["observation"]["end_effector_pose"][:, :2]), - ), - axis=-1, - ) - trajectory["observation"]["gripper_state"] = trajectory["observation"]["end_effector_pose"][:, -1:] - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :4], - tf.zeros_like(trajectory["action"][:, :2]), - trajectory["action"][:, -1:], - ), - axis=-1, - ) - return trajectory - - -def tokyo_lsmo_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:] - return trajectory - - -def dlr_sara_grid_clamp_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["state"] = trajectory["observation"]["state"][:, :6] - return trajectory - - -def dlr_edan_shared_control_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # invert gripper action, +1 = open, 0 = close - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :6], - invert_gripper_actions(trajectory["action"][:, -1:]), - ), - axis=-1, - ) - return trajectory - - -def asu_table_top_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["eef_state"] = trajectory["ground_truth_states"]["EE"] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:] - return trajectory - - -def robocook_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:] - return trajectory - - -def imperial_wristcam_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["action"] = trajectory["action"][..., :-1] - return trajectory - - -def iamlab_pick_insert_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - import tensorflow_graphics.geometry.transformation as tft - - trajectory["observation"]["joint_state"] = trajectory["observation"]["state"][:, :7] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, 7:8] - trajectory["action"] = tf.concat( - ( - 
trajectory["action"][:, :3], - tft.euler.from_quaternion(trajectory["action"][:, 3:7]), - trajectory["action"][:, 7:8], - ), - axis=-1, - ) - return trajectory - - -def uiuc_d3field_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["action"] = tf.concat( - ( - trajectory["action"], - tf.zeros_like(trajectory["action"]), - tf.zeros_like(trajectory["action"][:, :1]), - ), - axis=-1, - ) - return trajectory - - -def utaustin_mutex_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["state"] = trajectory["observation"]["state"][:, :8] - - # invert gripper action + clip, +1 = open, 0 = close - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :6], - invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)), - ), - axis=-1, - ) - return trajectory - - -def berkeley_fanuc_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["joint_state"] = trajectory["observation"]["state"][:, :6] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, 6:7] - - # dataset does not store gripper actions, so use gripper state info, invert so +1 = open, 0 = close - trajectory["action"] = tf.concat( - ( - trajectory["action"], - invert_gripper_actions(trajectory["observation"]["gripper_state"]), - ), - axis=-1, - ) - return trajectory - - -def cmu_playing_with_food_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - import tensorflow_graphics.geometry.transformation as tft - - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :3], - tft.euler.from_quaternion(trajectory["action"][:, 3:7]), - trajectory["action"][:, -1:], - ), - axis=-1, - ) - return trajectory - - -def playfusion_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :3], - trajectory["action"][:, -4:], - ), - axis=-1, - ) - return trajectory - - -def cmu_stretch_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["eef_state"] = tf.concat( - ( - trajectory["observation"]["state"][:, :3], - tf.zeros_like(trajectory["observation"]["state"][:, :3]), - ), - axis=-1, - ) - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:] - trajectory["action"] = trajectory["action"][..., :-1] - return trajectory - - -def gnm_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["state"] = tf.concat( - ( - trajectory["observation"]["position"], - tf.zeros_like(trajectory["observation"]["state"][:, :3]), - trajectory["observation"]["yaw"], - ), - axis=-1, - ) - trajectory["action"] = tf.concat( - ( - trajectory["action"], - tf.zeros_like(trajectory["action"]), - tf.zeros_like(trajectory["action"]), - tf.zeros_like(trajectory["action"][:, :1]), - ), - axis=-1, - ) - return trajectory - - -def fmb_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # every input feature is batched, ie has leading batch dimension - trajectory["observation"]["proprio"] = tf.concat( - ( - trajectory["observation"]["eef_pose"], - trajectory["observation"]["state_gripper_pose"][..., None], - ), - axis=-1, - ) - return trajectory - - -def dobbe_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # every input feature is batched, ie has leading batch dimension - trajectory["observation"]["proprio"] = trajectory["observation"]["state"] - return trajectory - - -def 
robo_set_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # gripper action is in -1...1 --> clip to 0...1, flip - gripper_action = trajectory["action"][:, -1:] - gripper_action = invert_gripper_actions(tf.clip_by_value(gripper_action, 0, 1)) - - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :7], - gripper_action, - ), - axis=-1, - ) - return trajectory - - -def identity_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - return trajectory - - -# === Registry === -OPENX_STANDARDIZATION_TRANSFORMS = { - "bridge_openx": bridge_openx_dataset_transform, - "bridge_orig": bridge_orig_dataset_transform, - "bridge_dataset": bridge_orig_dataset_transform, - "ppgm": ppgm_dataset_transform, - "ppgm_static": ppgm_dataset_transform, - "ppgm_wrist": ppgm_dataset_transform, - "fractal20220817_data": rt1_dataset_transform, - "kuka": kuka_dataset_transform, - "taco_play": taco_play_dataset_transform, - "jaco_play": jaco_play_dataset_transform, - "berkeley_cable_routing": berkeley_cable_routing_dataset_transform, - "roboturk": roboturk_dataset_transform, - "nyu_door_opening_surprising_effectiveness": nyu_door_opening_dataset_transform, - "viola": viola_dataset_transform, - "berkeley_autolab_ur5": berkeley_autolab_ur5_dataset_transform, - "toto": toto_dataset_transform, - "language_table": language_table_dataset_transform, - "columbia_cairlab_pusht_real": pusht_dataset_transform, - "stanford_kuka_multimodal_dataset_converted_externally_to_rlds": stanford_kuka_multimodal_dataset_transform, - "nyu_rot_dataset_converted_externally_to_rlds": nyu_rot_dataset_transform, - "stanford_hydra_dataset_converted_externally_to_rlds": stanford_hydra_dataset_transform, - "austin_buds_dataset_converted_externally_to_rlds": austin_buds_dataset_transform, - "nyu_franka_play_dataset_converted_externally_to_rlds": nyu_franka_play_dataset_transform, - "maniskill_dataset_converted_externally_to_rlds": maniskill_dataset_transform, - "furniture_bench_dataset_converted_externally_to_rlds": furniture_bench_dataset_transform, - "cmu_franka_exploration_dataset_converted_externally_to_rlds": cmu_franka_exploration_dataset_transform, - "ucsd_kitchen_dataset_converted_externally_to_rlds": ucsd_kitchen_dataset_transform, - "ucsd_pick_and_place_dataset_converted_externally_to_rlds": ucsd_pick_place_dataset_transform, - "austin_sailor_dataset_converted_externally_to_rlds": austin_sailor_dataset_transform, - "austin_sirius_dataset_converted_externally_to_rlds": austin_sirius_dataset_transform, - "bc_z": bc_z_dataset_transform, - "utokyo_pr2_opening_fridge_converted_externally_to_rlds": tokyo_pr2_opening_fridge_dataset_transform, - "utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": tokyo_pr2_tabletop_manipulation_dataset_transform, - "utokyo_xarm_pick_and_place_converted_externally_to_rlds": identity_transform, - "utokyo_xarm_bimanual_converted_externally_to_rlds": utokyo_xarm_bimanual_dataset_transform, - "robo_net": robo_net_dataset_transform, - "berkeley_mvp_converted_externally_to_rlds": berkeley_mvp_dataset_transform, - "berkeley_rpt_converted_externally_to_rlds": berkeley_rpt_dataset_transform, - "kaist_nonprehensile_converted_externally_to_rlds": kaist_nonprehensible_dataset_transform, - "stanford_mask_vit_converted_externally_to_rlds": stanford_mask_vit_dataset_transform, - "tokyo_u_lsmo_converted_externally_to_rlds": tokyo_lsmo_dataset_transform, - "dlr_sara_pour_converted_externally_to_rlds": identity_transform, - "dlr_sara_grid_clamp_converted_externally_to_rlds": 
dlr_sara_grid_clamp_dataset_transform, - "dlr_edan_shared_control_converted_externally_to_rlds": dlr_edan_shared_control_dataset_transform, - "asu_table_top_converted_externally_to_rlds": asu_table_top_dataset_transform, - "stanford_robocook_converted_externally_to_rlds": robocook_dataset_transform, - "imperialcollege_sawyer_wrist_cam": imperial_wristcam_dataset_transform, - "iamlab_cmu_pickup_insert_converted_externally_to_rlds": iamlab_pick_insert_dataset_transform, - "uiuc_d3field": uiuc_d3field_dataset_transform, - "utaustin_mutex": utaustin_mutex_dataset_transform, - "berkeley_fanuc_manipulation": berkeley_fanuc_dataset_transform, - "cmu_playing_with_food": cmu_playing_with_food_dataset_transform, - "cmu_play_fusion": playfusion_dataset_transform, - "cmu_stretch": cmu_stretch_dataset_transform, - "berkeley_gnm_recon": gnm_dataset_transform, - "berkeley_gnm_cory_hall": gnm_dataset_transform, - "berkeley_gnm_sac_son": gnm_dataset_transform, - "droid": droid_baseact_transform_fn(), - "droid_100": droid_baseact_transform_fn(), # first 100 episodes of droid - "fmb": fmb_transform, - "dobbe": dobbe_dataset_transform, - "robo_set": robo_set_dataset_transform, - "usc_cloth_sim_converted_externally_to_rlds": identity_transform, - "plex_robosuite": identity_transform, - "conq_hose_manipulation": identity_transform, - "io_ai_tech": identity_transform, - "spoc": identity_transform, -} diff --git a/lerobot/common/datasets/push_dataset_to_hub/openx_rlds_format.py b/lerobot/common/datasets/push_dataset_to_hub/openx_rlds_format.py index cfe115034..1f8a5d144 100644 --- a/lerobot/common/datasets/push_dataset_to_hub/openx_rlds_format.py +++ b/lerobot/common/datasets/push_dataset_to_hub/openx_rlds_format.py @@ -14,13 +14,16 @@ # See the License for the specific language governing permissions and # limitations under the License. """ +For all datasets in the RLDS format. For https://github.com/google-deepmind/open_x_embodiment (OPENX) datasets. +NOTE: You need to install tensorflow and tensorflow_datsets before running this script. 
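A quick sanity check before converting: confirm the raw RLDS build opens with tensorflow_datasets and inspect its observation features (a minimal sketch, not part of this patch; the dataset path below is a placeholder):

    import tensorflow_datasets as tfds

    # Point the builder at a locally materialized RLDS directory, e.g. a
    # bridge_dataset build as in the example below, and list the observation
    # features; the converter keeps rank-3 uint8 features as images and
    # low-dimensional features as vector states.
    builder = tfds.builder_from_directory("/path/to/data/bridge_dataset/1.0.0")
    print(builder.info.features["steps"]["observation"])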
+ Example: python lerobot/scripts/push_dataset_to_hub.py \ - --raw-dir /hdd/tensorflow_datasets/bridge_dataset/1.0.0/ \ - --repo-id youliangtan/sampled_bridge_data_v2 \ - --raw-format openx_rlds.bridge_orig \ + --raw-dir /path/to/data/bridge_dataset/1.0.0/ \ + --repo-id your_hub/sampled_bridge_data_v2 \ + --raw-format rlds \ --episodes 3 4 5 8 9 Exact dataset fps defined in openx/config.py, obtained from: @@ -35,12 +38,10 @@ import tensorflow_datasets as tfds import torch import tqdm -import yaml from datasets import Dataset, Features, Image, Sequence, Value from PIL import Image as PILImage from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION -from lerobot.common.datasets.push_dataset_to_hub.openx.transforms import OPENX_STANDARDIZATION_TRANSFORMS from lerobot.common.datasets.push_dataset_to_hub.utils import ( calculate_episode_data_index, concatenate_episodes, @@ -52,11 +53,6 @@ ) from lerobot.common.datasets.video_utils import VideoFrame, encode_video_frames -with open("lerobot/common/datasets/push_dataset_to_hub/openx/configs.yaml") as f: - _openx_list = yaml.safe_load(f) - -OPENX_DATASET_CONFIGS = _openx_list["OPENX_DATASET_CONFIGS"] - np.set_printoptions(precision=2) @@ -108,7 +104,6 @@ def load_from_raw( video: bool, episodes: list[int] | None = None, encoding: dict | None = None, - openx_dataset_name: str | None = None, ): """ Args: @@ -136,16 +131,17 @@ def load_from_raw( # we will apply the standardization transform if the dataset_name is provided # if the dataset name is not provided and the goal is to convert any rlds formatted dataset # search for 'image' keys in the observations - if openx_dataset_name is not None: - print(" - applying standardization transform for dataset: ", openx_dataset_name) - assert openx_dataset_name in OPENX_STANDARDIZATION_TRANSFORMS - transform_fn = OPENX_STANDARDIZATION_TRANSFORMS[openx_dataset_name] - dataset = dataset.map(transform_fn) - - image_keys = OPENX_DATASET_CONFIGS[openx_dataset_name]["image_obs_keys"] - else: - obs_keys = dataset_info.features["steps"]["observation"].keys() - image_keys = [key for key in obs_keys if "image" in key] + image_keys = [] + state_keys = [] + observation_info = dataset_info.features["steps"]["observation"] + for key in observation_info: + # check whether the key is for an image or a vector observation + if len(observation_info[key].shape) == 3: + # only adding uint8 images discards depth images + if observation_info[key].dtype == tf.uint8: + image_keys.append(key) + else: + state_keys.append(key) lang_key = "language_instruction" if "language_instruction" in dataset.element_spec else None @@ -193,50 +189,31 @@ def load_from_raw( num_frames = episode["action"].shape[0] - ########################################################### - # Handle the episodic data - - # last step of demonstration is considered done - done = torch.zeros(num_frames, dtype=torch.bool) - done[-1] = True ep_dict = {} - langs = [] # TODO: might be located in "observation" - - image_array_dict = {key: [] for key in image_keys} + for key in state_keys: + ep_dict[f"observation.{key}"] = tf_to_torch(episode["observation"][key]) - # We will create the state observation tensor by stacking the state - # obs keys defined in the openx/configs.py - if openx_dataset_name is not None: - state_obs_keys = OPENX_DATASET_CONFIGS[openx_dataset_name]["state_obs_keys"] - # stack the state observations, if is None, pad with zeros - states = [] - for key in state_obs_keys: - if key in episode["observation"]: - 
states.append(tf_to_torch(episode["observation"][key])) - else: - states.append(torch.zeros(num_frames, 1)) # pad with zeros - states = torch.cat(states, dim=1) - # assert states.shape == (num_frames, 8), f"states shape: {states.shape}" - else: - states = tf_to_torch(episode["observation"]["state"]) - - actions = tf_to_torch(episode["action"]) - rewards = tf_to_torch(episode["reward"]).float() + ep_dict["action"] = tf_to_torch(episode["action"]) + ep_dict["next.reward"] = tf_to_torch(episode["reward"]).float() + ep_dict["next.done"] = tf_to_torch(episode["is_last"]) + ep_dict["is_terminal"] = tf_to_torch(episode["is_terminal"]) + ep_dict["is_first"] = tf_to_torch(episode["is_first"]) + ep_dict["discount"] = tf_to_torch(episode["discount"]) # If lang_key is present, convert the entire tensor at once if lang_key is not None: - langs = [str(x) for x in episode[lang_key]] + ep_dict["language_instruction"] = [x.numpy().decode("utf-8") for x in episode[lang_key]] + + ep_dict["timestamp"] = torch.arange(0, num_frames, 1) / fps + ep_dict["episode_index"] = torch.tensor([ep_idx] * num_frames) + ep_dict["frame_index"] = torch.arange(0, num_frames, 1) + + image_array_dict = {key: [] for key in image_keys} for im_key in image_keys: imgs = episode["observation"][im_key] image_array_dict[im_key] = [tf_img_convert(img) for img in imgs] - # simple assertions - for item in [states, actions, rewards, done]: - assert len(item) == num_frames - - ########################################################### - # loop through all cameras for im_key in image_keys: img_key = f"observation.images.{im_key}" @@ -262,17 +239,6 @@ def load_from_raw( else: ep_dict[img_key] = [PILImage.fromarray(x) for x in imgs_array] - if lang_key is not None: - ep_dict["language_instruction"] = langs - - ep_dict["observation.state"] = states - ep_dict["action"] = actions - ep_dict["timestamp"] = torch.arange(0, num_frames, 1) / fps - ep_dict["episode_index"] = torch.tensor([ep_idx] * num_frames) - ep_dict["frame_index"] = torch.arange(0, num_frames, 1) - ep_dict["next.reward"] = rewards - ep_dict["next.done"] = done - path_ep_dict = tmp_ep_dicts_dir.joinpath( "ep_dict_" + "0" * (10 - len(str(ep_idx))) + str(ep_idx) + ".pt" ) @@ -290,30 +256,28 @@ def load_from_raw( def to_hf_dataset(data_dict, video) -> Dataset: features = {} - keys = [key for key in data_dict if "observation.images." in key] - for key in keys: - if video: - features[key] = VideoFrame() - else: - features[key] = Image() + for key in data_dict: + # check if vector state obs + if key.startswith("observation.") and "observation.images." not in key: + features[key] = Sequence(length=data_dict[key].shape[1], feature=Value(dtype="float32", id=None)) + # check if image obs + elif "observation.images." 
in key: + if video: + features[key] = VideoFrame() + else: + features[key] = Image() - features["observation.state"] = Sequence( - length=data_dict["observation.state"].shape[1], feature=Value(dtype="float32", id=None) - ) - if "observation.velocity" in data_dict: - features["observation.velocity"] = Sequence( - length=data_dict["observation.velocity"].shape[1], feature=Value(dtype="float32", id=None) - ) - if "observation.effort" in data_dict: - features["observation.effort"] = Sequence( - length=data_dict["observation.effort"].shape[1], feature=Value(dtype="float32", id=None) - ) if "language_instruction" in data_dict: features["language_instruction"] = Value(dtype="string", id=None) features["action"] = Sequence( length=data_dict["action"].shape[1], feature=Value(dtype="float32", id=None) ) + + features["is_terminal"] = Value(dtype="bool", id=None) + features["is_first"] = Value(dtype="bool", id=None) + features["discount"] = Value(dtype="float32", id=None) + features["episode_index"] = Value(dtype="int64", id=None) features["frame_index"] = Value(dtype="int64", id=None) features["timestamp"] = Value(dtype="float32", id=None) @@ -333,19 +297,8 @@ def from_raw_to_lerobot_format( video: bool = True, episodes: list[int] | None = None, encoding: dict | None = None, - openx_dataset_name: str | None = None, ): - """This is a test impl for rlds conversion""" - if openx_dataset_name is None: - # set a default rlds frame rate if the dataset is not from openx - fps = 30 - elif "fps" not in OPENX_DATASET_CONFIGS[openx_dataset_name]: - raise ValueError( - "fps for this dataset is not specified in openx/configs.py yet," "means it is not yet tested" - ) - fps = OPENX_DATASET_CONFIGS[openx_dataset_name]["fps"] - - data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes, encoding, openx_dataset_name) + data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes, encoding) hf_dataset = to_hf_dataset(data_dict, video) episode_data_index = calculate_episode_data_index(hf_dataset) info = { diff --git a/lerobot/scripts/push_dataset_to_hub.py b/lerobot/scripts/push_dataset_to_hub.py index 2bb641a4d..0233ede69 100644 --- a/lerobot/scripts/push_dataset_to_hub.py +++ b/lerobot/scripts/push_dataset_to_hub.py @@ -66,7 +66,7 @@ def get_from_raw_to_lerobot_format_fn(raw_format: str): from lerobot.common.datasets.push_dataset_to_hub.umi_zarr_format import from_raw_to_lerobot_format elif raw_format == "aloha_hdf5": from lerobot.common.datasets.push_dataset_to_hub.aloha_hdf5_format import from_raw_to_lerobot_format - elif "openx_rlds" in raw_format: + elif raw_format in ["rlds", "openx"]: from lerobot.common.datasets.push_dataset_to_hub.openx_rlds_format import from_raw_to_lerobot_format elif raw_format == "dora_parquet": from lerobot.common.datasets.push_dataset_to_hub.dora_parquet_format import from_raw_to_lerobot_format @@ -204,24 +204,14 @@ def push_dataset_to_hub( # convert dataset from original raw format to LeRobot format from_raw_to_lerobot_format = get_from_raw_to_lerobot_format_fn(raw_format) - fmt_kwgs = { - "raw_dir": raw_dir, - "videos_dir": videos_dir, - "fps": fps, - "video": video, - "episodes": episodes, - "encoding": encoding, - } - - if "openx_rlds." in raw_format: - # Support for official OXE dataset name inside `raw_format`. 
- # For instance, `raw_format="oxe_rlds"` uses the default formating (TODO what does that mean?), - # and `raw_format="oxe_rlds.bridge_orig"` uses the brdige_orig formating - _, openx_dataset_name = raw_format.split(".") - print(f"Converting dataset [{openx_dataset_name}] from 'openx_rlds' to LeRobot format.") - fmt_kwgs["openx_dataset_name"] = openx_dataset_name - - hf_dataset, episode_data_index, info = from_raw_to_lerobot_format(**fmt_kwgs) + hf_dataset, episode_data_index, info = from_raw_to_lerobot_format( + raw_dir, + videos_dir, + fps, + video, + episodes, + encoding, + ) lerobot_dataset = LeRobotDataset.from_preloaded( repo_id=repo_id, @@ -290,7 +280,7 @@ def main(): "--raw-format", type=str, required=True, - help="Dataset type (e.g. `pusht_zarr`, `umi_zarr`, `aloha_hdf5`, `xarm_pkl`, `dora_parquet`, `openx_rlds`).", + help="Dataset type (e.g. `pusht_zarr`, `umi_zarr`, `aloha_hdf5`, `xarm_pkl`, `dora_parquet`, `rlds`, `openx`).", ) parser.add_argument( "--repo-id", From 286bca37cc78e80d5853236b93f17cd0a25cf367 Mon Sep 17 00:00:00 2001 From: Remi Date: Tue, 3 Dec 2024 10:53:21 +0100 Subject: [PATCH 2/3] Fix missing local_files_only in record/replay (#540) Co-authored-by: Simon Alibert --- .github/workflows/test.yml | 70 ++++++++++++++++---------------- lerobot/scripts/control_robot.py | 18 ++++++-- tests/test_control_robot.py | 2 +- 3 files changed, 51 insertions(+), 39 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5de071750..53b37466a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -102,38 +102,38 @@ jobs: && rm -rf tests/outputs outputs # TODO(aliberts, rcadene): redesign after v2 migration / removing hydra - end-to-end: - name: End-to-end - runs-on: ubuntu-latest - env: - MUJOCO_GL: egl - steps: - - uses: actions/checkout@v4 - with: - lfs: true # Ensure LFS files are pulled - - - name: Install apt dependencies - # portaudio19-dev is needed to install pyaudio - run: | - sudo apt-get update && \ - sudo apt-get install -y libegl1-mesa-dev portaudio19-dev - - - name: Install poetry - run: | - pipx install poetry && poetry config virtualenvs.in-project true - echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH - - - name: Set up Python 3.10 - uses: actions/setup-python@v5 - with: - python-version: "3.10" - cache: "poetry" - - - name: Install poetry dependencies - run: | - poetry install --all-extras - - - name: Test end-to-end - run: | - make test-end-to-end \ - && rm -rf outputs + # end-to-end: + # name: End-to-end + # runs-on: ubuntu-latest + # env: + # MUJOCO_GL: egl + # steps: + # - uses: actions/checkout@v4 + # with: + # lfs: true # Ensure LFS files are pulled + + # - name: Install apt dependencies + # # portaudio19-dev is needed to install pyaudio + # run: | + # sudo apt-get update && \ + # sudo apt-get install -y libegl1-mesa-dev portaudio19-dev + + # - name: Install poetry + # run: | + # pipx install poetry && poetry config virtualenvs.in-project true + # echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH + + # - name: Set up Python 3.10 + # uses: actions/setup-python@v5 + # with: + # python-version: "3.10" + # cache: "poetry" + + # - name: Install poetry dependencies + # run: | + # poetry install --all-extras + + # - name: Test end-to-end + # run: | + # make test-end-to-end \ + # && rm -rf outputs diff --git a/lerobot/scripts/control_robot.py b/lerobot/scripts/control_robot.py index 563023f48..e91c6c232 100644 --- a/lerobot/scripts/control_robot.py +++ b/lerobot/scripts/control_robot.py @@ -341,7 
+341,7 @@ def replay( episode: int, fps: int | None = None, play_sounds: bool = True, - local_files_only: bool = True, + local_files_only: bool = False, ): # TODO(rcadene, aliberts): refactor with control_loop, once `dataset` is an instance of LeRobotDataset # TODO(rcadene): Add option to record logs @@ -424,7 +424,7 @@ def replay( "--root", type=Path, default=None, - help="Root directory where the dataset will be stored locally at '{root}/{repo_id}' (e.g. 'data/hf_username/dataset_name').", + help="Root directory where the dataset will be stored (e.g. 'dataset/path').", ) parser_record.add_argument( "--repo-id", @@ -432,6 +432,12 @@ def replay( default="lerobot/test", help="Dataset identifier. By convention it should match '{hf_username}/{dataset_name}' (e.g. `lerobot/test`).", ) + parser_record.add_argument( + "--local-files-only", + type=int, + default=0, + help="Use local files only. By default, this script will try to fetch the dataset from the hub if it exists.", + ) parser_record.add_argument( "--warmup-time-s", type=int, @@ -520,7 +526,7 @@ def replay( "--root", type=Path, default=None, - help="Root directory where the dataset will be stored locally at '{root}/{repo_id}' (e.g. 'data/hf_username/dataset_name').", + help="Root directory where the dataset will be stored (e.g. 'dataset/path').", ) parser_replay.add_argument( "--repo-id", @@ -528,6 +534,12 @@ def replay( default="lerobot/test", help="Dataset identifier. By convention it should match '{hf_username}/{dataset_name}' (e.g. `lerobot/test`).", ) + parser_replay.add_argument( + "--local-files-only", + type=int, + default=0, + help="Use local files only. By default, this script will try to fetch the dataset from the hub if it exists.", + ) parser_replay.add_argument("--episode", type=int, default=0, help="Index of the episode to replay.") args = parser.parse_args() diff --git a/tests/test_control_robot.py b/tests/test_control_robot.py index 0ba737a8f..8df108946 100644 --- a/tests/test_control_robot.py +++ b/tests/test_control_robot.py @@ -158,7 +158,7 @@ def test_record_and_replay_and_policy(tmpdir, request, robot_type, mock): assert dataset.meta.total_episodes == 2 assert len(dataset) == 2 - replay(robot, episode=0, fps=1, root=root, repo_id=repo_id, play_sounds=False) + replay(robot, episode=0, fps=1, root=root, repo_id=repo_id, play_sounds=False, local_files_only=True) # TODO(rcadene, aliberts): rethink this design if robot_type == "aloha": From 8e7d6970eaf5a64b8af6ec45586d201b8ca9ef16 Mon Sep 17 00:00:00 2001 From: Michel Aractingi Date: Tue, 3 Dec 2024 12:20:05 +0100 Subject: [PATCH 3/3] Control simulated robot with real leader (#514) Co-authored-by: Remi --- lerobot/scripts/control_robot.py | 4 +- lerobot/scripts/control_sim_robot.py | 546 +++++++++++++++++++++++++++ 2 files changed, 548 insertions(+), 2 deletions(-) create mode 100644 lerobot/scripts/control_sim_robot.py diff --git a/lerobot/scripts/control_robot.py b/lerobot/scripts/control_robot.py index e91c6c232..12eaf146f 100644 --- a/lerobot/scripts/control_robot.py +++ b/lerobot/scripts/control_robot.py @@ -68,8 +68,8 @@ - Tap escape key 'esc' to stop the data recording. This might require a sudo permission to allow your terminal to monitor keyboard events. -**NOTE**: You can resume/continue data recording by running the same data recording command twice. -To avoid resuming by deleting the dataset, use `--force-override 1`. +**NOTE**: You can resume/continue data recording by running the same data recording command and adding `--resume 1`. 
+If the dataset you want to extend is not on the hub, you also need to add `--local-files-only 1`.
 
 - Train on this dataset with the ACT policy:
 ```bash
diff --git a/lerobot/scripts/control_sim_robot.py b/lerobot/scripts/control_sim_robot.py
new file mode 100644
index 000000000..85dfca64a
--- /dev/null
+++ b/lerobot/scripts/control_sim_robot.py
@@ -0,0 +1,546 @@
+"""
+Utilities to control a robot in simulation.
+
+Useful to record a dataset, replay a recorded episode and record an evaluation dataset.
+
+Examples of usage:
+
+
+- Unlimited teleoperation at a limited frequency of 30 Hz, to simulate data recording frequency.
+  You can modify this value depending on how fast your simulation can run:
+```bash
+python lerobot/scripts/control_sim_robot.py teleoperate \
+    --fps 30 \
+    --robot-path lerobot/configs/robot/your_robot_config.yaml \
+    --sim-config lerobot/configs/env/your_sim_config.yaml
+```
+
+- Record one episode in order to test replay:
+```bash
+python lerobot/scripts/control_sim_robot.py record \
+    --robot-path lerobot/configs/robot/your_robot_config.yaml \
+    --sim-config lerobot/configs/env/your_sim_config.yaml \
+    --fps 30 \
+    --repo-id $USER/robot_sim_test \
+    --num-episodes 1 \
+    --run-compute-stats 0
+```
+
+Add `--push-to-hub 1` to push the recorded dataset to the Hugging Face hub.
+
+- Visualize dataset:
+```bash
+python lerobot/scripts/visualize_dataset.py \
+    --repo-id $USER/robot_sim_test \
+    --episode-index 0
+```
+
+- Replay a sequence of test episodes:
+```bash
+python lerobot/scripts/control_sim_robot.py replay \
+    --robot-path lerobot/configs/robot/your_robot_config.yaml \
+    --sim-config lerobot/configs/env/your_sim_config.yaml \
+    --fps 30 \
+    --repo-id $USER/robot_sim_test \
+    --episode 0
+```
+Note: The seed is saved, so during replay we can load the same environment state as during collection.
+
+- Record a full dataset in order to train a policy, with 30 seconds of recording for each episode:
+```bash
+python lerobot/scripts/control_sim_robot.py record \
+    --robot-path lerobot/configs/robot/your_robot_config.yaml \
+    --sim-config lerobot/configs/env/your_sim_config.yaml \
+    --fps 30 \
+    --repo-id $USER/robot_sim_test \
+    --num-episodes 50 \
+    --episode-time-s 30
+```
+
+**NOTE**: You can use your keyboard to control data recording flow.
+- Tap right arrow key '->' to early exit while recording an episode and go to resetting the environment.
+- Tap right arrow key '->' to early exit while resetting the environment and go to recording the next episode.
+- Tap left arrow key '<-' to early exit and re-record the current episode.
+- Tap escape key 'esc' to stop the data recording.
+This might require a sudo permission to allow your terminal to monitor keyboard events.
+
+**NOTE**: You can resume/continue data recording by running the same data recording command and adding `--resume 1`.
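+
+For instance, a minimal sketch of resuming an interrupted recording (this assumes a dataset with this `--repo-id`
+already exists, and relies on the `--resume` and `--task` flags defined in the argument parser below):
+```bash
+python lerobot/scripts/control_sim_robot.py record \
+    --robot-path lerobot/configs/robot/your_robot_config.yaml \
+    --sim-config lerobot/configs/env/your_sim_config.yaml \
+    --fps 30 \
+    --repo-id $USER/robot_sim_test \
+    --task "your task description" \
+    --resume 1
+```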
+""" + +import argparse +import importlib +import logging +import time +from pathlib import Path + +import cv2 +import gymnasium as gym +import numpy as np +import torch + +from lerobot.common.datasets.lerobot_dataset import LeRobotDataset +from lerobot.common.robot_devices.control_utils import ( + init_keyboard_listener, + init_policy, + is_headless, + log_control_info, + predict_action, + sanity_check_dataset_name, + sanity_check_dataset_robot_compatibility, + stop_recording, +) +from lerobot.common.robot_devices.robots.factory import make_robot +from lerobot.common.robot_devices.robots.utils import Robot +from lerobot.common.robot_devices.utils import busy_wait +from lerobot.common.utils.utils import init_hydra_config, init_logging, log_say + +DEFAULT_FEATURES = { + "next.reward": { + "dtype": "float32", + "shape": (1,), + "names": None, + }, + "next.success": { + "dtype": "bool", + "shape": (1,), + "names": None, + }, + "seed": { + "dtype": "int64", + "shape": (1,), + "names": None, + }, + "timestamp": { + "dtype": "float32", + "shape": (1,), + "names": None, + }, +} + + +######################################################################################## +# Utilities +######################################################################################## +def none_or_int(value): + if value == "None": + return None + return int(value) + + +def init_sim_calibration(robot, cfg): + # Constants necessary for transforming the joint pos of the real robot to the sim + # depending on the robot discription used in that sim. + start_pos = np.array(robot.leader_arms.main.calibration["start_pos"]) + axis_directions = np.array(cfg.get("axis_directions", [1])) + offsets = np.array(cfg.get("offsets", [0])) * np.pi + + return {"start_pos": start_pos, "axis_directions": axis_directions, "offsets": offsets} + + +def real_positions_to_sim(real_positions, axis_directions, start_pos, offsets): + """Counts - starting position -> radians -> align axes -> offset""" + return axis_directions * (real_positions - start_pos) * 2.0 * np.pi / 4096 + offsets + + +######################################################################################## +# Control modes +######################################################################################## + + +def teleoperate(env, robot: Robot, process_action_fn, teleop_time_s=None): + env = env() + env.reset() + start_teleop_t = time.perf_counter() + while True: + leader_pos = robot.leader_arms.main.read("Present_Position") + action = process_action_fn(leader_pos) + env.step(np.expand_dims(action, 0)) + if teleop_time_s is not None and time.perf_counter() - start_teleop_t > teleop_time_s: + print("Teleoperation processes finished.") + break + + +def record( + env, + robot: Robot, + process_action_from_leader, + root: Path, + repo_id: str, + task: str, + fps: int | None = None, + tags: list[str] | None = None, + pretrained_policy_name_or_path: str = None, + policy_overrides: bool | None = None, + episode_time_s: int = 30, + num_episodes: int = 50, + video: bool = True, + push_to_hub: bool = True, + num_image_writer_processes: int = 0, + num_image_writer_threads_per_camera: int = 4, + display_cameras: bool = False, + play_sounds: bool = True, + resume: bool = False, + local_files_only: bool = False, + run_compute_stats: bool = True, +) -> LeRobotDataset: + # Load pretrained policy + policy = None + if pretrained_policy_name_or_path is not None: + policy, policy_fps, device, use_amp = init_policy(pretrained_policy_name_or_path, policy_overrides) + + if fps is 
None: + fps = policy_fps + logging.warning(f"No fps provided, so using the fps from policy config ({policy_fps}).") + + if policy is None and process_action_from_leader is None: + raise ValueError("Either policy or process_action_fn has to be set to enable control in sim.") + + # initialize listener before sim env + listener, events = init_keyboard_listener() + + # create sim env + env = env() + + # Create empty dataset or load existing saved episodes + num_cameras = sum([1 if "image" in key else 0 for key in env.observation_space]) + + # get image keys + image_keys = [key for key in env.observation_space if "image" in key] + state_keys_dict = env_cfg.state_keys + + if resume: + dataset = LeRobotDataset( + repo_id, + root=root, + local_files_only=local_files_only, + ) + dataset.start_image_writer( + num_processes=num_image_writer_processes, + num_threads=num_image_writer_threads_per_camera * num_cameras, + ) + sanity_check_dataset_robot_compatibility(dataset, robot, fps, video) + else: + features = DEFAULT_FEATURES + # add image keys to features + for key in image_keys: + shape = env.observation_space[key].shape + if not key.startswith("observation.image."): + key = "observation.image." + key + features[key] = {"dtype": "video", "names": ["channel", "height", "width"], "shape": shape} + + for key, obs_key in state_keys_dict.items(): + features[key] = { + "dtype": "float32", + "names": None, + "shape": env.observation_space[obs_key].shape, + } + + features["action"] = {"dtype": "float32", "shape": env.action_space.shape, "names": None} + + # Create empty dataset or load existing saved episodes + sanity_check_dataset_name(repo_id, policy) + dataset = LeRobotDataset.create( + repo_id, + fps, + root=root, + features=features, + use_videos=video, + image_writer_processes=num_image_writer_processes, + image_writer_threads=num_image_writer_threads_per_camera * num_cameras, + ) + + recorded_episodes = 0 + while True: + log_say(f"Recording episode {dataset.num_episodes}", play_sounds) + + if events is None: + events = {"exit_early": False} + + if episode_time_s is None: + episode_time_s = float("inf") + + timestamp = 0 + start_episode_t = time.perf_counter() + + seed = np.random.randint(0, 1e5) + observation, info = env.reset(seed=seed) + + while timestamp < episode_time_s: + start_loop_t = time.perf_counter() + + if policy is not None: + action = predict_action(observation, policy, device, use_amp) + else: + leader_pos = robot.leader_arms.main.read("Present_Position") + action = process_action_from_leader(leader_pos) + + observation, reward, terminated, _, info = env.step(action) + + success = info.get("is_success", False) + env_timestamp = info.get("timestamp", dataset.episode_buffer["size"] / fps) + + frame = { + "action": torch.from_numpy(action), + "next.reward": reward, + "next.success": success, + "seed": seed, + "timestamp": env_timestamp, + } + + for key in image_keys: + if not key.startswith("observation.image"): + frame["observation.image." 
+ key] = observation[key] + else: + frame[key] = observation[key] + + for key, obs_key in state_keys_dict.items(): + frame[key] = torch.from_numpy(observation[obs_key]) + + dataset.add_frame(frame) + + if display_cameras and not is_headless(): + for key in image_keys: + cv2.imshow(key, cv2.cvtColor(observation[key], cv2.COLOR_RGB2BGR)) + cv2.waitKey(1) + + if fps is not None: + dt_s = time.perf_counter() - start_loop_t + busy_wait(1 / fps - dt_s) + + dt_s = time.perf_counter() - start_loop_t + log_control_info(robot, dt_s, fps=fps) + + timestamp = time.perf_counter() - start_episode_t + if events["exit_early"] or terminated: + events["exit_early"] = False + break + + if events["rerecord_episode"]: + log_say("Re-record episode", play_sounds) + events["rerecord_episode"] = False + events["exit_early"] = False + dataset.clear_episode_buffer() + continue + + dataset.save_episode(task=task) + recorded_episodes += 1 + + if events["stop_recording"] or recorded_episodes >= num_episodes: + break + else: + logging.info("Waiting for a few seconds before starting next episode recording...") + busy_wait(3) + + log_say("Stop recording", play_sounds, blocking=True) + stop_recording(robot, listener, display_cameras) + + if run_compute_stats: + logging.info("Computing dataset statistics") + dataset.consolidate(run_compute_stats) + + if push_to_hub: + dataset.push_to_hub(tags=tags) + + log_say("Exiting", play_sounds) + return dataset + + +def replay( + env, root: Path, repo_id: str, episode: int, fps: int | None = None, local_files_only: bool = True +): + env = env() + + local_dir = Path(root) / repo_id + if not local_dir.exists(): + raise ValueError(local_dir) + + dataset = LeRobotDataset(repo_id, root=root, local_files_only=local_files_only) + items = dataset.hf_dataset.select_columns("action") + seeds = dataset.hf_dataset.select_columns("seed")["seed"] + + from_idx = dataset.episode_data_index["from"][episode].item() + to_idx = dataset.episode_data_index["to"][episode].item() + env.reset(seed=seeds[from_idx].item()) + logging.info("Replaying episode") + log_say("Replaying episode", play_sounds=True) + for idx in range(from_idx, to_idx): + start_episode_t = time.perf_counter() + action = items[idx]["action"] + env.step(action.unsqueeze(0).numpy()) + dt_s = time.perf_counter() - start_episode_t + busy_wait(1 / fps - dt_s) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers(dest="mode", required=True) + + # Set common options for all the subparsers + base_parser = argparse.ArgumentParser(add_help=False) + base_parser.add_argument( + "--robot-path", + type=str, + default="lerobot/configs/robot/koch.yaml", + help="Path to robot yaml file used to instantiate the robot using `make_robot` factory function.", + ) + + base_parser.add_argument( + "--sim-config", + help="Path to a yaml config you want to use for initializing a sim environment based on gym ", + ) + + parser_record = subparsers.add_parser("teleoperate", parents=[base_parser]) + + parser_record = subparsers.add_parser("record", parents=[base_parser]) + parser_record.add_argument( + "--fps", type=none_or_int, default=None, help="Frames per second (set to None to disable)" + ) + parser_record.add_argument( + "--root", + type=Path, + default=None, + help="Root directory where the dataset will be stored locally at '{root}/{repo_id}' (e.g. 'data/hf_username/dataset_name').", + ) + parser_record.add_argument( + "--repo-id", + type=str, + default="lerobot/test", + help="Dataset identifier. 
By convention it should match '{hf_username}/{dataset_name}' (e.g. `lerobot/test`).",
+    )
+    parser_record.add_argument(
+        "--episode-time-s",
+        type=int,
+        default=60,
+        help="Number of seconds for data recording for each episode.",
+    )
+    parser_record.add_argument(
+        "--task",
+        type=str,
+        required=True,
+        help="A description of the task performed during recording that can be used as a language instruction.",
+    )
+    parser_record.add_argument("--num-episodes", type=int, default=50, help="Number of episodes to record.")
+    parser_record.add_argument(
+        "--run-compute-stats",
+        type=int,
+        default=1,
+        help="By default, run the computation of the data statistics at the end of data collection. Compute intensive and not required to just replay an episode.",
+    )
+    parser_record.add_argument(
+        "--push-to-hub",
+        type=int,
+        default=1,
+        help="Upload dataset to Hugging Face hub.",
+    )
+    parser_record.add_argument(
+        "--tags",
+        type=str,
+        nargs="*",
+        help="Add tags to your dataset on the hub.",
+    )
+    parser_record.add_argument(
+        "--num-image-writer-processes",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses handling the saving of frames as PNGs. Set to 0 to use threads only; "
+            "set to ≥1 to use subprocesses, each using threads to write images. The best number of processes "
+            "and threads depends on your system. We recommend 4 threads per camera with 0 processes. "
+            "If fps is unstable, adjust the thread count. If still unstable, try using 1 or more subprocesses."
+        ),
+    )
+    parser_record.add_argument(
+        "--num-image-writer-threads-per-camera",
+        type=int,
+        default=4,
+        help=(
+            "Number of threads writing the frames as png images on disk, per camera. "
+            "Too many threads might cause unstable teleoperation fps due to the main thread being blocked. "
+            "Not enough threads might cause low camera fps."
+        ),
+    )
+    parser_record.add_argument(
+        "--display-cameras",
+        type=int,
+        default=0,
+        help="Visualize image observations with OpenCV.",
+    )
+    parser_record.add_argument(
+        "--resume",
+        type=int,
+        default=0,
+        help="Resume recording on an existing dataset.",
+    )
+    parser_replay = subparsers.add_parser("replay", parents=[base_parser])
+    parser_replay.add_argument(
+        "--fps", type=none_or_int, default=None, help="Frames per second (set to None to disable)"
+    )
+    parser_replay.add_argument(
+        "--root",
+        type=Path,
+        default=None,
+        help="Root directory where the dataset will be stored locally (e.g. 'data/hf_username/dataset_name'). By default, stored in cache folder.",
+    )
+    parser_replay.add_argument(
+        "--repo-id",
+        type=str,
+        default="lerobot/test",
+        help="Dataset identifier. By convention it should match '{hf_username}/{dataset_name}' (e.g. 
`lerobot/test`).",
+    )
+    parser_replay.add_argument("--episode", type=int, default=0, help="Index of the episode to replay.")
+
+    args = parser.parse_args()
+
+    init_logging()
+
+    control_mode = args.mode
+    robot_path = args.robot_path
+    env_config_path = args.sim_config
+    kwargs = vars(args)
+    del kwargs["mode"]
+    del kwargs["robot_path"]
+    del kwargs["sim_config"]
+
+    # make gym env
+    env_cfg = init_hydra_config(env_config_path)
+    importlib.import_module(f"gym_{env_cfg.env.name}")
+
+    def env_constructor():
+        return gym.make(env_cfg.env.handle, disable_env_checker=True, **env_cfg.env.gym)
+
+    robot = None
+    process_leader_actions_fn = None
+
+    if control_mode in ["teleoperate", "record"]:
+        # make robot
+        robot_overrides = ["~cameras", "~follower_arms"]
+        robot_cfg = init_hydra_config(robot_path, robot_overrides)
+        robot = make_robot(robot_cfg)
+        robot.connect()
+
+        calib_kwgs = init_sim_calibration(robot, env_cfg.calibration)
+
+        def process_leader_actions_fn(action):
+            return real_positions_to_sim(action, **calib_kwgs)
+
+        robot.leader_arms.main.calibration = None
+
+    if control_mode == "teleoperate":
+        teleoperate(env_constructor, robot, process_leader_actions_fn)
+
+    elif control_mode == "record":
+        record(env_constructor, robot, process_leader_actions_fn, **kwargs)
+
+    elif control_mode == "replay":
+        replay(env_constructor, **kwargs)
+
+    else:
+        raise ValueError(
+            f"Invalid control mode: '{control_mode}', only valid modes are teleoperate, record and replay."
+        )
+
+    if robot and robot.is_connected:
+        # Disconnect manually to avoid a "Core dump" during process
+        # termination due to camera threads not properly exiting.
+        robot.disconnect()
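
For reference, here is a minimal, self-contained sketch of the counts-to-radians mapping performed by the `real_positions_to_sim` helper added in this patch. The 4096-count encoder resolution comes from the formula in the patch; the joint values below are made up for illustration:

```python
import numpy as np


def real_positions_to_sim(real_positions, axis_directions, start_pos, offsets):
    # Same formula as in control_sim_robot.py:
    # counts - starting position -> radians -> align axes -> offset.
    return axis_directions * (real_positions - start_pos) * 2.0 * np.pi / 4096 + offsets


# One joint whose encoder reads 3072 counts, with a calibrated start position of 2048 counts:
# 1024 counts out of a 4096-count revolution is a quarter turn, i.e. pi/2 radians.
sim_pos = real_positions_to_sim(
    real_positions=np.array([3072.0]),
    axis_directions=np.array([1.0]),
    start_pos=np.array([2048.0]),
    offsets=np.array([0.0]),
)
assert np.isclose(sim_pos[0], np.pi / 2)
print(sim_pos)  # [1.57079633]
```

Setting an entry of `axis_directions` to -1 flips that joint and a nonzero `offsets` entry shifts it; `init_sim_calibration` builds these arrays from the sim config (offsets given in multiples of pi), so the same sketch extends to multi-joint arms.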