From a2c181992a131c92e1930f3ebc3014112fe03625 Mon Sep 17 00:00:00 2001 From: Michel Aractingi Date: Tue, 3 Dec 2024 00:51:55 +0100 Subject: [PATCH 1/3] Refactor OpenX (#505) --- .../push_dataset_to_hub/openx/configs.yaml | 639 ------------- .../push_dataset_to_hub/openx/data_utils.py | 106 --- .../push_dataset_to_hub/openx/droid_utils.py | 200 ---- .../push_dataset_to_hub/openx/transforms.py | 859 ------------------ .../push_dataset_to_hub/openx_rlds_format.py | 143 +-- lerobot/scripts/push_dataset_to_hub.py | 30 +- 6 files changed, 58 insertions(+), 1919 deletions(-) delete mode 100644 lerobot/common/datasets/push_dataset_to_hub/openx/configs.yaml delete mode 100644 lerobot/common/datasets/push_dataset_to_hub/openx/data_utils.py delete mode 100644 lerobot/common/datasets/push_dataset_to_hub/openx/droid_utils.py delete mode 100644 lerobot/common/datasets/push_dataset_to_hub/openx/transforms.py diff --git a/lerobot/common/datasets/push_dataset_to_hub/openx/configs.yaml b/lerobot/common/datasets/push_dataset_to_hub/openx/configs.yaml deleted file mode 100644 index f706270a2..000000000 --- a/lerobot/common/datasets/push_dataset_to_hub/openx/configs.yaml +++ /dev/null @@ -1,639 +0,0 @@ -OPENX_DATASET_CONFIGS: - fractal20220817_data: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - base_pose_tool_reached - - gripper_closed - fps: 3 - - kuka: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - clip_function_input/base_pose_tool_reached - - gripper_closed - fps: 10 - - bridge_openx: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - EEF_state - - gripper_state - fps: 5 - - taco_play: - image_obs_keys: - - rgb_static - - rgb_gripper - depth_obs_keys: - - depth_static - - depth_gripper - state_obs_keys: - - state_eef - - state_gripper - fps: 15 - - jaco_play: - image_obs_keys: - - image - - image_wrist - depth_obs_keys: - - null - state_obs_keys: - - state_eef - - state_gripper - fps: 10 - - berkeley_cable_routing: - image_obs_keys: - - image - - top_image - - wrist45_image - - wrist225_image - depth_obs_keys: - - null - state_obs_keys: - - robot_state - fps: 10 - - roboturk: - image_obs_keys: - - front_rgb - depth_obs_keys: - - null - state_obs_keys: - - null - fps: 10 - - nyu_door_opening_surprising_effectiveness: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - null - fps: 3 - - viola: - image_obs_keys: - - agentview_rgb - - eye_in_hand_rgb - depth_obs_keys: - - null - state_obs_keys: - - joint_states - - gripper_states - fps: 20 - - berkeley_autolab_ur5: - image_obs_keys: - - image - - hand_image - depth_obs_keys: - - image_with_depth - state_obs_keys: - - state - fps: 5 - - toto: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 30 - - language_table: - image_obs_keys: - - rgb - depth_obs_keys: - - null - state_obs_keys: - - effector_translation - fps: 10 - - columbia_cairlab_pusht_real: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - robot_state - fps: 10 - - stanford_kuka_multimodal_dataset_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - depth_image - state_obs_keys: - - ee_position - - ee_orientation - fps: 20 - - nyu_rot_dataset_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - eef_state - - gripper_state - fps: 3 - - io_ai_tech: - image_obs_keys: - - image - - image_fisheye - - image_left_side - - 
image_right_side - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 3 - - stanford_hydra_dataset_converted_externally_to_rlds: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - eef_state - - gripper_state - fps: 10 - - austin_buds_dataset_converted_externally_to_rlds: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 20 - - nyu_franka_play_dataset_converted_externally_to_rlds: - image_obs_keys: - - image - - image_additional_view - depth_obs_keys: - - depth - - depth_additional_view - state_obs_keys: - - eef_state - fps: 3 - - maniskill_dataset_converted_externally_to_rlds: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - depth - - wrist_depth - state_obs_keys: - - tcp_pose - - gripper_state - fps: 20 - - furniture_bench_dataset_converted_externally_to_rlds: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 10 - - cmu_franka_exploration_dataset_converted_externally_to_rlds: - image_obs_keys: - - highres_image - depth_obs_keys: - - null - state_obs_keys: - - null - fps: 10 - - ucsd_kitchen_dataset_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - joint_state - fps: 2 - - ucsd_pick_and_place_dataset_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - eef_state - - gripper_state - fps: 3 - - spoc: - image_obs_keys: - - image - - image_manipulation - depth_obs_keys: - - null - state_obs_keys: - - null - fps: 3 - - austin_sailor_dataset_converted_externally_to_rlds: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 20 - - austin_sirius_dataset_converted_externally_to_rlds: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 20 - - bc_z: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - present/xyz - - present/axis_angle - - present/sensed_close - fps: 10 - - utokyo_pr2_opening_fridge_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - eef_state - - gripper_state - fps: 10 - - utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - eef_state - - gripper_state - fps: 10 - - utokyo_xarm_pick_and_place_converted_externally_to_rlds: - image_obs_keys: - - image - - image2 - - hand_image - depth_obs_keys: - - null - state_obs_keys: - - end_effector_pose - fps: 10 - - utokyo_xarm_bimanual_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - pose_r - fps: 10 - - robo_net: - image_obs_keys: - - image - - image1 - depth_obs_keys: - - null - state_obs_keys: - - eef_state - - gripper_state - fps: 1 - - robo_set: - image_obs_keys: - - image_left - - image_right - - image_wrist - depth_obs_keys: - - null - state_obs_keys: - - state - - state_velocity - fps: 5 - - berkeley_mvp_converted_externally_to_rlds: - image_obs_keys: - - hand_image - depth_obs_keys: - - null - state_obs_keys: - - gripper - - pose - - joint_pos - fps: 5 - - berkeley_rpt_converted_externally_to_rlds: - image_obs_keys: - - hand_image - depth_obs_keys: - - null - state_obs_keys: - - joint_pos - - gripper - fps: 30 - - kaist_nonprehensile_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - 
state_obs_keys: - - state - fps: 10 - - stanford_mask_vit_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - eef_state - - gripper_state - - tokyo_u_lsmo_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - eef_state - - gripper_state - fps: 10 - - dlr_sara_pour_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 10 - - dlr_sara_grid_clamp_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 10 - - dlr_edan_shared_control_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 5 - - asu_table_top_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - eef_state - - gripper_state - fps: 12.5 - - stanford_robocook_converted_externally_to_rlds: - image_obs_keys: - - image_1 - - image_2 - depth_obs_keys: - - depth_1 - - depth_2 - state_obs_keys: - - eef_state - - gripper_state - fps: 5 - - imperialcollege_sawyer_wrist_cam: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 10 - - iamlab_cmu_pickup_insert_converted_externally_to_rlds: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - joint_state - - gripper_state - fps: 20 - - uiuc_d3field: - image_obs_keys: - - image_1 - - image_2 - depth_obs_keys: - - depth_1 - - depth_2 - state_obs_keys: - - null - fps: 1 - - utaustin_mutex: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 20 - - berkeley_fanuc_manipulation: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - joint_state - - gripper_state - fps: 10 - - cmu_playing_with_food: - image_obs_keys: - - image - - finger_vision_1 - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 10 - - cmu_play_fusion: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 5 - - cmu_stretch: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - eef_state - - gripper_state - fps: 10 - - berkeley_gnm_recon: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - state - - position - - yaw - fps: 3 - - berkeley_gnm_cory_hall: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - state - - position - - yaw - fps: 5 - - berkeley_gnm_sac_son: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: - - state - - position - - yaw - fps: 10 - - droid: - image_obs_keys: - - exterior_image_1_left - - exterior_image_2_left - - wrist_image_left - depth_obs_keys: - - null - state_obs_keys: - - proprio - fps: 15 - - droid_100: - image_obs_keys: - - exterior_image_1_left - - exterior_image_2_left - - wrist_image_left - depth_obs_keys: - - null - state_obs_keys: - - proprio - fps: 15 - - fmb: - image_obs_keys: - - image_side_1 - - image_side_2 - - image_wrist_1 - - image_wrist_2 - depth_obs_keys: - - image_side_1_depth - - image_side_2_depth - - image_wrist_1_depth - - image_wrist_2_depth - state_obs_keys: - - proprio - fps: 10 - - dobbe: - image_obs_keys: - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - proprio - fps: 3.75 - - usc_cloth_sim_converted_externally_to_rlds: - image_obs_keys: - - image - depth_obs_keys: - - null - state_obs_keys: 
- - null - fps: 10 - - plex_robosuite: - image_obs_keys: - - image - - wrist_image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 20 - - conq_hose_manipulation: - image_obs_keys: - - frontleft_fisheye_image - - frontright_fisheye_image - - hand_color_image - depth_obs_keys: - - null - state_obs_keys: - - state - fps: 30 diff --git a/lerobot/common/datasets/push_dataset_to_hub/openx/data_utils.py b/lerobot/common/datasets/push_dataset_to_hub/openx/data_utils.py deleted file mode 100644 index 1582c67c2..000000000 --- a/lerobot/common/datasets/push_dataset_to_hub/openx/data_utils.py +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the Licens e. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -NOTE(YL): Adapted from: - Octo: https://github.com/octo-models/octo/blob/main/octo/data/utils/data_utils.py - -data_utils.py - -Additional utils for data processing. -""" - -from typing import Any, Dict, List - -import tensorflow as tf - - -def binarize_gripper_actions(actions: tf.Tensor) -> tf.Tensor: - """ - Converts gripper actions from continuous to binary values (0 and 1). - - We exploit that fact that most of the time, the gripper is fully open (near 1.0) or fully closed (near 0.0). As it - transitions between the two, it sometimes passes through a few intermediate values. We relabel those intermediate - values based on the state that is reached _after_ those intermediate values. - - In the edge case that the trajectory ends with an intermediate value, we give up on binarizing and relabel that - chunk of intermediate values as the last action in the trajectory. - - The `scan_fn` implements the following logic: - new_actions = np.empty_like(actions) - carry = actions[-1] - for i in reversed(range(actions.shape[0])): - if in_between_mask[i]: - carry = carry - else: - carry = float(open_mask[i]) - new_actions[i] = carry - """ - open_mask, closed_mask = actions > 0.95, actions < 0.05 - in_between_mask = tf.logical_not(tf.logical_or(open_mask, closed_mask)) - is_open_float = tf.cast(open_mask, tf.float32) - - def scan_fn(carry, i): - return tf.cond(in_between_mask[i], lambda: tf.cast(carry, tf.float32), lambda: is_open_float[i]) - - return tf.scan(scan_fn, tf.range(tf.shape(actions)[0]), actions[-1], reverse=True) - - -def invert_gripper_actions(actions: tf.Tensor) -> tf.Tensor: - return 1 - actions - - -def rel2abs_gripper_actions(actions: tf.Tensor) -> tf.Tensor: - """ - Converts relative gripper actions (+1 for closing, -1 for opening) to absolute actions (0 = closed; 1 = open). - - Assumes that the first relative gripper is not redundant (i.e. close when already closed)! 
- """ - # Note =>> -1 for closing, 1 for opening, 0 for no change - opening_mask, closing_mask = actions < -0.1, actions > 0.1 - thresholded_actions = tf.where(opening_mask, 1, tf.where(closing_mask, -1, 0)) - - def scan_fn(carry, i): - return tf.cond(thresholded_actions[i] == 0, lambda: carry, lambda: thresholded_actions[i]) - - # If no relative grasp, assumes open for whole trajectory - start = -1 * thresholded_actions[tf.argmax(thresholded_actions != 0, axis=0)] - start = tf.cond(start == 0, lambda: 1, lambda: start) - - # Note =>> -1 for closed, 1 for open - new_actions = tf.scan(scan_fn, tf.range(tf.shape(actions)[0]), start) - new_actions = tf.cast(new_actions, tf.float32) / 2 + 0.5 - - return new_actions - - -# === Bridge-V2 =>> Dataset-Specific Transform === -def relabel_bridge_actions(traj: Dict[str, Any]) -> Dict[str, Any]: - """Relabels actions to use reached proprioceptive state; discards last timestep (no-action).""" - movement_actions = traj["observation"]["state"][1:, :6] - traj["observation"]["state"][:-1, :6] - traj_truncated = tf.nest.map_structure(lambda x: x[:-1], traj) - traj_truncated["action"] = tf.concat([movement_actions, traj["action"][:-1, -1:]], axis=1) - - return traj_truncated - - -# === RLDS Dataset Initialization Utilities === -def pprint_data_mixture(dataset_kwargs_list: List[Dict[str, Any]], dataset_weights: List[int]) -> None: - print("\n######################################################################################") - print(f"# Loading the following {len(dataset_kwargs_list)} datasets (incl. sampling weight):{'': >24} #") - for dataset_kwargs, weight in zip(dataset_kwargs_list, dataset_weights, strict=False): - pad = 80 - len(dataset_kwargs["name"]) - print(f"# {dataset_kwargs['name']}: {weight:=>{pad}f} #") - print("######################################################################################\n") diff --git a/lerobot/common/datasets/push_dataset_to_hub/openx/droid_utils.py b/lerobot/common/datasets/push_dataset_to_hub/openx/droid_utils.py deleted file mode 100644 index 22ac4d9e3..000000000 --- a/lerobot/common/datasets/push_dataset_to_hub/openx/droid_utils.py +++ /dev/null @@ -1,200 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -NOTE(YL): Adapted from: - OpenVLA: https://github.com/openvla/openvla - -Episode transforms for DROID dataset. -""" - -from typing import Any, Dict - -import tensorflow as tf -import tensorflow_graphics.geometry.transformation as tfg - - -def rmat_to_euler(rot_mat): - return tfg.euler.from_rotation_matrix(rot_mat) - - -def euler_to_rmat(euler): - return tfg.rotation_matrix_3d.from_euler(euler) - - -def invert_rmat(rot_mat): - return tfg.rotation_matrix_3d.inverse(rot_mat) - - -def rotmat_to_rot6d(mat): - """ - Converts rotation matrix to R6 rotation representation (first two rows in rotation matrix). 
- Args: - mat: rotation matrix - - Returns: 6d vector (first two rows of rotation matrix) - - """ - r6 = mat[..., :2, :] - r6_0, r6_1 = r6[..., 0, :], r6[..., 1, :] - r6_flat = tf.concat([r6_0, r6_1], axis=-1) - return r6_flat - - -def velocity_act_to_wrist_frame(velocity, wrist_in_robot_frame): - """ - Translates velocity actions (translation + rotation) from base frame of the robot to wrist frame. - Args: - velocity: 6d velocity action (3 x translation, 3 x rotation) - wrist_in_robot_frame: 6d pose of the end-effector in robot base frame - - Returns: 9d velocity action in robot wrist frame (3 x translation, 6 x rotation as R6) - - """ - r_frame = euler_to_rmat(wrist_in_robot_frame[:, 3:6]) - r_frame_inv = invert_rmat(r_frame) - - # world to wrist: dT_pi = R^-1 dT_rbt - vel_t = (r_frame_inv @ velocity[:, :3][..., None])[..., 0] - - # world to wrist: dR_pi = R^-1 dR_rbt R - dr_ = euler_to_rmat(velocity[:, 3:6]) - dr_ = r_frame_inv @ (dr_ @ r_frame) - dr_r6 = rotmat_to_rot6d(dr_) - return tf.concat([vel_t, dr_r6], axis=-1) - - -def rand_swap_exterior_images(img1, img2): - """ - Randomly swaps the two exterior images (for training with single exterior input). - """ - return tf.cond(tf.random.uniform(shape=[]) > 0.5, lambda: (img1, img2), lambda: (img2, img1)) - - -def droid_baseact_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - """ - DROID dataset transformation for actions expressed in *base* frame of the robot. - """ - dt = trajectory["action_dict"]["cartesian_velocity"][:, :3] - dr_ = trajectory["action_dict"]["cartesian_velocity"][:, 3:6] - - trajectory["action"] = tf.concat( - ( - dt, - dr_, - 1 - trajectory["action_dict"]["gripper_position"], - ), - axis=-1, - ) - trajectory["observation"]["exterior_image_1_left"], trajectory["observation"]["exterior_image_2_left"] = ( - rand_swap_exterior_images( - trajectory["observation"]["exterior_image_1_left"], - trajectory["observation"]["exterior_image_2_left"], - ) - ) - trajectory["observation"]["proprio"] = tf.concat( - ( - trajectory["observation"]["cartesian_position"], - trajectory["observation"]["gripper_position"], - ), - axis=-1, - ) - return trajectory - - -def droid_wristact_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - """ - DROID dataset transformation for actions expressed in *wrist* frame of the robot. - """ - wrist_act = velocity_act_to_wrist_frame( - trajectory["action_dict"]["cartesian_velocity"], trajectory["observation"]["cartesian_position"] - ) - trajectory["action"] = tf.concat( - ( - wrist_act, - trajectory["action_dict"]["gripper_position"], - ), - axis=-1, - ) - trajectory["observation"]["exterior_image_1_left"], trajectory["observation"]["exterior_image_2_left"] = ( - rand_swap_exterior_images( - trajectory["observation"]["exterior_image_1_left"], - trajectory["observation"]["exterior_image_2_left"], - ) - ) - trajectory["observation"]["proprio"] = tf.concat( - ( - trajectory["observation"]["cartesian_position"], - trajectory["observation"]["gripper_position"], - ), - axis=-1, - ) - return trajectory - - -def droid_finetuning_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - """ - DROID dataset transformation for actions expressed in *base* frame of the robot. 
- """ - dt = trajectory["action_dict"]["cartesian_velocity"][:, :3] - dr_ = trajectory["action_dict"]["cartesian_velocity"][:, 3:6] - trajectory["action"] = tf.concat( - ( - dt, - dr_, - 1 - trajectory["action_dict"]["gripper_position"], - ), - axis=-1, - ) - trajectory["observation"]["proprio"] = tf.concat( - ( - trajectory["observation"]["cartesian_position"], - trajectory["observation"]["gripper_position"], - ), - axis=-1, - ) - return trajectory - - -def zero_action_filter(traj: Dict) -> bool: - """ - Filters transitions whose actions are all-0 (only relative actions, no gripper action). - Note: this filter is applied *after* action normalization, so need to compare to "normalized 0". - """ - droid_q01 = tf.convert_to_tensor( - [ - -0.7776297926902771, - -0.5803514122962952, - -0.5795090794563293, - -0.6464047729969025, - -0.7041108310222626, - -0.8895104378461838, - ] - ) - droid_q99 = tf.convert_to_tensor( - [ - 0.7597932070493698, - 0.5726242214441299, - 0.7351000607013702, - 0.6705610305070877, - 0.6464948207139969, - 0.8897542208433151, - ] - ) - droid_norm_0_act = ( - 2 * (tf.zeros_like(traj["action"][:, :6]) - droid_q01) / (droid_q99 - droid_q01 + 1e-8) - 1 - ) - - return tf.reduce_any(tf.math.abs(traj["action"][:, :6] - droid_norm_0_act) > 1e-5) diff --git a/lerobot/common/datasets/push_dataset_to_hub/openx/transforms.py b/lerobot/common/datasets/push_dataset_to_hub/openx/transforms.py deleted file mode 100644 index a0c1e30f6..000000000 --- a/lerobot/common/datasets/push_dataset_to_hub/openx/transforms.py +++ /dev/null @@ -1,859 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -NOTE(YL): Adapted from: - OpenVLA: https://github.com/openvla/openvla - Octo: https://github.com/octo-models/octo - -transforms.py - -Defines a registry of per-dataset standardization transforms for each dataset in Open-X Embodiment. - -Transforms adopt the following structure: - Input: Dictionary of *batched* features (i.e., has leading time dimension) - Output: Dictionary `step` =>> { - "observation": { - - State (in chosen state representation) - }, - "action": Action (in chosen action representation), - "language_instruction": str - } -""" - -from typing import Any, Dict - -import tensorflow as tf - -from lerobot.common.datasets.push_dataset_to_hub.openx.data_utils import ( - binarize_gripper_actions, - invert_gripper_actions, - rel2abs_gripper_actions, - relabel_bridge_actions, -) - - -def droid_baseact_transform_fn(): - from lerobot.common.datasets.push_dataset_to_hub.openx.droid_utils import droid_baseact_transform - - return droid_baseact_transform - - -def bridge_openx_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - """ - Applies to version of Bridge V2 in Open X-Embodiment mixture. - - Note =>> In original Bridge V2 dataset, the first timestep has an all-zero action, so we remove it! 
- """ - for key in trajectory: - if key == "traj_metadata": - continue - elif key in ["observation", "action"]: - for key2 in trajectory[key]: - trajectory[key][key2] = trajectory[key][key2][1:] - else: - trajectory[key] = trajectory[key][1:] - - trajectory["action"] = tf.concat( - ( - trajectory["action"]["world_vector"], - trajectory["action"]["rotation_delta"], - tf.cast(trajectory["action"]["open_gripper"][:, None], tf.float32), - ), - axis=-1, - ) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - trajectory = relabel_bridge_actions(trajectory) - trajectory["observation"]["EEF_state"] = trajectory["observation"]["state"][:, :6] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:] - return trajectory - - -def bridge_orig_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - """ - Applies to original version of Bridge V2 from the official project website. - - Note =>> In original Bridge V2 dataset, the first timestep has an all-zero action, so we remove it! - """ - for key in trajectory: - if key == "traj_metadata": - continue - elif key == "observation": - for key2 in trajectory[key]: - trajectory[key][key2] = trajectory[key][key2][1:] - else: - trajectory[key] = trajectory[key][1:] - - trajectory["action"] = tf.concat( - [ - trajectory["action"][:, :6], - binarize_gripper_actions(trajectory["action"][:, -1])[:, None], - ], - axis=1, - ) - trajectory = relabel_bridge_actions(trajectory) - trajectory["observation"]["EEF_state"] = trajectory["observation"]["state"][:, :6] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:] - return trajectory - - -def ppgm_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["action"] = tf.concat( - [ - trajectory["action"][:, :6], - binarize_gripper_actions(trajectory["action"][:, -1])[:, None], - ], - axis=1, - ) - trajectory["observation"]["EEF_state"] = trajectory["observation"]["cartesian_position"][:, :6] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["gripper_position"][:, -1:] - return trajectory - - -def rt1_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # make gripper action absolute action, +1 = open, 0 = close - gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0] - gripper_action = rel2abs_gripper_actions(gripper_action) - - trajectory["action"] = tf.concat( - ( - trajectory["action"]["world_vector"], - trajectory["action"]["rotation_delta"], - gripper_action[:, None], - ), - axis=-1, - ) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - return trajectory - - -def kuka_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # make gripper action absolute action, +1 = open, 0 = close - gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0] - gripper_action = rel2abs_gripper_actions(gripper_action) - - trajectory["action"] = tf.concat( - ( - trajectory["action"]["world_vector"], - trajectory["action"]["rotation_delta"], - gripper_action[:, None], - ), - axis=-1, - ) - # decode compressed state - eef_value = tf.io.decode_compressed( - trajectory["observation"]["clip_function_input/base_pose_tool_reached"], - compression_type="ZLIB", - ) - eef_value = tf.io.decode_raw(eef_value, tf.float32) - trajectory["observation"]["clip_function_input/base_pose_tool_reached"] = tf.reshape(eef_value, (-1, 7)) - gripper_value = tf.io.decode_compressed( - 
trajectory["observation"]["gripper_closed"], compression_type="ZLIB" - ) - gripper_value = tf.io.decode_raw(gripper_value, tf.float32) - trajectory["observation"]["gripper_closed"] = tf.reshape(gripper_value, (-1, 1)) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - return trajectory - - -def taco_play_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["state_eef"] = trajectory["observation"]["robot_obs"][:, :6] - trajectory["observation"]["state_gripper"] = trajectory["observation"]["robot_obs"][:, 7:8] - trajectory["action"] = trajectory["action"]["rel_actions_world"] - - # invert gripper action + clip, +1 = open, 0 = close - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :6], - tf.clip_by_value(trajectory["action"][:, -1:], 0, 1), - ), - axis=-1, - ) - - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - return trajectory - - -def jaco_play_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["state_eef"] = trajectory["observation"]["end_effector_cartesian_pos"][:, :6] - trajectory["observation"]["state_gripper"] = trajectory["observation"]["end_effector_cartesian_pos"][ - :, -1: - ] - - # make gripper action absolute action, +1 = open, 0 = close - gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0] - gripper_action = rel2abs_gripper_actions(gripper_action) - - trajectory["action"] = tf.concat( - ( - trajectory["action"]["world_vector"], - tf.zeros_like(trajectory["action"]["world_vector"]), - gripper_action[:, None], - ), - axis=-1, - ) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - return trajectory - - -def berkeley_cable_routing_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["action"] = tf.concat( - ( - trajectory["action"]["world_vector"], - trajectory["action"]["rotation_delta"], - tf.zeros_like(trajectory["action"]["world_vector"][:, :1]), - ), - axis=-1, - ) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - return trajectory - - -def roboturk_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # invert absolute gripper action, +1 = open, 0 = close - gripper_action = invert_gripper_actions( - tf.clip_by_value(trajectory["action"]["gripper_closedness_action"], 0, 1) - ) - - trajectory["action"] = tf.concat( - ( - trajectory["action"]["world_vector"], - trajectory["action"]["rotation_delta"], - gripper_action, - ), - axis=-1, - ) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - trajectory["language_embedding"] = trajectory["observation"]["natural_language_embedding"] - return trajectory - - -def nyu_door_opening_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # make gripper action absolute action, +1 = open, 0 = close - gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0] - gripper_action = rel2abs_gripper_actions(gripper_action) - - trajectory["action"] = tf.concat( - ( - trajectory["action"]["world_vector"], - trajectory["action"]["rotation_delta"], - gripper_action[:, None], - ), - axis=-1, - ) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - return trajectory - - -def viola_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # make gripper action, +1 = open, 0 = close - 
gripper_action = trajectory["action"]["gripper_closedness_action"][:, None] - gripper_action = tf.clip_by_value(gripper_action, 0, 1) - gripper_action = invert_gripper_actions(gripper_action) - - trajectory["action"] = tf.concat( - ( - trajectory["action"]["world_vector"], - trajectory["action"]["rotation_delta"], - gripper_action, - ), - axis=-1, - ) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - return trajectory - - -def berkeley_autolab_ur5_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["state"] = trajectory["observation"]["robot_state"][:, 6:14] - - # make gripper action absolute action, +1 = open, 0 = close - gripper_action = trajectory["action"]["gripper_closedness_action"] - gripper_action = rel2abs_gripper_actions(gripper_action) - - trajectory["action"] = tf.concat( - ( - trajectory["action"]["world_vector"], - trajectory["action"]["rotation_delta"], - gripper_action[:, None], - ), - axis=-1, - ) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - return trajectory - - -def toto_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["action"] = tf.concat( - ( - trajectory["action"]["world_vector"], - trajectory["action"]["rotation_delta"], - tf.cast(trajectory["action"]["open_gripper"][:, None], tf.float32), - ), - axis=-1, - ) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - return trajectory - - -def language_table_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # default to "open" gripper - trajectory["action"] = tf.concat( - ( - trajectory["action"], - tf.zeros_like(trajectory["action"]), - tf.zeros_like(trajectory["action"]), - tf.ones_like(trajectory["action"][:, :1]), - ), - axis=-1, - ) - - # decode language instruction - instruction_bytes = trajectory["observation"]["instruction"] - instruction_encoded = tf.strings.unicode_encode(instruction_bytes, output_encoding="UTF-8") - # Remove trailing padding --> convert RaggedTensor to regular Tensor. 
- trajectory["language_instruction"] = tf.strings.split(instruction_encoded, "\x00")[:, :1].to_tensor()[ - :, 0 - ] - return trajectory - - -def pusht_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["action"] = tf.concat( - ( - trajectory["action"]["world_vector"], - trajectory["action"]["rotation_delta"], - trajectory["action"]["gripper_closedness_action"][:, None], - ), - axis=-1, - ) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - return trajectory - - -def stanford_kuka_multimodal_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["depth_image"] = trajectory["observation"]["depth_image"][..., 0] - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :3], - tf.zeros_like(trajectory["action"][:, :3]), - trajectory["action"][:, -1:], - ), - axis=-1, - ) - return trajectory - - -def nyu_rot_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][..., :6] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][..., -1:] - trajectory["action"] = trajectory["action"][..., :7] - return trajectory - - -def stanford_hydra_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # invert gripper action, +1 = open, 0 = close - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :6], - invert_gripper_actions(trajectory["action"][:, -1:]), - ), - axis=-1, - ) - - trajectory["observation"]["eef_state"] = tf.concat( - ( - trajectory["observation"]["state"][:, :3], - trajectory["observation"]["state"][:, 7:10], - ), - axis=-1, - ) - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -3:-2] - return trajectory - - -def austin_buds_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # invert gripper action + clip, +1 = open, 0 = close - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :6], - invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)), - ), - axis=-1, - ) - - trajectory["observation"]["state"] = trajectory["observation"]["state"][:, :8] - return trajectory - - -def nyu_franka_play_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["depth"] = tf.cast(trajectory["observation"]["depth"][..., 0], tf.float32) - trajectory["observation"]["depth_additional_view"] = tf.cast( - trajectory["observation"]["depth_additional_view"][..., 0], tf.float32 - ) - trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, -6:] - - # clip gripper action, +1 = open, 0 = close - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, -8:-2], - tf.clip_by_value(trajectory["action"][:, -2:-1], 0, 1), - ), - axis=-1, - ) - return trajectory - - -def maniskill_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][..., 7:8] - return trajectory - - -def furniture_bench_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - import tensorflow_graphics.geometry.transformation as tft - - trajectory["observation"]["state"] = tf.concat( - ( - trajectory["observation"]["state"][:, :7], - trajectory["observation"]["state"][:, -1:], - ), - axis=-1, - ) - - # invert gripper action + clip, +1 = open, 0 = close - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :3], - 
tft.euler.from_quaternion(trajectory["action"][:, 3:7]), - invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)), - ), - axis=-1, - ) - return trajectory - - -def cmu_franka_exploration_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["action"] = trajectory["action"][..., :-1] - return trajectory - - -def ucsd_kitchen_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["joint_state"] = trajectory["observation"]["state"][:, :7] - trajectory["action"] = trajectory["action"][..., :-1] - return trajectory - - -def ucsd_pick_place_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:] - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :3], - tf.zeros_like(trajectory["action"][:, :3]), - trajectory["action"][:, -1:], - ), - axis=-1, - ) - return trajectory - - -def austin_sailor_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # invert gripper action + clip, +1 = open, 0 = close - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :6], - invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)), - ), - axis=-1, - ) - return trajectory - - -def austin_sirius_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # invert gripper action + clip, +1 = open, 0 = close - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :6], - invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)), - ), - axis=-1, - ) - return trajectory - - -def bc_z_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["action"] = tf.concat( - ( - trajectory["action"]["future/xyz_residual"][:, :3], - trajectory["action"]["future/axis_angle_residual"][:, :3], - invert_gripper_actions(tf.cast(trajectory["action"]["future/target_close"][:, :1], tf.float32)), - ), - axis=-1, - ) - trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"] - return trajectory - - -def tokyo_pr2_opening_fridge_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:] - trajectory["action"] = trajectory["action"][..., :-1] - return trajectory - - -def tokyo_pr2_tabletop_manipulation_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:] - trajectory["action"] = trajectory["action"][..., :-1] - return trajectory - - -def utokyo_xarm_bimanual_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["action"] = trajectory["action"][..., -7:] - return trajectory - - -def robo_net_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["eef_state"] = tf.concat( - ( - trajectory["observation"]["state"][:, :4], - tf.zeros_like(trajectory["observation"]["state"][:, :2]), - ), - axis=-1, - ) - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:] - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :4], - tf.zeros_like(trajectory["action"][:, :2]), - 
trajectory["action"][:, -1:], - ), - axis=-1, - ) - return trajectory - - -def berkeley_mvp_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - """ - trajectory["observation"]["state"] = tf.concat(( - tf.cast(trajectory["observation"]["gripper"][:, None], tf.float32), - trajectory["observation"]["pose"], - trajectory["observation"]["joint_pos"],), - axis=-1,) - """ - trajectory["observation"]["gripper"] = tf.cast(trajectory["observation"]["gripper"][:, None], tf.float32) - return trajectory - - -def berkeley_rpt_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["gripper"] = tf.cast(trajectory["observation"]["gripper"][:, None], tf.float32) - return trajectory - - -def kaist_nonprehensible_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["state"] = trajectory["observation"]["state"][:, -7:] - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :6], - tf.zeros_like(trajectory["action"][:, :1]), - ), - axis=-1, - ) - return trajectory - - -def stanford_mask_vit_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["eef_state"] = tf.concat( - ( - trajectory["observation"]["end_effector_pose"][:, :4], - tf.zeros_like(trajectory["observation"]["end_effector_pose"][:, :2]), - ), - axis=-1, - ) - trajectory["observation"]["gripper_state"] = trajectory["observation"]["end_effector_pose"][:, -1:] - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :4], - tf.zeros_like(trajectory["action"][:, :2]), - trajectory["action"][:, -1:], - ), - axis=-1, - ) - return trajectory - - -def tokyo_lsmo_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:] - return trajectory - - -def dlr_sara_grid_clamp_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["state"] = trajectory["observation"]["state"][:, :6] - return trajectory - - -def dlr_edan_shared_control_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # invert gripper action, +1 = open, 0 = close - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :6], - invert_gripper_actions(trajectory["action"][:, -1:]), - ), - axis=-1, - ) - return trajectory - - -def asu_table_top_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["eef_state"] = trajectory["ground_truth_states"]["EE"] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:] - return trajectory - - -def robocook_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:] - return trajectory - - -def imperial_wristcam_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["action"] = trajectory["action"][..., :-1] - return trajectory - - -def iamlab_pick_insert_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - import tensorflow_graphics.geometry.transformation as tft - - trajectory["observation"]["joint_state"] = trajectory["observation"]["state"][:, :7] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, 7:8] - trajectory["action"] = tf.concat( - ( - 
trajectory["action"][:, :3], - tft.euler.from_quaternion(trajectory["action"][:, 3:7]), - trajectory["action"][:, 7:8], - ), - axis=-1, - ) - return trajectory - - -def uiuc_d3field_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["action"] = tf.concat( - ( - trajectory["action"], - tf.zeros_like(trajectory["action"]), - tf.zeros_like(trajectory["action"][:, :1]), - ), - axis=-1, - ) - return trajectory - - -def utaustin_mutex_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["state"] = trajectory["observation"]["state"][:, :8] - - # invert gripper action + clip, +1 = open, 0 = close - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :6], - invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)), - ), - axis=-1, - ) - return trajectory - - -def berkeley_fanuc_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["joint_state"] = trajectory["observation"]["state"][:, :6] - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, 6:7] - - # dataset does not store gripper actions, so use gripper state info, invert so +1 = open, 0 = close - trajectory["action"] = tf.concat( - ( - trajectory["action"], - invert_gripper_actions(trajectory["observation"]["gripper_state"]), - ), - axis=-1, - ) - return trajectory - - -def cmu_playing_with_food_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - import tensorflow_graphics.geometry.transformation as tft - - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :3], - tft.euler.from_quaternion(trajectory["action"][:, 3:7]), - trajectory["action"][:, -1:], - ), - axis=-1, - ) - return trajectory - - -def playfusion_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :3], - trajectory["action"][:, -4:], - ), - axis=-1, - ) - return trajectory - - -def cmu_stretch_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["eef_state"] = tf.concat( - ( - trajectory["observation"]["state"][:, :3], - tf.zeros_like(trajectory["observation"]["state"][:, :3]), - ), - axis=-1, - ) - trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:] - trajectory["action"] = trajectory["action"][..., :-1] - return trajectory - - -def gnm_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - trajectory["observation"]["state"] = tf.concat( - ( - trajectory["observation"]["position"], - tf.zeros_like(trajectory["observation"]["state"][:, :3]), - trajectory["observation"]["yaw"], - ), - axis=-1, - ) - trajectory["action"] = tf.concat( - ( - trajectory["action"], - tf.zeros_like(trajectory["action"]), - tf.zeros_like(trajectory["action"]), - tf.zeros_like(trajectory["action"][:, :1]), - ), - axis=-1, - ) - return trajectory - - -def fmb_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # every input feature is batched, ie has leading batch dimension - trajectory["observation"]["proprio"] = tf.concat( - ( - trajectory["observation"]["eef_pose"], - trajectory["observation"]["state_gripper_pose"][..., None], - ), - axis=-1, - ) - return trajectory - - -def dobbe_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # every input feature is batched, ie has leading batch dimension - trajectory["observation"]["proprio"] = trajectory["observation"]["state"] - return trajectory - - -def 
robo_set_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - # gripper action is in -1...1 --> clip to 0...1, flip - gripper_action = trajectory["action"][:, -1:] - gripper_action = invert_gripper_actions(tf.clip_by_value(gripper_action, 0, 1)) - - trajectory["action"] = tf.concat( - ( - trajectory["action"][:, :7], - gripper_action, - ), - axis=-1, - ) - return trajectory - - -def identity_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]: - return trajectory - - -# === Registry === -OPENX_STANDARDIZATION_TRANSFORMS = { - "bridge_openx": bridge_openx_dataset_transform, - "bridge_orig": bridge_orig_dataset_transform, - "bridge_dataset": bridge_orig_dataset_transform, - "ppgm": ppgm_dataset_transform, - "ppgm_static": ppgm_dataset_transform, - "ppgm_wrist": ppgm_dataset_transform, - "fractal20220817_data": rt1_dataset_transform, - "kuka": kuka_dataset_transform, - "taco_play": taco_play_dataset_transform, - "jaco_play": jaco_play_dataset_transform, - "berkeley_cable_routing": berkeley_cable_routing_dataset_transform, - "roboturk": roboturk_dataset_transform, - "nyu_door_opening_surprising_effectiveness": nyu_door_opening_dataset_transform, - "viola": viola_dataset_transform, - "berkeley_autolab_ur5": berkeley_autolab_ur5_dataset_transform, - "toto": toto_dataset_transform, - "language_table": language_table_dataset_transform, - "columbia_cairlab_pusht_real": pusht_dataset_transform, - "stanford_kuka_multimodal_dataset_converted_externally_to_rlds": stanford_kuka_multimodal_dataset_transform, - "nyu_rot_dataset_converted_externally_to_rlds": nyu_rot_dataset_transform, - "stanford_hydra_dataset_converted_externally_to_rlds": stanford_hydra_dataset_transform, - "austin_buds_dataset_converted_externally_to_rlds": austin_buds_dataset_transform, - "nyu_franka_play_dataset_converted_externally_to_rlds": nyu_franka_play_dataset_transform, - "maniskill_dataset_converted_externally_to_rlds": maniskill_dataset_transform, - "furniture_bench_dataset_converted_externally_to_rlds": furniture_bench_dataset_transform, - "cmu_franka_exploration_dataset_converted_externally_to_rlds": cmu_franka_exploration_dataset_transform, - "ucsd_kitchen_dataset_converted_externally_to_rlds": ucsd_kitchen_dataset_transform, - "ucsd_pick_and_place_dataset_converted_externally_to_rlds": ucsd_pick_place_dataset_transform, - "austin_sailor_dataset_converted_externally_to_rlds": austin_sailor_dataset_transform, - "austin_sirius_dataset_converted_externally_to_rlds": austin_sirius_dataset_transform, - "bc_z": bc_z_dataset_transform, - "utokyo_pr2_opening_fridge_converted_externally_to_rlds": tokyo_pr2_opening_fridge_dataset_transform, - "utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": tokyo_pr2_tabletop_manipulation_dataset_transform, - "utokyo_xarm_pick_and_place_converted_externally_to_rlds": identity_transform, - "utokyo_xarm_bimanual_converted_externally_to_rlds": utokyo_xarm_bimanual_dataset_transform, - "robo_net": robo_net_dataset_transform, - "berkeley_mvp_converted_externally_to_rlds": berkeley_mvp_dataset_transform, - "berkeley_rpt_converted_externally_to_rlds": berkeley_rpt_dataset_transform, - "kaist_nonprehensile_converted_externally_to_rlds": kaist_nonprehensible_dataset_transform, - "stanford_mask_vit_converted_externally_to_rlds": stanford_mask_vit_dataset_transform, - "tokyo_u_lsmo_converted_externally_to_rlds": tokyo_lsmo_dataset_transform, - "dlr_sara_pour_converted_externally_to_rlds": identity_transform, - "dlr_sara_grid_clamp_converted_externally_to_rlds": 
dlr_sara_grid_clamp_dataset_transform, - "dlr_edan_shared_control_converted_externally_to_rlds": dlr_edan_shared_control_dataset_transform, - "asu_table_top_converted_externally_to_rlds": asu_table_top_dataset_transform, - "stanford_robocook_converted_externally_to_rlds": robocook_dataset_transform, - "imperialcollege_sawyer_wrist_cam": imperial_wristcam_dataset_transform, - "iamlab_cmu_pickup_insert_converted_externally_to_rlds": iamlab_pick_insert_dataset_transform, - "uiuc_d3field": uiuc_d3field_dataset_transform, - "utaustin_mutex": utaustin_mutex_dataset_transform, - "berkeley_fanuc_manipulation": berkeley_fanuc_dataset_transform, - "cmu_playing_with_food": cmu_playing_with_food_dataset_transform, - "cmu_play_fusion": playfusion_dataset_transform, - "cmu_stretch": cmu_stretch_dataset_transform, - "berkeley_gnm_recon": gnm_dataset_transform, - "berkeley_gnm_cory_hall": gnm_dataset_transform, - "berkeley_gnm_sac_son": gnm_dataset_transform, - "droid": droid_baseact_transform_fn(), - "droid_100": droid_baseact_transform_fn(), # first 100 episodes of droid - "fmb": fmb_transform, - "dobbe": dobbe_dataset_transform, - "robo_set": robo_set_dataset_transform, - "usc_cloth_sim_converted_externally_to_rlds": identity_transform, - "plex_robosuite": identity_transform, - "conq_hose_manipulation": identity_transform, - "io_ai_tech": identity_transform, - "spoc": identity_transform, -} diff --git a/lerobot/common/datasets/push_dataset_to_hub/openx_rlds_format.py b/lerobot/common/datasets/push_dataset_to_hub/openx_rlds_format.py index cfe115034..1f8a5d144 100644 --- a/lerobot/common/datasets/push_dataset_to_hub/openx_rlds_format.py +++ b/lerobot/common/datasets/push_dataset_to_hub/openx_rlds_format.py @@ -14,13 +14,16 @@ # See the License for the specific language governing permissions and # limitations under the License. """ +For all datasets in the RLDS format. For https://github.com/google-deepmind/open_x_embodiment (OPENX) datasets. +NOTE: You need to install tensorflow and tensorflow_datsets before running this script. 
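A quick sanity check before converting: confirm the raw RLDS build opens with tensorflow_datasets and inspect its observation features (a minimal sketch, not part of this patch; the dataset path below is a placeholder):

    import tensorflow_datasets as tfds

    # Point the builder at a locally materialized RLDS directory, e.g. a
    # bridge_dataset build as in the example below, and list the observation
    # features; the converter keeps rank-3 uint8 features as images and
    # low-dimensional features as vector states.
    builder = tfds.builder_from_directory("/path/to/data/bridge_dataset/1.0.0")
    print(builder.info.features["steps"]["observation"])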
+ Example: python lerobot/scripts/push_dataset_to_hub.py \ - --raw-dir /hdd/tensorflow_datasets/bridge_dataset/1.0.0/ \ - --repo-id youliangtan/sampled_bridge_data_v2 \ - --raw-format openx_rlds.bridge_orig \ + --raw-dir /path/to/data/bridge_dataset/1.0.0/ \ + --repo-id your_hub/sampled_bridge_data_v2 \ + --raw-format rlds \ --episodes 3 4 5 8 9 Exact dataset fps defined in openx/config.py, obtained from: @@ -35,12 +38,10 @@ import tensorflow_datasets as tfds import torch import tqdm -import yaml from datasets import Dataset, Features, Image, Sequence, Value from PIL import Image as PILImage from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION -from lerobot.common.datasets.push_dataset_to_hub.openx.transforms import OPENX_STANDARDIZATION_TRANSFORMS from lerobot.common.datasets.push_dataset_to_hub.utils import ( calculate_episode_data_index, concatenate_episodes, @@ -52,11 +53,6 @@ ) from lerobot.common.datasets.video_utils import VideoFrame, encode_video_frames -with open("lerobot/common/datasets/push_dataset_to_hub/openx/configs.yaml") as f: - _openx_list = yaml.safe_load(f) - -OPENX_DATASET_CONFIGS = _openx_list["OPENX_DATASET_CONFIGS"] - np.set_printoptions(precision=2) @@ -108,7 +104,6 @@ def load_from_raw( video: bool, episodes: list[int] | None = None, encoding: dict | None = None, - openx_dataset_name: str | None = None, ): """ Args: @@ -136,16 +131,17 @@ def load_from_raw( # we will apply the standardization transform if the dataset_name is provided # if the dataset name is not provided and the goal is to convert any rlds formatted dataset # search for 'image' keys in the observations - if openx_dataset_name is not None: - print(" - applying standardization transform for dataset: ", openx_dataset_name) - assert openx_dataset_name in OPENX_STANDARDIZATION_TRANSFORMS - transform_fn = OPENX_STANDARDIZATION_TRANSFORMS[openx_dataset_name] - dataset = dataset.map(transform_fn) - - image_keys = OPENX_DATASET_CONFIGS[openx_dataset_name]["image_obs_keys"] - else: - obs_keys = dataset_info.features["steps"]["observation"].keys() - image_keys = [key for key in obs_keys if "image" in key] + image_keys = [] + state_keys = [] + observation_info = dataset_info.features["steps"]["observation"] + for key in observation_info: + # check whether the key is for an image or a vector observation + if len(observation_info[key].shape) == 3: + # only adding uint8 images discards depth images + if observation_info[key].dtype == tf.uint8: + image_keys.append(key) + else: + state_keys.append(key) lang_key = "language_instruction" if "language_instruction" in dataset.element_spec else None @@ -193,50 +189,31 @@ def load_from_raw( num_frames = episode["action"].shape[0] - ########################################################### - # Handle the episodic data - - # last step of demonstration is considered done - done = torch.zeros(num_frames, dtype=torch.bool) - done[-1] = True ep_dict = {} - langs = [] # TODO: might be located in "observation" - - image_array_dict = {key: [] for key in image_keys} + for key in state_keys: + ep_dict[f"observation.{key}"] = tf_to_torch(episode["observation"][key]) - # We will create the state observation tensor by stacking the state - # obs keys defined in the openx/configs.py - if openx_dataset_name is not None: - state_obs_keys = OPENX_DATASET_CONFIGS[openx_dataset_name]["state_obs_keys"] - # stack the state observations, if is None, pad with zeros - states = [] - for key in state_obs_keys: - if key in episode["observation"]: - 
states.append(tf_to_torch(episode["observation"][key])) - else: - states.append(torch.zeros(num_frames, 1)) # pad with zeros - states = torch.cat(states, dim=1) - # assert states.shape == (num_frames, 8), f"states shape: {states.shape}" - else: - states = tf_to_torch(episode["observation"]["state"]) - - actions = tf_to_torch(episode["action"]) - rewards = tf_to_torch(episode["reward"]).float() + ep_dict["action"] = tf_to_torch(episode["action"]) + ep_dict["next.reward"] = tf_to_torch(episode["reward"]).float() + ep_dict["next.done"] = tf_to_torch(episode["is_last"]) + ep_dict["is_terminal"] = tf_to_torch(episode["is_terminal"]) + ep_dict["is_first"] = tf_to_torch(episode["is_first"]) + ep_dict["discount"] = tf_to_torch(episode["discount"]) # If lang_key is present, convert the entire tensor at once if lang_key is not None: - langs = [str(x) for x in episode[lang_key]] + ep_dict["language_instruction"] = [x.numpy().decode("utf-8") for x in episode[lang_key]] + + ep_dict["timestamp"] = torch.arange(0, num_frames, 1) / fps + ep_dict["episode_index"] = torch.tensor([ep_idx] * num_frames) + ep_dict["frame_index"] = torch.arange(0, num_frames, 1) + + image_array_dict = {key: [] for key in image_keys} for im_key in image_keys: imgs = episode["observation"][im_key] image_array_dict[im_key] = [tf_img_convert(img) for img in imgs] - # simple assertions - for item in [states, actions, rewards, done]: - assert len(item) == num_frames - - ########################################################### - # loop through all cameras for im_key in image_keys: img_key = f"observation.images.{im_key}" @@ -262,17 +239,6 @@ def load_from_raw( else: ep_dict[img_key] = [PILImage.fromarray(x) for x in imgs_array] - if lang_key is not None: - ep_dict["language_instruction"] = langs - - ep_dict["observation.state"] = states - ep_dict["action"] = actions - ep_dict["timestamp"] = torch.arange(0, num_frames, 1) / fps - ep_dict["episode_index"] = torch.tensor([ep_idx] * num_frames) - ep_dict["frame_index"] = torch.arange(0, num_frames, 1) - ep_dict["next.reward"] = rewards - ep_dict["next.done"] = done - path_ep_dict = tmp_ep_dicts_dir.joinpath( "ep_dict_" + "0" * (10 - len(str(ep_idx))) + str(ep_idx) + ".pt" ) @@ -290,30 +256,28 @@ def load_from_raw( def to_hf_dataset(data_dict, video) -> Dataset: features = {} - keys = [key for key in data_dict if "observation.images." in key] - for key in keys: - if video: - features[key] = VideoFrame() - else: - features[key] = Image() + for key in data_dict: + # check if vector state obs + if key.startswith("observation.") and "observation.images." not in key: + features[key] = Sequence(length=data_dict[key].shape[1], feature=Value(dtype="float32", id=None)) + # check if image obs + elif "observation.images." 
in key: + if video: + features[key] = VideoFrame() + else: + features[key] = Image() - features["observation.state"] = Sequence( - length=data_dict["observation.state"].shape[1], feature=Value(dtype="float32", id=None) - ) - if "observation.velocity" in data_dict: - features["observation.velocity"] = Sequence( - length=data_dict["observation.velocity"].shape[1], feature=Value(dtype="float32", id=None) - ) - if "observation.effort" in data_dict: - features["observation.effort"] = Sequence( - length=data_dict["observation.effort"].shape[1], feature=Value(dtype="float32", id=None) - ) if "language_instruction" in data_dict: features["language_instruction"] = Value(dtype="string", id=None) features["action"] = Sequence( length=data_dict["action"].shape[1], feature=Value(dtype="float32", id=None) ) + + features["is_terminal"] = Value(dtype="bool", id=None) + features["is_first"] = Value(dtype="bool", id=None) + features["discount"] = Value(dtype="float32", id=None) + features["episode_index"] = Value(dtype="int64", id=None) features["frame_index"] = Value(dtype="int64", id=None) features["timestamp"] = Value(dtype="float32", id=None) @@ -333,19 +297,8 @@ def from_raw_to_lerobot_format( video: bool = True, episodes: list[int] | None = None, encoding: dict | None = None, - openx_dataset_name: str | None = None, ): - """This is a test impl for rlds conversion""" - if openx_dataset_name is None: - # set a default rlds frame rate if the dataset is not from openx - fps = 30 - elif "fps" not in OPENX_DATASET_CONFIGS[openx_dataset_name]: - raise ValueError( - "fps for this dataset is not specified in openx/configs.py yet," "means it is not yet tested" - ) - fps = OPENX_DATASET_CONFIGS[openx_dataset_name]["fps"] - - data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes, encoding, openx_dataset_name) + data_dict = load_from_raw(raw_dir, videos_dir, fps, video, episodes, encoding) hf_dataset = to_hf_dataset(data_dict, video) episode_data_index = calculate_episode_data_index(hf_dataset) info = { diff --git a/lerobot/scripts/push_dataset_to_hub.py b/lerobot/scripts/push_dataset_to_hub.py index 2bb641a4d..0233ede69 100644 --- a/lerobot/scripts/push_dataset_to_hub.py +++ b/lerobot/scripts/push_dataset_to_hub.py @@ -66,7 +66,7 @@ def get_from_raw_to_lerobot_format_fn(raw_format: str): from lerobot.common.datasets.push_dataset_to_hub.umi_zarr_format import from_raw_to_lerobot_format elif raw_format == "aloha_hdf5": from lerobot.common.datasets.push_dataset_to_hub.aloha_hdf5_format import from_raw_to_lerobot_format - elif "openx_rlds" in raw_format: + elif raw_format in ["rlds", "openx"]: from lerobot.common.datasets.push_dataset_to_hub.openx_rlds_format import from_raw_to_lerobot_format elif raw_format == "dora_parquet": from lerobot.common.datasets.push_dataset_to_hub.dora_parquet_format import from_raw_to_lerobot_format @@ -204,24 +204,14 @@ def push_dataset_to_hub( # convert dataset from original raw format to LeRobot format from_raw_to_lerobot_format = get_from_raw_to_lerobot_format_fn(raw_format) - fmt_kwgs = { - "raw_dir": raw_dir, - "videos_dir": videos_dir, - "fps": fps, - "video": video, - "episodes": episodes, - "encoding": encoding, - } - - if "openx_rlds." in raw_format: - # Support for official OXE dataset name inside `raw_format`. 
- # For instance, `raw_format="oxe_rlds"` uses the default formating (TODO what does that mean?), - # and `raw_format="oxe_rlds.bridge_orig"` uses the brdige_orig formating - _, openx_dataset_name = raw_format.split(".") - print(f"Converting dataset [{openx_dataset_name}] from 'openx_rlds' to LeRobot format.") - fmt_kwgs["openx_dataset_name"] = openx_dataset_name - - hf_dataset, episode_data_index, info = from_raw_to_lerobot_format(**fmt_kwgs) + hf_dataset, episode_data_index, info = from_raw_to_lerobot_format( + raw_dir, + videos_dir, + fps, + video, + episodes, + encoding, + ) lerobot_dataset = LeRobotDataset.from_preloaded( repo_id=repo_id, @@ -290,7 +280,7 @@ def main(): "--raw-format", type=str, required=True, - help="Dataset type (e.g. `pusht_zarr`, `umi_zarr`, `aloha_hdf5`, `xarm_pkl`, `dora_parquet`, `openx_rlds`).", + help="Dataset type (e.g. `pusht_zarr`, `umi_zarr`, `aloha_hdf5`, `xarm_pkl`, `dora_parquet`, `rlds`, `openx`).", ) parser.add_argument( "--repo-id", From 286bca37cc78e80d5853236b93f17cd0a25cf367 Mon Sep 17 00:00:00 2001 From: Remi Date: Tue, 3 Dec 2024 10:53:21 +0100 Subject: [PATCH 2/3] Fix missing local_files_only in record/replay (#540) Co-authored-by: Simon Alibert --- .github/workflows/test.yml | 70 ++++++++++++++++---------------- lerobot/scripts/control_robot.py | 18 ++++++-- tests/test_control_robot.py | 2 +- 3 files changed, 51 insertions(+), 39 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5de071750..53b37466a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -102,38 +102,38 @@ jobs: && rm -rf tests/outputs outputs # TODO(aliberts, rcadene): redesign after v2 migration / removing hydra - end-to-end: - name: End-to-end - runs-on: ubuntu-latest - env: - MUJOCO_GL: egl - steps: - - uses: actions/checkout@v4 - with: - lfs: true # Ensure LFS files are pulled - - - name: Install apt dependencies - # portaudio19-dev is needed to install pyaudio - run: | - sudo apt-get update && \ - sudo apt-get install -y libegl1-mesa-dev portaudio19-dev - - - name: Install poetry - run: | - pipx install poetry && poetry config virtualenvs.in-project true - echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH - - - name: Set up Python 3.10 - uses: actions/setup-python@v5 - with: - python-version: "3.10" - cache: "poetry" - - - name: Install poetry dependencies - run: | - poetry install --all-extras - - - name: Test end-to-end - run: | - make test-end-to-end \ - && rm -rf outputs + # end-to-end: + # name: End-to-end + # runs-on: ubuntu-latest + # env: + # MUJOCO_GL: egl + # steps: + # - uses: actions/checkout@v4 + # with: + # lfs: true # Ensure LFS files are pulled + + # - name: Install apt dependencies + # # portaudio19-dev is needed to install pyaudio + # run: | + # sudo apt-get update && \ + # sudo apt-get install -y libegl1-mesa-dev portaudio19-dev + + # - name: Install poetry + # run: | + # pipx install poetry && poetry config virtualenvs.in-project true + # echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH + + # - name: Set up Python 3.10 + # uses: actions/setup-python@v5 + # with: + # python-version: "3.10" + # cache: "poetry" + + # - name: Install poetry dependencies + # run: | + # poetry install --all-extras + + # - name: Test end-to-end + # run: | + # make test-end-to-end \ + # && rm -rf outputs diff --git a/lerobot/scripts/control_robot.py b/lerobot/scripts/control_robot.py index 563023f48..e91c6c232 100644 --- a/lerobot/scripts/control_robot.py +++ b/lerobot/scripts/control_robot.py @@ -341,7 
+341,7 @@ def replay( episode: int, fps: int | None = None, play_sounds: bool = True, - local_files_only: bool = True, + local_files_only: bool = False, ): # TODO(rcadene, aliberts): refactor with control_loop, once `dataset` is an instance of LeRobotDataset # TODO(rcadene): Add option to record logs @@ -424,7 +424,7 @@ def replay( "--root", type=Path, default=None, - help="Root directory where the dataset will be stored locally at '{root}/{repo_id}' (e.g. 'data/hf_username/dataset_name').", + help="Root directory where the dataset will be stored (e.g. 'dataset/path').", ) parser_record.add_argument( "--repo-id", @@ -432,6 +432,12 @@ def replay( default="lerobot/test", help="Dataset identifier. By convention it should match '{hf_username}/{dataset_name}' (e.g. `lerobot/test`).", ) + parser_record.add_argument( + "--local-files-only", + type=int, + default=0, + help="Use local files only. By default, this script will try to fetch the dataset from the hub if it exists.", + ) parser_record.add_argument( "--warmup-time-s", type=int, @@ -520,7 +526,7 @@ def replay( "--root", type=Path, default=None, - help="Root directory where the dataset will be stored locally at '{root}/{repo_id}' (e.g. 'data/hf_username/dataset_name').", + help="Root directory where the dataset will be stored (e.g. 'dataset/path').", ) parser_replay.add_argument( "--repo-id", @@ -528,6 +534,12 @@ def replay( default="lerobot/test", help="Dataset identifier. By convention it should match '{hf_username}/{dataset_name}' (e.g. `lerobot/test`).", ) + parser_replay.add_argument( + "--local-files-only", + type=int, + default=0, + help="Use local files only. By default, this script will try to fetch the dataset from the hub if it exists.", + ) parser_replay.add_argument("--episode", type=int, default=0, help="Index of the episode to replay.") args = parser.parse_args() diff --git a/tests/test_control_robot.py b/tests/test_control_robot.py index 0ba737a8f..8df108946 100644 --- a/tests/test_control_robot.py +++ b/tests/test_control_robot.py @@ -158,7 +158,7 @@ def test_record_and_replay_and_policy(tmpdir, request, robot_type, mock): assert dataset.meta.total_episodes == 2 assert len(dataset) == 2 - replay(robot, episode=0, fps=1, root=root, repo_id=repo_id, play_sounds=False) + replay(robot, episode=0, fps=1, root=root, repo_id=repo_id, play_sounds=False, local_files_only=True) # TODO(rcadene, aliberts): rethink this design if robot_type == "aloha": From 8e7d6970eaf5a64b8af6ec45586d201b8ca9ef16 Mon Sep 17 00:00:00 2001 From: Michel Aractingi Date: Tue, 3 Dec 2024 12:20:05 +0100 Subject: [PATCH 3/3] Control simulated robot with real leader (#514) Co-authored-by: Remi --- lerobot/scripts/control_robot.py | 4 +- lerobot/scripts/control_sim_robot.py | 546 +++++++++++++++++++++++++++ 2 files changed, 548 insertions(+), 2 deletions(-) create mode 100644 lerobot/scripts/control_sim_robot.py diff --git a/lerobot/scripts/control_robot.py b/lerobot/scripts/control_robot.py index e91c6c232..12eaf146f 100644 --- a/lerobot/scripts/control_robot.py +++ b/lerobot/scripts/control_robot.py @@ -68,8 +68,8 @@ - Tap escape key 'esc' to stop the data recording. This might require a sudo permission to allow your terminal to monitor keyboard events. -**NOTE**: You can resume/continue data recording by running the same data recording command twice. -To avoid resuming by deleting the dataset, use `--force-override 1`. +**NOTE**: You can resume/continue data recording by running the same data recording command and adding `--resume 1`. 
+If the dataset you want to extend is not on the hub, you also need to add `--local-files-only 1`.
 
 - Train on this dataset with the ACT policy:
 ```bash
diff --git a/lerobot/scripts/control_sim_robot.py b/lerobot/scripts/control_sim_robot.py
new file mode 100644
index 000000000..85dfca64a
--- /dev/null
+++ b/lerobot/scripts/control_sim_robot.py
@@ -0,0 +1,546 @@
+"""
+Utilities to control a robot in simulation.
+
+Useful to record a dataset, replay a recorded episode and record an evaluation dataset.
+
+Examples of usage:
+
+
+- Unlimited teleoperation at a limited frequency of 30 Hz, to simulate data recording frequency.
+  You can modify this value depending on how fast your simulation can run:
+```bash
+python lerobot/scripts/control_sim_robot.py teleoperate \
+    --fps 30 \
+    --robot-path lerobot/configs/robot/your_robot_config.yaml \
+    --sim-config lerobot/configs/env/your_sim_config.yaml
+```
+
+- Record one episode in order to test replay:
+```bash
+python lerobot/scripts/control_sim_robot.py record \
+    --robot-path lerobot/configs/robot/your_robot_config.yaml \
+    --sim-config lerobot/configs/env/your_sim_config.yaml \
+    --fps 30 \
+    --repo-id $USER/robot_sim_test \
+    --num-episodes 1 \
+    --run-compute-stats 0
+```
+
+Add `--push-to-hub 1` to push the recorded dataset to the Hugging Face hub.
+
+- Visualize dataset:
+```bash
+python lerobot/scripts/visualize_dataset.py \
+    --repo-id $USER/robot_sim_test \
+    --episode-index 0
+```
+
+- Replay a sequence of test episodes:
+```bash
+python lerobot/scripts/control_sim_robot.py replay \
+    --robot-path lerobot/configs/robot/your_robot_config.yaml \
+    --sim-config lerobot/configs/env/your_sim_config.yaml \
+    --fps 30 \
+    --repo-id $USER/robot_sim_test \
+    --episode 0
+```
+Note: The seed is saved, so during replay we can load the same environment state as during collection.
+
+- Record a full dataset in order to train a policy, with 30 seconds of recording for each episode:
+```bash
+python lerobot/scripts/control_sim_robot.py record \
+    --robot-path lerobot/configs/robot/your_robot_config.yaml \
+    --sim-config lerobot/configs/env/your_sim_config.yaml \
+    --fps 30 \
+    --repo-id $USER/robot_sim_test \
+    --num-episodes 50 \
+    --episode-time-s 30
+```
+
+**NOTE**: You can use your keyboard to control data recording flow.
+- Tap right arrow key '->' to early exit while recording an episode and go to resetting the environment.
+- Tap right arrow key '->' to early exit while resetting the environment and go to recording the next episode.
+- Tap left arrow key '<-' to early exit and re-record the current episode.
+- Tap escape key 'esc' to stop the data recording.
+This might require a sudo permission to allow your terminal to monitor keyboard events.
+
+**NOTE**: You can resume/continue data recording by running the same data recording command and adding `--resume 1`.
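+
+For instance, a minimal sketch of resuming an interrupted recording (this assumes a dataset with this `--repo-id`
+already exists, and relies on the `--resume` and `--task` flags defined in the argument parser below):
+```bash
+python lerobot/scripts/control_sim_robot.py record \
+    --robot-path lerobot/configs/robot/your_robot_config.yaml \
+    --sim-config lerobot/configs/env/your_sim_config.yaml \
+    --fps 30 \
+    --repo-id $USER/robot_sim_test \
+    --task "your task description" \
+    --resume 1
+```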
+""" + +import argparse +import importlib +import logging +import time +from pathlib import Path + +import cv2 +import gymnasium as gym +import numpy as np +import torch + +from lerobot.common.datasets.lerobot_dataset import LeRobotDataset +from lerobot.common.robot_devices.control_utils import ( + init_keyboard_listener, + init_policy, + is_headless, + log_control_info, + predict_action, + sanity_check_dataset_name, + sanity_check_dataset_robot_compatibility, + stop_recording, +) +from lerobot.common.robot_devices.robots.factory import make_robot +from lerobot.common.robot_devices.robots.utils import Robot +from lerobot.common.robot_devices.utils import busy_wait +from lerobot.common.utils.utils import init_hydra_config, init_logging, log_say + +DEFAULT_FEATURES = { + "next.reward": { + "dtype": "float32", + "shape": (1,), + "names": None, + }, + "next.success": { + "dtype": "bool", + "shape": (1,), + "names": None, + }, + "seed": { + "dtype": "int64", + "shape": (1,), + "names": None, + }, + "timestamp": { + "dtype": "float32", + "shape": (1,), + "names": None, + }, +} + + +######################################################################################## +# Utilities +######################################################################################## +def none_or_int(value): + if value == "None": + return None + return int(value) + + +def init_sim_calibration(robot, cfg): + # Constants necessary for transforming the joint pos of the real robot to the sim + # depending on the robot discription used in that sim. + start_pos = np.array(robot.leader_arms.main.calibration["start_pos"]) + axis_directions = np.array(cfg.get("axis_directions", [1])) + offsets = np.array(cfg.get("offsets", [0])) * np.pi + + return {"start_pos": start_pos, "axis_directions": axis_directions, "offsets": offsets} + + +def real_positions_to_sim(real_positions, axis_directions, start_pos, offsets): + """Counts - starting position -> radians -> align axes -> offset""" + return axis_directions * (real_positions - start_pos) * 2.0 * np.pi / 4096 + offsets + + +######################################################################################## +# Control modes +######################################################################################## + + +def teleoperate(env, robot: Robot, process_action_fn, teleop_time_s=None): + env = env() + env.reset() + start_teleop_t = time.perf_counter() + while True: + leader_pos = robot.leader_arms.main.read("Present_Position") + action = process_action_fn(leader_pos) + env.step(np.expand_dims(action, 0)) + if teleop_time_s is not None and time.perf_counter() - start_teleop_t > teleop_time_s: + print("Teleoperation processes finished.") + break + + +def record( + env, + robot: Robot, + process_action_from_leader, + root: Path, + repo_id: str, + task: str, + fps: int | None = None, + tags: list[str] | None = None, + pretrained_policy_name_or_path: str = None, + policy_overrides: bool | None = None, + episode_time_s: int = 30, + num_episodes: int = 50, + video: bool = True, + push_to_hub: bool = True, + num_image_writer_processes: int = 0, + num_image_writer_threads_per_camera: int = 4, + display_cameras: bool = False, + play_sounds: bool = True, + resume: bool = False, + local_files_only: bool = False, + run_compute_stats: bool = True, +) -> LeRobotDataset: + # Load pretrained policy + policy = None + if pretrained_policy_name_or_path is not None: + policy, policy_fps, device, use_amp = init_policy(pretrained_policy_name_or_path, policy_overrides) + + if fps is 
None: + fps = policy_fps + logging.warning(f"No fps provided, so using the fps from policy config ({policy_fps}).") + + if policy is None and process_action_from_leader is None: + raise ValueError("Either policy or process_action_fn has to be set to enable control in sim.") + + # initialize listener before sim env + listener, events = init_keyboard_listener() + + # create sim env + env = env() + + # Create empty dataset or load existing saved episodes + num_cameras = sum([1 if "image" in key else 0 for key in env.observation_space]) + + # get image keys + image_keys = [key for key in env.observation_space if "image" in key] + state_keys_dict = env_cfg.state_keys + + if resume: + dataset = LeRobotDataset( + repo_id, + root=root, + local_files_only=local_files_only, + ) + dataset.start_image_writer( + num_processes=num_image_writer_processes, + num_threads=num_image_writer_threads_per_camera * num_cameras, + ) + sanity_check_dataset_robot_compatibility(dataset, robot, fps, video) + else: + features = DEFAULT_FEATURES + # add image keys to features + for key in image_keys: + shape = env.observation_space[key].shape + if not key.startswith("observation.image."): + key = "observation.image." + key + features[key] = {"dtype": "video", "names": ["channel", "height", "width"], "shape": shape} + + for key, obs_key in state_keys_dict.items(): + features[key] = { + "dtype": "float32", + "names": None, + "shape": env.observation_space[obs_key].shape, + } + + features["action"] = {"dtype": "float32", "shape": env.action_space.shape, "names": None} + + # Create empty dataset or load existing saved episodes + sanity_check_dataset_name(repo_id, policy) + dataset = LeRobotDataset.create( + repo_id, + fps, + root=root, + features=features, + use_videos=video, + image_writer_processes=num_image_writer_processes, + image_writer_threads=num_image_writer_threads_per_camera * num_cameras, + ) + + recorded_episodes = 0 + while True: + log_say(f"Recording episode {dataset.num_episodes}", play_sounds) + + if events is None: + events = {"exit_early": False} + + if episode_time_s is None: + episode_time_s = float("inf") + + timestamp = 0 + start_episode_t = time.perf_counter() + + seed = np.random.randint(0, 1e5) + observation, info = env.reset(seed=seed) + + while timestamp < episode_time_s: + start_loop_t = time.perf_counter() + + if policy is not None: + action = predict_action(observation, policy, device, use_amp) + else: + leader_pos = robot.leader_arms.main.read("Present_Position") + action = process_action_from_leader(leader_pos) + + observation, reward, terminated, _, info = env.step(action) + + success = info.get("is_success", False) + env_timestamp = info.get("timestamp", dataset.episode_buffer["size"] / fps) + + frame = { + "action": torch.from_numpy(action), + "next.reward": reward, + "next.success": success, + "seed": seed, + "timestamp": env_timestamp, + } + + for key in image_keys: + if not key.startswith("observation.image"): + frame["observation.image." 
+ key] = observation[key] + else: + frame[key] = observation[key] + + for key, obs_key in state_keys_dict.items(): + frame[key] = torch.from_numpy(observation[obs_key]) + + dataset.add_frame(frame) + + if display_cameras and not is_headless(): + for key in image_keys: + cv2.imshow(key, cv2.cvtColor(observation[key], cv2.COLOR_RGB2BGR)) + cv2.waitKey(1) + + if fps is not None: + dt_s = time.perf_counter() - start_loop_t + busy_wait(1 / fps - dt_s) + + dt_s = time.perf_counter() - start_loop_t + log_control_info(robot, dt_s, fps=fps) + + timestamp = time.perf_counter() - start_episode_t + if events["exit_early"] or terminated: + events["exit_early"] = False + break + + if events["rerecord_episode"]: + log_say("Re-record episode", play_sounds) + events["rerecord_episode"] = False + events["exit_early"] = False + dataset.clear_episode_buffer() + continue + + dataset.save_episode(task=task) + recorded_episodes += 1 + + if events["stop_recording"] or recorded_episodes >= num_episodes: + break + else: + logging.info("Waiting for a few seconds before starting next episode recording...") + busy_wait(3) + + log_say("Stop recording", play_sounds, blocking=True) + stop_recording(robot, listener, display_cameras) + + if run_compute_stats: + logging.info("Computing dataset statistics") + dataset.consolidate(run_compute_stats) + + if push_to_hub: + dataset.push_to_hub(tags=tags) + + log_say("Exiting", play_sounds) + return dataset + + +def replay( + env, root: Path, repo_id: str, episode: int, fps: int | None = None, local_files_only: bool = True +): + env = env() + + local_dir = Path(root) / repo_id + if not local_dir.exists(): + raise ValueError(local_dir) + + dataset = LeRobotDataset(repo_id, root=root, local_files_only=local_files_only) + items = dataset.hf_dataset.select_columns("action") + seeds = dataset.hf_dataset.select_columns("seed")["seed"] + + from_idx = dataset.episode_data_index["from"][episode].item() + to_idx = dataset.episode_data_index["to"][episode].item() + env.reset(seed=seeds[from_idx].item()) + logging.info("Replaying episode") + log_say("Replaying episode", play_sounds=True) + for idx in range(from_idx, to_idx): + start_episode_t = time.perf_counter() + action = items[idx]["action"] + env.step(action.unsqueeze(0).numpy()) + dt_s = time.perf_counter() - start_episode_t + busy_wait(1 / fps - dt_s) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers(dest="mode", required=True) + + # Set common options for all the subparsers + base_parser = argparse.ArgumentParser(add_help=False) + base_parser.add_argument( + "--robot-path", + type=str, + default="lerobot/configs/robot/koch.yaml", + help="Path to robot yaml file used to instantiate the robot using `make_robot` factory function.", + ) + + base_parser.add_argument( + "--sim-config", + help="Path to a yaml config you want to use for initializing a sim environment based on gym ", + ) + + parser_record = subparsers.add_parser("teleoperate", parents=[base_parser]) + + parser_record = subparsers.add_parser("record", parents=[base_parser]) + parser_record.add_argument( + "--fps", type=none_or_int, default=None, help="Frames per second (set to None to disable)" + ) + parser_record.add_argument( + "--root", + type=Path, + default=None, + help="Root directory where the dataset will be stored locally at '{root}/{repo_id}' (e.g. 'data/hf_username/dataset_name').", + ) + parser_record.add_argument( + "--repo-id", + type=str, + default="lerobot/test", + help="Dataset identifier. 
By convention it should match '{hf_username}/{dataset_name}' (e.g. `lerobot/test`).",
+    )
+    parser_record.add_argument(
+        "--episode-time-s",
+        type=int,
+        default=60,
+        help="Number of seconds for data recording for each episode.",
+    )
+    parser_record.add_argument(
+        "--task",
+        type=str,
+        required=True,
+        help="A description of the task performed during recording that can be used as a language instruction.",
+    )
+    parser_record.add_argument("--num-episodes", type=int, default=50, help="Number of episodes to record.")
+    parser_record.add_argument(
+        "--run-compute-stats",
+        type=int,
+        default=1,
+        help="By default, run the computation of the data statistics at the end of data collection. Compute intensive and not required to just replay an episode.",
+    )
+    parser_record.add_argument(
+        "--push-to-hub",
+        type=int,
+        default=1,
+        help="Upload dataset to Hugging Face hub.",
+    )
+    parser_record.add_argument(
+        "--tags",
+        type=str,
+        nargs="*",
+        help="Add tags to your dataset on the hub.",
+    )
+    parser_record.add_argument(
+        "--num-image-writer-processes",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses handling the saving of frames as PNGs. Set to 0 to use threads only; "
+            "set to ≥1 to use subprocesses, each using threads to write images. The best number of processes "
+            "and threads depends on your system. We recommend 4 threads per camera with 0 processes. "
+            "If fps is unstable, adjust the thread count. If still unstable, try using 1 or more subprocesses."
+        ),
+    )
+    parser_record.add_argument(
+        "--num-image-writer-threads-per-camera",
+        type=int,
+        default=4,
+        help=(
+            "Number of threads writing the frames as png images on disk, per camera. "
+            "Too many threads might cause unstable teleoperation fps due to the main thread being blocked. "
+            "Not enough threads might cause low camera fps."
+        ),
+    )
+    parser_record.add_argument(
+        "--display-cameras",
+        type=int,
+        default=0,
+        help="Visualize image observations with OpenCV.",
+    )
+    parser_record.add_argument(
+        "--resume",
+        type=int,
+        default=0,
+        help="Resume recording on an existing dataset.",
+    )
+    parser_replay = subparsers.add_parser("replay", parents=[base_parser])
+    parser_replay.add_argument(
+        "--fps", type=none_or_int, default=None, help="Frames per second (set to None to disable)"
+    )
+    parser_replay.add_argument(
+        "--root",
+        type=Path,
+        default=None,
+        help="Root directory where the dataset will be stored locally (e.g. 'data/hf_username/dataset_name'). By default, stored in cache folder.",
+    )
+    parser_replay.add_argument(
+        "--repo-id",
+        type=str,
+        default="lerobot/test",
+        help="Dataset identifier. By convention it should match '{hf_username}/{dataset_name}' (e.g. 
`lerobot/test`).",
+    )
+    parser_replay.add_argument("--episode", type=int, default=0, help="Index of the episode to replay.")
+
+    args = parser.parse_args()
+
+    init_logging()
+
+    control_mode = args.mode
+    robot_path = args.robot_path
+    env_config_path = args.sim_config
+    kwargs = vars(args)
+    del kwargs["mode"]
+    del kwargs["robot_path"]
+    del kwargs["sim_config"]
+
+    # make gym env
+    env_cfg = init_hydra_config(env_config_path)
+    importlib.import_module(f"gym_{env_cfg.env.name}")
+
+    def env_constructor():
+        return gym.make(env_cfg.env.handle, disable_env_checker=True, **env_cfg.env.gym)
+
+    robot = None
+    process_leader_actions_fn = None
+
+    if control_mode in ["teleoperate", "record"]:
+        # make robot
+        robot_overrides = ["~cameras", "~follower_arms"]
+        robot_cfg = init_hydra_config(robot_path, robot_overrides)
+        robot = make_robot(robot_cfg)
+        robot.connect()
+
+        calib_kwgs = init_sim_calibration(robot, env_cfg.calibration)
+
+        def process_leader_actions_fn(action):
+            return real_positions_to_sim(action, **calib_kwgs)
+
+        robot.leader_arms.main.calibration = None
+
+    if control_mode == "teleoperate":
+        teleoperate(env_constructor, robot, process_leader_actions_fn)
+
+    elif control_mode == "record":
+        record(env_constructor, robot, process_leader_actions_fn, **kwargs)
+
+    elif control_mode == "replay":
+        replay(env_constructor, **kwargs)
+
+    else:
+        raise ValueError(
+            f"Invalid control mode: '{control_mode}', only valid modes are teleoperate, record and replay."
+        )
+
+    if robot and robot.is_connected:
+        # Disconnect manually to avoid a "Core dump" during process
+        # termination due to camera threads not properly exiting.
+        robot.disconnect()
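
For reference, here is a minimal, self-contained sketch of the counts-to-radians mapping performed by the `real_positions_to_sim` helper added in this patch. The 4096-count encoder resolution comes from the formula in the patch; the joint values below are made up for illustration:

```python
import numpy as np


def real_positions_to_sim(real_positions, axis_directions, start_pos, offsets):
    # Same formula as in control_sim_robot.py:
    # counts - starting position -> radians -> align axes -> offset.
    return axis_directions * (real_positions - start_pos) * 2.0 * np.pi / 4096 + offsets


# One joint whose encoder reads 3072 counts, with a calibrated start position of 2048 counts:
# 1024 counts out of a 4096-count revolution is a quarter turn, i.e. pi/2 radians.
sim_pos = real_positions_to_sim(
    real_positions=np.array([3072.0]),
    axis_directions=np.array([1.0]),
    start_pos=np.array([2048.0]),
    offsets=np.array([0.0]),
)
assert np.isclose(sim_pos[0], np.pi / 2)
print(sim_pos)  # [1.57079633]
```

Setting an entry of `axis_directions` to -1 flips that joint and a nonzero `offsets` entry shifts it; `init_sim_calibration` builds these arrays from the sim config (offsets given in multiples of pi), so the same sketch extends to multi-joint arms.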