diff --git a/main.py b/main.py index 56c8bad..c19750e 100644 --- a/main.py +++ b/main.py @@ -29,6 +29,7 @@ "image_grid", "descriptor_qas", "add_kois", + "add_bev", "reasoning", "system_prompt", ], @@ -44,7 +45,7 @@ "--dataset_split", help="The dataset split to use for training / evaluation.", type=str, - choices=["train", "val"], + choices=["train", "val", "test"], default="val", ) parser.add_argument( @@ -89,7 +90,7 @@ ) elif args.eval: resize_image_size = get_resize_image_size( - resize_factor=resize_factor, grid="image_grid" in args.approach + resize_factor=resize_factor, grid="image_grid" in args.approach, bev="add_bev" in args.approach, front_cam="front_cam" in args.approach, ) logger.debug(f"Using resize image size: {resize_image_size}") if is_cuda(): diff --git a/requirements.txt b/requirements.txt index d647ce2..200f3dc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,4 +13,5 @@ gdown~=5.2.0 pre-commit~=4.2.0 peft~=0.15.2 trl~=0.18.1 +polars==1.31.0 ultralytics==8.3.168 diff --git a/src/constants.py b/src/constants.py index 634ad46..c3705ef 100644 --- a/src/constants.py +++ b/src/constants.py @@ -5,6 +5,7 @@ drivelm_dir = data_dir / "drivelm" nuscenes_dir = data_dir / "nuscenes" grid_dir = nuscenes_dir / "samples" / "GRID" +bev_dir = nuscenes_dir / "samples" / "BEV" drivelm_train_json = drivelm_dir / "v1_1_train_nus.json" drivelm_val_json = drivelm_dir / "v1_1_val_nus_q_only.json" drivelm_test_json = drivelm_dir / "v1_1_test_nus.json" @@ -20,6 +21,8 @@ IMAGE_SIZE[0] * GRID[0], IMAGE_SIZE[1] * GRID[1], ) # (height, width) +BEV_IMG_SIZE = (500, 500) +BEV_AND_FRONT_CAM_IMG_SIZE = (500, 1388) GRID_POSITIONS = { "CAM_FRONT_LEFT": (0, 0), diff --git a/src/data/basic_dataset.py b/src/data/basic_dataset.py index f690f6c..fb97071 100644 --- a/src/data/basic_dataset.py +++ b/src/data/basic_dataset.py @@ -5,10 +5,12 @@ from src.constants import drivelm_dir from src.data.create_image_grid_dataset import create_image_grid_dataset +from src.data.generate_bev import generate_bevs from src.data.generate_descriptor_qas import ( generate_descriptor_qas, ) from src.data.generate_reasoning_context import generate_reasoning_context +from src.data.get_sensor_calibration import get_calibration from src.data.load_dataset import load_dataset from src.data.message_formats import MessageFormat from src.data.query_item import QueryItem @@ -47,7 +49,9 @@ def __init__( message_format: MessageFormat, split="train", add_augmented=False, + front_cam=False, add_kois=False, + add_bev=False, use_grid=False, use_reasoning=False, use_system_prompt=False, @@ -58,8 +62,10 @@ def __init__( ): self.message_format = message_format self.split = split + self.front_cam = front_cam self.use_reasoning = use_reasoning self.use_grid = use_grid + self.add_bev = add_bev self.resize_factor = resize_factor self.system_prompt_provider = ( SystemPromptProvider(config_path=system_prompt_config_path) @@ -70,13 +76,19 @@ def __init__( data = load_dataset(split) if split == "train": + if add_bev: + data = get_calibration(data) + data = generate_bevs(data, front_cam=front_cam) data = normalise_key_object_infos(data, resize_factor, use_grid) if split == "train" and add_augmented: data = generate_descriptor_qas(data) - if split == "val" and add_kois: + if (split == "val" or split == "test") and add_kois: data = generate_yolo_kois(data) + if add_bev: + data = get_calibration(data) + data = generate_bevs(data, front_cam=front_cam) data = normalise_key_object_infos(data, resize_factor, use_grid) if use_grid: @@ -87,13 +99,17 @@ def __init__( for scene_id in data.keys(): scene_obj = data[scene_id]["key_frames"] for key_frame_id in scene_obj.keys(): - # NOTE: Only consider FRONT camera images or GRID images for now image_paths = scene_obj[key_frame_id]["image_paths"] if use_grid: image_path = os.path.join( drivelm_dir, image_paths["GRID"], ) + elif add_kois and add_bev: + image_path = os.path.join( + drivelm_dir, + image_paths["BEV"], + ) else: image_path = os.path.join( drivelm_dir, @@ -111,6 +127,10 @@ def __init__( else None ) + camera_calibration = None + if split=="val" and add_kois and add_bev: + camera_calibration = scene_obj[key_frame_id]["camera_calibration"] + qas = scene_obj[key_frame_id]["QA"] qas_perception = qas["perception"] @@ -154,6 +174,8 @@ def __init__( "qa": remove_nones(qa), "qa_type": qa_types[i], "id": scene_id + "_" + key_frame_id + "_" + str(i), + "key_frame_id": key_frame_id, + "camera_calibration": camera_calibration, "key_object_info": key_object_infos if qa_types[i] != "perception" else None, @@ -175,6 +197,7 @@ def __getitem__(self, idx): question = qa["qa"]["Q"] answer = qa["qa"]["A"] tags = qa["qa"].get("tag", []) + camera_calibration = qa["camera_calibration"] key_object_info = qa["key_object_info"] image_path = qa["image_path"] system_prompt = ( @@ -183,6 +206,8 @@ def __getitem__(self, idx): question=question, resize_factor=self.resize_factor, use_grid=self.use_grid, + add_bev=self.add_bev, + front_cam=self.front_cam, use_reasoning=self.use_reasoning, ) if self.system_prompt_provider @@ -198,6 +223,7 @@ def __getitem__(self, idx): key_object_info=key_object_info, system_prompt=system_prompt, ground_truth_answer=answer, + camera_calibration=camera_calibration, ) if self.use_reasoning and self.split == "train": diff --git a/src/data/generate_bev.py b/src/data/generate_bev.py new file mode 100644 index 0000000..98506d8 --- /dev/null +++ b/src/data/generate_bev.py @@ -0,0 +1,285 @@ +import cv2 +import os +import numpy as np +from scipy.spatial.transform import Rotation as R_scipy +from tqdm import tqdm + +from src.constants import bev_dir, drivelm_dir +from src.data.get_sensor_calibration import CameraCalibration +from src.utils.utils import get_logger, key_object_str_to_dict + + +logger = get_logger(__name__) + + +def generate_bev_from_detections( + calibration: dict[str, CameraCalibration], + kois: dict, + ) -> np.ndarray: + """ + Generates a Bird's-Eye View (BEV) map from detected objects for a keyframe, + using nuScenes camera calibration information. + + Args: + calibration: A dictionary where keys are camera names (e.g., 'CAM_FRONT') + and values are CameraCalibration objects. + kois: A dictionary where keys contain camera names and values contain + object detection information including 2d_bbox and Category. + Returns: + A NumPy array representing the BEV map (H, W, 3). + """ + bev_map_res_m_per_pixel = 0.1 + bev_map_x_range = 50.0 + bev_map_y_range = 50.0 + + # --- BEV Map Initialization --- + # Calculate min/max extents based on ranges to center ego (0,0) + x_min_m = -bev_map_x_range / 2.0 + y_min_m = -bev_map_y_range / 2.0 + + bev_map_width_pixels = int(bev_map_x_range / bev_map_res_m_per_pixel) + bev_map_height_pixels = int(bev_map_y_range / bev_map_res_m_per_pixel) + + bev_map = np.zeros((bev_map_height_pixels, bev_map_width_pixels, 3), dtype=np.uint8) + bev_map.fill(20) + + all_projected_objects = [] + + total_items = 0 + for camera_name, cam_calib in calibration.items(): + current_keys = [koi_key for koi_key in kois.keys() if camera_name == key_object_str_to_dict(koi_key)["camera"]] + current_identifiers = [key_object_str_to_dict(k)["id"] for k in current_keys] + current_camera_kois = [kois[k] for k in current_keys] + current_camera_boxes = [koi["2d_bbox"] for koi in current_camera_kois] + total_items += len(current_camera_boxes) + current_camera_names = [koi["Category"] for koi in current_camera_kois] + + K = np.array(cam_calib.camera_intrinsic, dtype=np.float64) + t_camera_to_ego = np.array(cam_calib.translation, dtype=np.float64) # (x, y, z) + q_camera_to_ego = np.array(cam_calib.rotation, dtype=np.float64) # (w, x, y, z) + + # 1. Convert quaternion to rotation matrix: R_ego_from_camera + # nuScenes quaternion is (w, x, y, z) -> scipy Rotation.from_quat expects (x, y, z, w) + r_ego_from_camera_scipy = R_scipy.from_quat([q_camera_to_ego[1], q_camera_to_ego[2], q_camera_to_ego[3], q_camera_to_ego[0]]) + R_ego_from_camera = r_ego_from_camera_scipy.as_matrix() # 3x3 rotation matrix from camera to ego + + for i in range(len(current_camera_boxes)): + bbox = current_camera_boxes[i] + obj_name = current_camera_names[i] + + # Use the bottom-center of the 2D bounding box as the ground contact point heuristic. + x1, y1, x2, y2 = bbox + bottom_center_2d = np.array([(x1 + x2) / 2, y2], dtype=np.float64) + + # --- Project 2D image point back to 3D on the ground plane (Z=0 in ego frame) --- + + # Convert 2D image point to a 3D ray direction in the camera frame (normalized coordinates). + uv_hom = np.array([bottom_center_2d[0], bottom_center_2d[1], 1.0], dtype=np.float64).reshape(3, 1) + K_inv = np.linalg.inv(K) + ray_direction_camera_frame = np.dot(K_inv, uv_hom).flatten() + + # Transform the ray from the camera frame to the ego vehicle frame. + ray_origin_ego = t_camera_to_ego + ray_direction_ego = np.dot(R_ego_from_camera, ray_direction_camera_frame) + + # Intersect the ray with the ground plane (Z_ego = 0). + if np.isclose(ray_direction_ego[2], 0.0): + continue # Ray is parallel or near-parallel to ground plane + + lam = -ray_origin_ego[2] / ray_direction_ego[2] + + # Ensure the intersection point is in front of the camera (positive lambda). + if lam < 0: + continue + + point_3d_ego = ray_origin_ego + lam * ray_direction_ego + + # Store the projected object's information + projected_object_info = { + 'class': obj_name, + 'x_ego': -point_3d_ego[1], + 'y_ego': point_3d_ego[0], + 'z_ego': point_3d_ego[2], # Should be close to 0 + 'camera_name': camera_name, + 'original_bbox': bbox, + 'identifier': current_identifiers[i], + } + all_projected_objects.append(projected_object_info) + + # --- Remove Duplicate Objects --- + # Group objects by spatial proximity and class, keep the one with best visibility + unique_objects = [] + proximity_threshold = 10.0 + duplicates_removed = 0 + + for obj in all_projected_objects: + is_duplicate = False + for unique_obj in unique_objects: + # Check if objects are of same class and spatially close + if (obj['class'] == unique_obj['class'] and + # TODO: Tune the prox threshold + np.sqrt((obj['x_ego'] - unique_obj['x_ego'])**2 + + (obj['y_ego'] - unique_obj['y_ego'])**2) < proximity_threshold): + + # Keep the object from the camera that provides better view + # Prefer front cameras for forward objects, side cameras for side objects, etc. + current_distance = np.sqrt(obj['x_ego']**2 + obj['y_ego']**2) + unique_distance = np.sqrt(unique_obj['x_ego']**2 + unique_obj['y_ego']**2) + + # Replace if current object is closer or from a more appropriate camera + if (current_distance < unique_distance or + _is_better_camera_view(obj, unique_obj)): + unique_objects.remove(unique_obj) + unique_objects.append(obj) + else: + duplicates_removed += 1 + is_duplicate = True + break + + if not is_duplicate: + unique_objects.append(obj) + + # --- Render Projected Objects onto the BEV Map --- + for obj_info in unique_objects: + x_ego = obj_info['x_ego'] + y_ego = obj_info['y_ego'] + obj_class = obj_info['class'] + identifier = obj_info['identifier'] + + col_bev = int((x_ego - x_min_m) / bev_map_res_m_per_pixel) + row_bev = int(bev_map_height_pixels - 1 - ((y_ego - y_min_m) / bev_map_res_m_per_pixel)) + + if 0 <= col_bev < bev_map_width_pixels and 0 <= row_bev < bev_map_height_pixels: + if 'car' in obj_class.lower() or 'truck' in obj_class.lower() or 'bus' in obj_class.lower() or \ + 'bicycle' in obj_class.lower() or 'motorcycle' in obj_class.lower(): + car_width_bev = int(1.5 / bev_map_res_m_per_pixel) + car_length_bev = int(3.0 / bev_map_res_m_per_pixel) + color = (0, 255, 255) # Yellow (BGR) + + if (0 <= (col_bev - car_width_bev) and (col_bev + car_width_bev) < bev_map_width_pixels + and 0 <= (row_bev - car_length_bev) and (row_bev + car_length_bev) < bev_map_height_pixels): + cv2.rectangle(bev_map, + (col_bev - car_width_bev // 2, row_bev - car_length_bev // 2), + (col_bev + car_width_bev // 2, row_bev + car_length_bev // 2), + color, -1) + cv2.putText(bev_map, identifier, (col_bev - car_width_bev // 2, row_bev - car_length_bev // 2 - 5), + cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1) + + elif 'person' in obj_class.lower(): + ped_width_bev = int(1.0 / bev_map_res_m_per_pixel) + ped_length_bev = int(1.0 / bev_map_res_m_per_pixel) + color = (255, 0, 0) # Blue (BGR) + + if (0 <= (col_bev - ped_width_bev) and (col_bev + ped_width_bev) < bev_map_width_pixels + and 0 <= (row_bev - ped_length_bev) and (row_bev + ped_length_bev) < bev_map_height_pixels): + cv2.rectangle(bev_map, + (col_bev - ped_width_bev // 2, row_bev - ped_length_bev // 2), + (col_bev + ped_width_bev // 2, row_bev + ped_length_bev // 2), + color, -1) + cv2.putText(bev_map, identifier, (col_bev - ped_width_bev // 2, row_bev - ped_length_bev // 2 - 5), + cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1) + + # --- Draw Ego Vehicle --- + ego_x_m = 0.0 # Ego vehicle is at (0,0) in its own frame + ego_y_m = 0.0 + + # Convert ego (0,0) to BEV map pixel coordinates + ego_col_bev = int((ego_x_m - x_min_m) / bev_map_res_m_per_pixel) + ego_row_bev = int(bev_map_height_pixels - 1 - ((ego_y_m - y_min_m) / bev_map_res_m_per_pixel)) + + # Ego vehicle dimensions (approximate typical car size) + ego_width_pixels = int(1.5 / bev_map_res_m_per_pixel) + ego_length_pixels = int(3.0 / bev_map_res_m_per_pixel) + + ego_color = (0, 0, 255) # Red (BGR) + cv2.rectangle(bev_map, + (ego_col_bev - ego_width_pixels // 2, ego_row_bev - ego_length_pixels // 2), + (ego_col_bev + ego_width_pixels // 2, ego_row_bev + ego_length_pixels // 2), + ego_color, -1) + cv2.putText(bev_map, 'Ego', (ego_col_bev - ego_width_pixels // 2, ego_row_bev - ego_length_pixels // 2 - 5), + cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1) + + marker_color = (255, 255, 255) # White + font = cv2.FONT_HERSHEY_SIMPLEX + font_scale = 0.6 + thickness = 2 + + cv2.putText(bev_map, 'FRONT', (bev_map_width_pixels//2 - 30, 25), font, font_scale, marker_color, thickness) + cv2.putText(bev_map, 'BACK', (bev_map_width_pixels//2 - 25, bev_map_height_pixels - 10), font, font_scale, marker_color, thickness) + cv2.putText(bev_map, 'LEFT', (10, bev_map_height_pixels//2), font, font_scale, marker_color, thickness) + cv2.putText(bev_map, 'RIGHT', (bev_map_width_pixels - 60, bev_map_height_pixels//2), font, font_scale, marker_color, thickness) + + return bev_map + + +def _is_better_camera_view(obj1, obj2): + """ + Determine if obj1 has a better camera view than obj2 based on object position and camera type. + """ + x1, y1 = obj1['x_ego'], obj1['y_ego'] + x2, y2 = obj2['x_ego'], obj2['y_ego'] + cam1 = obj1['camera_name'] + cam2 = obj2['camera_name'] + + def get_camera_score(x, y, camera_name): + score = 0 + # Front cameras are best for forward objects (y > 0) + if 'FRONT' in camera_name and y > 0: + score += 3 + # Back cameras are best for rear objects (y < 0) + elif 'BACK' in camera_name and y < 0: + score += 3 + # Left cameras are best for left objects (x < 0) + if 'LEFT' in camera_name and x < 0: + score += 2 + # Right cameras are best for right objects (x > 0) + elif 'RIGHT' in camera_name and x > 0: + score += 2 + # Center cameras (FRONT, BACK) are good for center objects + if camera_name in ['CAM_FRONT', 'CAM_BACK'] and abs(x) < 5: + score += 1 + return score + + score1 = get_camera_score(x1, y1, cam1) + score2 = get_camera_score(x2, y2, cam2) + + return score1 > score2 + + +def generate_bevs(data, front_cam: bool = False): + bev_dir.mkdir(parents=True, exist_ok=True) + + for scene_id, scene_obj in tqdm(data.items(), desc="Generating BEVs"): + for key_frame_id, key_frame in scene_obj["key_frames"].items(): + image_paths = key_frame["image_paths"] + if front_cam: + image_name = f"{scene_id}_{key_frame_id}__BEV_FRONT_CAM.jpg" + else: + image_name = f"{scene_id}_{key_frame_id}__BEV.jpg" + bev_path = bev_dir / image_name + image_paths["BEV"] = "../nuscenes/samples/BEV/" + image_name + + if not bev_path.exists(): + image_paths = { + key: os.path.join(drivelm_dir, path) + for key, path in image_paths.items() + } + kois = key_frame["key_object_infos"] + calibration = key_frame["camera_calibration"] + bev_img = generate_bev_from_detections( + kois=kois, + calibration=calibration, + ) + if front_cam: + front_image = cv2.imread(image_paths["CAM_FRONT"]) + target_height = min(front_image.shape[0], bev_img.shape[0]) + front_aspect = front_image.shape[1] / front_image.shape[0] + front_width = int(target_height * front_aspect) + front_resized = cv2.resize(front_image, (front_width, target_height)) + bev_resized = cv2.resize(bev_img, (bev_img.shape[1], target_height)) + combined_img = np.hstack([front_resized, bev_resized]) + cv2.imwrite(bev_path, combined_img) + else: + cv2.imwrite(bev_path, bev_img) + logger.debug(f"Saved bev image: {bev_img}") + return data diff --git a/src/data/generate_yolo_kois.py b/src/data/generate_yolo_kois.py index 2425fd4..cc89cbf 100644 --- a/src/data/generate_yolo_kois.py +++ b/src/data/generate_yolo_kois.py @@ -1,5 +1,6 @@ import os +from tqdm import tqdm from ultralytics import YOLO from src.constants import drivelm_dir @@ -7,15 +8,21 @@ def generate_yolo_kois(data, max_results_per_cam: int = 5): model = YOLO("yolo11n.pt") - for _, scene_obj in data.items(): + for _, scene_obj in tqdm(data.items(), desc="Generating KOIs with YOLO"): for _, key_frame in scene_obj["key_frames"].items(): image_paths_raw = key_frame["image_paths"] i = 0 kois = [] for camera, image_path in image_paths_raw.items(): - results = model(os.path.join(drivelm_dir, image_path))[:max_results_per_cam] + results = model( + os.path.join(drivelm_dir, image_path), + max_det=max_results_per_cam, + classes=[0, 1, 2, 3, 5, 6, 7, 9, 11], # [person, bicycle, car, motorcycle, bus, train, truck, traffic light, stop sign] + verbose=False + ) + bbox = [xyxy for res in results for xyxy in res.boxes.xyxy.cpu().tolist()] center_points = [ - (xywh[0], xywh[1]) for res in results for xywh in res.boxes.xywh + (xywh[0], xywh[1]) for res in results for xywh in res.boxes.xywh.cpu() ] categories = [ res.names[cls.item()] @@ -28,13 +35,15 @@ def generate_yolo_kois(data, max_results_per_cam: int = 5): ( f"", categories[j], + bbox[j] ) ) key_frame["key_object_infos"] = { descriptor: { "Category": category, + "2d_bbox": bbox, } - for descriptor, category in kois + for descriptor, category, bbox in kois } return data diff --git a/src/data/get_sensor_calibration.py b/src/data/get_sensor_calibration.py new file mode 100644 index 0000000..5d93703 --- /dev/null +++ b/src/data/get_sensor_calibration.py @@ -0,0 +1,79 @@ +import polars as pl +from tqdm import tqdm + +from src.constants import nuscenes_dir + + +def get_sample_data_and_calibrated_camera_df() -> pl.DataFrame: + sample_data_lf = pl.read_json(nuscenes_dir / "sample_data.json").lazy() + sample_data_lf = sample_data_lf.filter( + pl.col("is_key_frame") == True # noqa: E712 + ).select([ + "token", + "sample_token", + "calibrated_sensor_token" + ]) + calibrated_camera_lf = pl.read_json(nuscenes_dir / "calibrated_sensor.json").lazy() + calibrated_camera_lf = calibrated_camera_lf.filter( + pl.col("camera_intrinsic").len() != 0 + ) + sensor_lf = pl.read_json(nuscenes_dir / "sensor.json").lazy() + calibrated_camera_with_sensor_type_lf = calibrated_camera_lf.join( + sensor_lf, + left_on="sensor_token", + right_on="token", + suffix="_sensor", + ) + return sample_data_lf.join( + calibrated_camera_with_sensor_type_lf, + left_on="calibrated_sensor_token", + right_on="token", + suffix="_calibrated" + ).collect() + + +cameras = [ + "CAM_FRONT", + "CAM_FRONT_LEFT", + "CAM_FRONT_RIGHT", + "CAM_BACK", + "CAM_BACK_LEFT", + "CAM_BACK_RIGHT", +] + +class CameraCalibration: + camera_intrinsic: list[list[float]] + translation: list[float] + rotation: list[float] + + def __init__(self, camera_intrinsic, translation, rotation): + self.camera_intrinsic = camera_intrinsic + self.translation = translation + self.rotation = rotation + + +def get_camera_calibration(lf: pl.DataFrame, key_frame_id) -> dict[str, CameraCalibration]: + calibration_per_camera = {} + for cam in cameras: + calibration = lf.filter( + pl.col("channel") == cam, + pl.col("sample_token") == key_frame_id + ).select( + "translation", + "rotation", + "camera_intrinsic" + ).to_dict() + assert len(calibration["translation"]) == 1 + calibration_per_camera[cam] = CameraCalibration( + camera_intrinsic=calibration["camera_intrinsic"][0].to_list(), + translation=calibration["translation"][0].to_list(), + rotation=calibration["rotation"][0].to_list(), + ) + return calibration_per_camera + +def get_calibration(data: dict): + lf = get_sample_data_and_calibrated_camera_df() + for _, scene in tqdm(data.items(), desc="Fetching camera calibration data"): + for key_frame_id, key_frame in scene["key_frames"].items(): + key_frame["camera_calibration"] = get_camera_calibration(lf, key_frame_id) + return data diff --git a/src/data/load_dataset.py b/src/data/load_dataset.py index 89a597f..8aa030e 100644 --- a/src/data/load_dataset.py +++ b/src/data/load_dataset.py @@ -1,4 +1,5 @@ import os +import shutil from json import load import gdown @@ -42,9 +43,24 @@ def get_ds(split: str) -> None: id="1fsVP7jOpvChcpoXVdypaZ4HREX1gA7As", output=os.path.join(drivelm_dir, "v1_1_val_nus_q_only.json"), ) + get_nuscenes_ds() -def load_dataset(split: str): +def get_nuscenes_ds(): + out_name = os.path.join(nuscenes_dir, "nuscenes_json.zip") + gdown.download( + id="1sqW1y2k346mtLCQnO0NAab3sEzxUyQ_d", + output=out_name, + ) + shutil.unpack_archive(out_name, nuscenes_dir) + gdown.download( + id="1sqW1y2k346mtLCQnO0NAab3sEzxUyQ_d", + output=out_name, + ) + shutil.unpack_archive(out_name, nuscenes_dir) + + +def load_dataset(split: str) -> dict: dataset_paths = { "train": drivelm_train_json, "val": drivelm_val_json, diff --git a/src/data/query_item.py b/src/data/query_item.py index a138034..a2a644e 100644 --- a/src/data/query_item.py +++ b/src/data/query_item.py @@ -1,6 +1,7 @@ from dataclasses import dataclass, field from typing import Any, Dict, List, Optional, Tuple +from src.data.get_sensor_calibration import CameraCalibration from src.data.message_formats import MessageFormat @@ -11,6 +12,7 @@ class QueryItem: qa_id: str qa_type: str tags: List[str] + camera_calibration: CameraCalibration key_object_info: Optional[Dict[str, Any]] = None system_prompt: str = None ground_truth_answer: Optional[str] = None diff --git a/src/data/system_prompts.py b/src/data/system_prompts.py index 41fc68c..85b070d 100644 --- a/src/data/system_prompts.py +++ b/src/data/system_prompts.py @@ -13,20 +13,26 @@ def __init__(self, config_path=None): self.prompts = yaml.safe_load(f) def get_approach_prompt( - self, resize_factor: float, use_grid: bool = False, use_reasoning: bool = False, + self, resize_factor: float, use_grid: bool = False, use_reasoning: bool = False, add_bev: bool = False, front_cam: bool = False ) -> str: approach = self.prompts.get("approach_prompt", {}) prompt = approach.get("base", "You are an autonomous driving assistant. ") grid_prompts = approach.get("use_grid", {}) if use_grid: - im_size = get_resize_image_size(resize_factor, True) + im_size = get_resize_image_size(resize_factor, grid=True) prompt += grid_prompts.get( "enabled", f"You are provided with a grid of images with size {im_size[1], im_size[0]} of the current situation. Starting from the upper left, the upper row shows images from the 'FRONT_LEFT', 'FRONT' and 'FRONT_RIGHT' cameras respectively. Starting from the bottom left, the lower row shows images from the 'BACK_LEFT', 'BACK' and 'BACK_RIGHT' cameras respectively. ", ) + elif add_bev and not front_cam: + im_size = get_resize_image_size(resize_factor, bev=True) + prompt += f"You are provided with a birds eye view image with size {im_size[1], im_size[0]} of the vehicle and the sorrounding objects. The ego vehicle is marked in red, vehicles are marked in yellow and predestrians are marked in blue. Each of the objects is associated with an id, that corresponds to the id given in the list of key object infos. E.g. a vehicle with the id 'c1' would correspond to a key object ''. This view should provide you with a good overview of the objects surrounding the vehicle and their relative distance. " + elif add_bev and front_cam: + im_size = get_resize_image_size(resize_factor, bev=True, front_cam=True) + prompt += f"You are provided with the front view of the car together with a birds eye view image with size {im_size[1], im_size[0]} of the vehicle and the sorrounding objects. The ego vehicle is marked in red, vehicles are marked in yellow and predestrians are marked in blue. Each of the objects is associated with an id, that corresponds to the id given in the list of key object infos. E.g. a vehicle with the id 'c1' would correspond to a key object ''. This view should provide you with a good overview of the objects surrounding the vehicle and their relative distance. " else: - im_size = get_resize_image_size(resize_factor, False) + im_size = get_resize_image_size(resize_factor) prompt += grid_prompts.get( "disabled", f"You receive a single image with size {im_size[1], im_size[0]} from the front camera. ", diff --git a/src/eval/eval_models.py b/src/eval/eval_models.py index 86b40b4..9669b00 100644 --- a/src/eval/eval_models.py +++ b/src/eval/eval_models.py @@ -29,7 +29,9 @@ def evaluate_model( use_system_prompt: bool = False, system_prompt_config_path: Optional[str] = None, use_reasoning: bool = False, + front_cam: bool = False, add_kois: bool = False, + add_bev: bool = False, approach_name: Optional[str] = None, exclude_question_tags: List[int] = [], exclude_question_types: List[str] = [], @@ -38,7 +40,9 @@ def evaluate_model( dataset = DriveLMImageDataset( message_format=engine.message_formatter, split=dataset_split, + front_cam=front_cam, add_kois=add_kois, + add_bev=add_bev, use_grid=use_grid, use_reasoning=use_reasoning, use_system_prompt=use_system_prompt, diff --git a/src/models/qwen_vl_inference.py b/src/models/qwen_vl_inference.py index 6487604..342c497 100644 --- a/src/models/qwen_vl_inference.py +++ b/src/models/qwen_vl_inference.py @@ -17,7 +17,7 @@ class QwenVLInferenceEngine(BaseInferenceEngine): def __init__( self, - processor_path: str = "Qwen/Qwen2.5-VL-3B-Instruct", + processor_path: str = "Qwen/Qwen2.5-VL-7B-Instruct", model_path: Optional[str] = None, use_4bit: bool = False, torch_dtype: Optional[torch.dtype] = None, diff --git a/src/reasoning/reasoning_engine.py b/src/reasoning/reasoning_engine.py index 02f11aa..eb79c85 100644 --- a/src/reasoning/reasoning_engine.py +++ b/src/reasoning/reasoning_engine.py @@ -37,7 +37,7 @@ def process_batch(self, batch_items: List[QueryItem]) -> List[QueryItem]: image_path=item.image_path, qa_id=f"{item.qa_id}_reasoning", qa_type=item.qa_type, - key_object_info=item.key_object_info, # note not available in eval mode + key_object_info=item.key_object_info, system_prompt=item.system_prompt, ) desc_item.formatted_message = desc_item.format_message( diff --git a/src/train/train_qwen.py b/src/train/train_qwen.py index 36c8e0c..6f9655a 100644 --- a/src/train/train_qwen.py +++ b/src/train/train_qwen.py @@ -41,7 +41,7 @@ class TrainingArguments(transformers.TrainingArguments): cache_dir: Optional[str] = field(default=None) optim: str = field(default="adamw_torch") model_max_length: int = field( - default=512, + default=1028, metadata={ "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)." }, @@ -242,7 +242,6 @@ def create_optimizer(self): return self.optimizer -# TODO: Look into the deepspeed config def train( approach_name: str, resize_factor: float, @@ -252,6 +251,9 @@ def train( use_augmented: bool = False, use_reasoning: bool = False, use_system_prompt: bool = False, + add_kois: bool = False, + add_bev: bool = False, + front_cam: bool = False, **kwargs, ): name = approach_name + datetime.now().strftime("%H:%M:%S-%m-%d-%Y%") @@ -310,6 +312,9 @@ def collator(batch: Any): dataset = DriveLMImageDataset( engine.training_message_formatter, split="train", + front_cam=front_cam, + add_kois=add_kois, + add_bev=add_bev, use_grid=use_grid, add_augmented=use_augmented, use_reasoning=use_reasoning, @@ -317,7 +322,7 @@ def collator(batch: Any): resize_factor=resize_factor, ) if test_set_size is not None: - dataset = create_subset(dataset, int(test_set_size)) + dataset = create_subset(dataset, int(test_set_size), equal_distribution=True) dataset = [item.formatted_message for item in dataset] engine.load_model(flash_attn=False) diff --git a/src/utils/approach.py b/src/utils/approach.py index 651ebf2..7cba317 100644 --- a/src/utils/approach.py +++ b/src/utils/approach.py @@ -3,9 +3,11 @@ def get_approach_kwargs(approaches: List[str]) -> Dict[str, Any]: approach_kwargs_map = { + "front_cam": {"front_cam": True}, "image_grid": {"use_grid": True}, "descriptor_qas": {"use_augmented": True}, "add_kois": {"add_kois": True}, + "add_bev": {"add_bev": True}, "reasoning": {"use_reasoning": True}, "system_prompt": {"use_system_prompt": True}, # Add more approaches here as needed diff --git a/src/utils/utils.py b/src/utils/utils.py index df46d2b..09f4aae 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -6,7 +6,7 @@ import torch from torch.utils.data import Dataset, Subset -from src.constants import GRID_IMG_SIZE, IMAGE_SIZE, GRID_POSITIONS +from src.constants import BEV_IMG_SIZE, BEV_AND_FRONT_CAM_IMG_SIZE, GRID_IMG_SIZE, IMAGE_SIZE, GRID_POSITIONS from src.data.query_item import QueryItem from src.utils.logger import get_logger @@ -136,9 +136,13 @@ def tuple_mul(t: Tuple[float, float], scalar: float) -> Tuple[float, float]: return (t[0] * scalar, t[1] * scalar) -def get_resize_image_size(resize_factor: float, grid: bool = False) -> Tuple[int, int]: +def get_resize_image_size(resize_factor: float, grid: bool = False, bev: bool = False, front_cam: bool = False) -> Tuple[int, int]: if grid: size = tuple_mul(GRID_IMG_SIZE, resize_factor) + elif bev and not front_cam: + size = tuple_mul(BEV_IMG_SIZE, resize_factor) + elif bev and front_cam: + size = tuple_mul(BEV_AND_FRONT_CAM_IMG_SIZE, resize_factor) else: size = tuple_mul(IMAGE_SIZE, resize_factor) return tuple_cast(size, int)