From bd712781d485607e67767c7a19a0c3004c14387f Mon Sep 17 00:00:00 2001
From: Veit Laule <veit.laule@gmail.com>
Date: Sat, 26 Jul 2025 14:13:39 +0200
Subject: [PATCH 01/16] get camera calibration for query items

---
 requirements.txt                   |  1 +
 src/data/basic_dataset.py          |  7 +++
 src/data/get_sensor_calibration.py | 72 ++++++++++++++++++++++++++++++
 src/data/query_item.py             |  2 +
 4 files changed, 82 insertions(+)
 create mode 100644 src/data/get_sensor_calibration.py

diff --git a/requirements.txt b/requirements.txt
index 8e0bfb3..d7f6619 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,3 +13,4 @@ gdown~=5.2.0
 pre-commit~=4.2.0
 peft~=0.15.2
 trl~=0.18.1
+polars==1.31.0
diff --git a/src/data/basic_dataset.py b/src/data/basic_dataset.py
index c3c37a8..0ee73c0 100644
--- a/src/data/basic_dataset.py
+++ b/src/data/basic_dataset.py
@@ -5,6 +5,7 @@
 
 from src.constants import drivelm_dir
 from src.data.generate_reasoning_context import generate_reasoning_context
+from src.data.get_sensor_calibration import get_sample_data_and_calibrated_camera_lf, get_camera_calibration
 from src.data.load_dataset import load_dataset
 from src.data.message_formats import MessageFormat
 from src.data.query_item import QueryItem
@@ -48,6 +49,7 @@ def __init__(
         self.use_reasoning = use_reasoning
         self.use_grid = use_grid
         self.use_system_prompt = use_system_prompt
+        self.calibration_lf = get_sample_data_and_calibrated_camera_lf()
 
         data = load_dataset(
             split,
@@ -114,6 +116,7 @@ def __init__(
                             "qa": remove_nones(qa),
                             "qa_type": qa_types[i],
                             "id": scene_id + "_" + key_frame_id + "_" + str(i),
+                            "key_frame_id": key_frame_id,
                             "key_object_info": key_object_infos
                             if qa_types[i] != "perception"
                             else None,
@@ -132,6 +135,7 @@ def __len__(self):
 
     def __getitem__(self, idx):
         qa = self.qas[idx]
+        key_frame_id = qa["key_frame_id"]
         question = qa["qa"]["Q"]
         answer = qa["qa"]["A"]
         tags = qa["qa"].get("tag", [])
@@ -147,6 +151,8 @@ def __getitem__(self, idx):
             else None
         )
 
+        camera_calibration = get_camera_calibration(self.calibration_lf, key_frame_id)
+
         query_item = QueryItem(
             question=question,
             image_path=image_path,
@@ -156,6 +162,7 @@ def __getitem__(self, idx):
             key_object_info=key_object_info,
             system_prompt=system_prompt,
             ground_truth_answer=answer,
+            camera_calibration=camera_calibration,
         )
 
         if self.use_reasoning and self.split == "train":
diff --git a/src/data/get_sensor_calibration.py b/src/data/get_sensor_calibration.py
new file mode 100644
index 0000000..07916b3
--- /dev/null
+++ b/src/data/get_sensor_calibration.py
@@ -0,0 +1,72 @@
+import polars as pl
+
+from src.constants import nuscenes_dir
+
+
+def get_sample_data_and_calibrated_camera_lf() -> pl.LazyFrame:
+    sample_data_lf = pl.read_json(nuscenes_dir / "sample_data.json").lazy()
+    sample_data_lf = sample_data_lf.filter(
+        pl.col("is_key_frame") == True  # noqa: E712
+    ).select([
+        "token",
+        "sample_token",
+        "calibrated_sensor_token"
+    ])
+    calibrated_camera_lf = pl.read_json(nuscenes_dir / "calibrated_sensor.json").lazy()
+    calibrated_camera_lf = calibrated_camera_lf.filter(
+        pl.col("camera_intrinsic").len() != 0
+    )
+    sensor_lf = pl.read_json(nuscenes_dir / "sensor.json").lazy()
+    calibrated_camera_with_sensor_type_lf = calibrated_camera_lf.join(
+        sensor_lf,
+        left_on="sensor_token",
+        right_on="token",
+        suffix="_sensor",
+    )
+    return sample_data_lf.join(
+        calibrated_camera_with_sensor_type_lf, 
+        left_on="calibrated_sensor_token", 
+        right_on="token", 
+        suffix="_calibrated"
+    )
+
+
+cameras = [
+    "CAM_FRONT",
+    "CAM_FRONT_LEFT",
+    "CAM_FRONT_RIGHT",
+    "CAM_BACK",
+    "CAM_BACK_LEFT",
+    "CAM_BACK_RIGHT",
+]
+
+# TODO: Think about using np arrays here instead
+class CameraCalibration:
+    camera_intrinsic: list[list[float]]
+    translation: list[float]
+    rotation: list[float]
+
+    def __init__(self, camera_intrinsic, translation, rotation):
+        self.camera_intrinsic = camera_intrinsic
+        self.translation = translation
+        self.rotation = rotation
+
+
+def get_camera_calibration(lf: pl.LazyFrame, key_frame_id) -> dict[str, CameraCalibration]:
+    calibration_per_camera = {}
+    for cam in cameras:
+        calibration = lf.filter(
+            pl.col("channel") == cam,
+            pl.col("sample_token") == key_frame_id
+        ).select(
+            "translation",
+            "rotation",
+            "camera_intrinsic"
+        ).collect().to_dict()
+        assert len(calibration["translation"]) == 1
+        calibration_per_camera[cam] = CameraCalibration(
+            camera_intrinsic=calibration["camera_intrinsic"][0].to_list(),
+            translation=calibration["translation"][0].to_list(),
+            rotation=calibration["rotation"][0].to_list(),
+        )
+    return calibration_per_camera
diff --git a/src/data/query_item.py b/src/data/query_item.py
index a138034..a2a644e 100644
--- a/src/data/query_item.py
+++ b/src/data/query_item.py
@@ -1,6 +1,7 @@
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Tuple
 
+from src.data.get_sensor_calibration import CameraCalibration
 from src.data.message_formats import MessageFormat
 
 
@@ -11,6 +12,7 @@ class QueryItem:
     qa_id: str
     qa_type: str
     tags: List[str]
+    camera_calibration: CameraCalibration
     key_object_info: Optional[Dict[str, Any]] = None
     system_prompt: str = None
     ground_truth_answer: Optional[str] = None

From 6104fe99f15edcea3f5d54146c7cbe2af9176c67 Mon Sep 17 00:00:00 2001
From: Veit Laule <veit.laule@gmail.com>
Date: Sat, 26 Jul 2025 14:42:07 +0200
Subject: [PATCH 02/16] add download link to nuscenes json

---
 src/data/load_dataset.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/data/load_dataset.py b/src/data/load_dataset.py
index 1f0afe3..48778dd 100644
--- a/src/data/load_dataset.py
+++ b/src/data/load_dataset.py
@@ -47,6 +47,12 @@ def get_ds(split: str) -> None:
             id="1fsVP7jOpvChcpoXVdypaZ4HREX1gA7As",
             output=os.path.join(drivelm_dir, "v1_1_val_nus_q_only.json"),
         )
+    out_name = os.path.join(nuscenes_dir, "nuscenes_json.zip")
+    gdown.download(
+        id="1sqW1y2k346mtLCQnO0NAab3sEzxUyQ_d",
+        output=out_name,
+    )
+    extract_children(out_name, nuscenes_dir)
 
 
 def load_dataset(

From 82c92963accc55f1f05f3d23bebd9b05a277fc90 Mon Sep 17 00:00:00 2001
From: csiemssen <100309871+csiemssen@users.noreply.github.com>
Date: Fri, 25 Jul 2025 15:00:40 +0200
Subject: [PATCH 03/16] add milestone 3 and report to readme (#94)

---
 README.md | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index d6a709c..38e22a3 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,16 @@
-## ⚒️ Tools
-#### Zotero
-Citation/Research Manager
-[Group](https://www.zotero.org/groups/5975647/app-ras-driving-with-language)
-
-## 📚 DriveLM Challenge
+## 🏆 DriveLM Challenge
 - [Challenge Website](https://opendrivelab.com/challenge2024/#driving_with_language)
 - [GitHub Repository](https://github.com/OpenDriveLab/DriveLM)
 - [Team Google Form](https://docs.google.com/forms/d/e/1FAIpQLSef_L4L9jXV_88pXkuFmaloifhRuFjVARbjsV-8GWETc6aNCA/viewform)
 
-
-## 💬 Milestone Presentations
+## 📄 Presentations and Report
 - [Milestone 1](https://docs.google.com/presentation/d/13reSKMykn5WhVyi5zi5oK5OygVjTZljeMWflJejQZlw/edit?slide=id.g32bc6f01e94_0_43#slide=id.g32bc6f01e94_0_43)
 - [Milestone 2](https://docs.google.com/presentation/d/1suusmSruqXyRdfvViq1NKfDEqTpH5-M9w7zgh7HDCAo/edit?slide=id.g32bc6f01e94_0_74#slide=id.g32bc6f01e94_0_74)
+- [Milestone 3](https://docs.google.com/presentation/d/1Hpav8SiMT5LqfAGE8KdR5SxGAkOWBNx9zaBSD1SdWUQ/edit?slide=id.g32bc6f01e94_0_74#slide=id.g32bc6f01e94_0_74)
+- [Report](https://www.overleaf.com/project/6877602e966f2ddf5c867888)
+
+#### ⚒️ Tools
+ - [Zotero](https://www.zotero.org/groups/5975647/app-ras-driving-with-language)
 
 ## Setup
 - Download the [NuScenes](https://github.com/OpenDriveLab/DriveLM/tree/main/challenge) training and validation datasets, and place them together in the `data/nuscenes` directory

From 094e05908f980869437d404769ccae2bcd861c2f Mon Sep 17 00:00:00 2001
From: Veit Laule <83905032+vDawgg@users.noreply.github.com>
Date: Sun, 27 Jul 2025 13:32:07 +0200
Subject: [PATCH 04/16] yolo test pipeline for KOI generation with quality
 check (#100)

---
 .gitignore                         |  2 ++
 main.py                            |  1 +
 merge_model_and_adapter.py         |  4 ++-
 requirements.txt                   |  1 +
 src/data/basic_dataset.py          |  4 ++-
 src/data/generate_yolo_kois.py     | 40 ++++++++++++++++++++++++++++++
 src/data/load_dataset.py           |  5 ++++
 src/data/message_formats.py        | 17 +++++++------
 src/eval/eval_models.py            |  2 ++
 src/train/train_qwen.py            |  1 +
 src/utils/approach.py              |  1 +
 src/utils/intern_vl_image_utils.py |  8 ++----
 src/utils/logger.py                |  4 +--
 tests/test_message_format.py       |  6 ++---
 14 files changed, 74 insertions(+), 22 deletions(-)
 create mode 100644 src/data/generate_yolo_kois.py

diff --git a/.gitignore b/.gitignore
index 21d1fbb..74149e9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -199,3 +199,5 @@ backup/
 
 # ignore vscode config
 .vscode/
+
+yolo11*
diff --git a/main.py b/main.py
index 59f6df7..6463c1a 100644
--- a/main.py
+++ b/main.py
@@ -28,6 +28,7 @@
             "front_cam",
             "image_grid",
             "descriptor_qas",
+            "add_kois",
             "reasoning",
             "system_prompt",
         ],
diff --git a/merge_model_and_adapter.py b/merge_model_and_adapter.py
index 4615297..2388d01 100644
--- a/merge_model_and_adapter.py
+++ b/merge_model_and_adapter.py
@@ -16,7 +16,9 @@
 )
 args = parser.parse_args()
 
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    "Qwen/Qwen2.5-VL-3B-Instruct"
+)
 
 model = PeftModel.from_pretrained(
     model=model,
diff --git a/requirements.txt b/requirements.txt
index d7f6619..200f3dc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,3 +14,4 @@ pre-commit~=4.2.0
 peft~=0.15.2
 trl~=0.18.1
 polars==1.31.0
+ultralytics==8.3.168
diff --git a/src/data/basic_dataset.py b/src/data/basic_dataset.py
index 0ee73c0..74c555a 100644
--- a/src/data/basic_dataset.py
+++ b/src/data/basic_dataset.py
@@ -39,6 +39,7 @@ def __init__(
         message_format: MessageFormat,
         split="train",
         add_augmented=False,
+        add_kois=False,
         use_grid=False,
         use_reasoning=False,
         use_system_prompt=False,
@@ -54,6 +55,7 @@ def __init__(
         data = load_dataset(
             split,
             add_augmented=add_augmented,
+            add_kois=add_kois,
             use_grid=use_grid,
             exclude_tags=exclude_question_tags,
         )
@@ -83,7 +85,7 @@ def __init__(
 
                 key_object_infos = (
                     scene_obj[key_frame_id]["key_object_infos"]
-                    if split == "train"
+                    if split == "train" or add_kois
                     else None
                 )
 
diff --git a/src/data/generate_yolo_kois.py b/src/data/generate_yolo_kois.py
new file mode 100644
index 0000000..4a2562a
--- /dev/null
+++ b/src/data/generate_yolo_kois.py
@@ -0,0 +1,40 @@
+import os
+
+from ultralytics import YOLO
+
+from src.constants import drivelm_dir
+
+
+def generate_yolo_kois(data):
+    model = YOLO("yolo11n.pt")
+    for _, scene_obj in data.items():
+        for _, key_frame in scene_obj["key_frames"].items():
+            image_paths_raw = key_frame["image_paths"]
+            i = 0
+            kois = []
+            for camera, image_path in image_paths_raw.items():
+                results = model(os.path.join(drivelm_dir, image_path))
+                center_points = [
+                    (xywh[0], xywh[1]) for res in results for xywh in res.boxes.xywh
+                ]
+                categories = [
+                    res.names[cls.item()]
+                    for res in results
+                    for cls in res.boxes.cls.int()
+                ]
+                for j in range(len(center_points)):
+                    i += 1
+                    kois.append(
+                        (
+                            f"<c{i},{camera},{center_points[j][0]},{center_points[j][1]}>",
+                            categories[j],
+                        )
+                    )
+            key_frame["key_object_infos"] = {
+                descriptor: {
+                    "Category": category,
+                }
+                for descriptor, category in kois
+            }
+
+    return data
diff --git a/src/data/load_dataset.py b/src/data/load_dataset.py
index 48778dd..9f44e0c 100644
--- a/src/data/load_dataset.py
+++ b/src/data/load_dataset.py
@@ -16,6 +16,7 @@
 from src.data.generate_descriptor_qas import (
     generate_descriptor_qas,
 )
+from src.data.generate_yolo_kois import generate_yolo_kois
 from src.utils.logger import get_logger
 from src.utils.utils import extract_children
 
@@ -58,6 +59,7 @@ def get_ds(split: str) -> None:
 def load_dataset(
     split: str,
     add_augmented: bool = False,
+    add_kois: bool = False,
     use_grid: bool = False,
     exclude_tags: List[int] = [],
 ):
@@ -87,6 +89,9 @@ def load_dataset(
     if split == "train" and add_augmented:
         data = generate_descriptor_qas(data)
 
+    if split == "val" and add_kois:
+        data = generate_yolo_kois(data)
+
     if use_grid:
         data = create_image_grid_dataset(data)
 
diff --git a/src/data/message_formats.py b/src/data/message_formats.py
index f4e7231..6d63cf4 100644
--- a/src/data/message_formats.py
+++ b/src/data/message_formats.py
@@ -29,19 +29,13 @@ def format(
         content = []
         if system_prompt:
             content.append({"type": "text", "text": system_prompt})
-        content.append({"type": "text", "text": "Question: " + question})
-        content.append(
-            {
-                "type": "image",
-                "image": f"file://{image_path}",
-            }
-        )
 
         if key_object_info:
             content.append(
                 {
                     "type": "text",
-                    "text": "Key object infos:\n" + key_object_info.__str__(),
+                    "text": "List of objects in the scene:\n"
+                    + key_object_info.__str__(),
                 }
             )
 
@@ -52,6 +46,13 @@ def format(
                 )
                 content.append({"type": "text", "text": f"Context Answer: {context_a}"})
 
+        content.append({"type": "text", "text": "Question: " + question})
+        content.append(
+            {
+                "type": "image",
+                "image": f"file://{image_path}",
+            }
+        )
         return {
             "role": "user",
             "content": content,
diff --git a/src/eval/eval_models.py b/src/eval/eval_models.py
index 9c203f6..cd22780 100644
--- a/src/eval/eval_models.py
+++ b/src/eval/eval_models.py
@@ -23,11 +23,13 @@ def evaluate_model(
     use_grid: bool = False,
     use_system_prompt: bool = False,
     use_reasoning: bool = False,
+    add_kois: bool = False,
     approach_name: Optional[str] = None,
 ):
     dataset = DriveLMImageDataset(
         message_format=engine.message_formatter,
         split=dataset_split,
+        add_kois=add_kois,
         use_grid=use_grid,
         use_system_prompt=use_system_prompt,
         use_reasoning=use_reasoning,
diff --git a/src/train/train_qwen.py b/src/train/train_qwen.py
index f97c6be..77bb01a 100644
--- a/src/train/train_qwen.py
+++ b/src/train/train_qwen.py
@@ -252,6 +252,7 @@ def train(
     use_augmented: bool = False,
     use_reasoning: bool = False,
     use_system_prompt: bool = False,
+    **kwargs,
 ):
     name = approach_name + datetime.now().strftime("%H:%M:%S-%m-%d-%Y%")
 
diff --git a/src/utils/approach.py b/src/utils/approach.py
index 7aa21ce..651ebf2 100644
--- a/src/utils/approach.py
+++ b/src/utils/approach.py
@@ -5,6 +5,7 @@ def get_approach_kwargs(approaches: List[str]) -> Dict[str, Any]:
     approach_kwargs_map = {
         "image_grid": {"use_grid": True},
         "descriptor_qas": {"use_augmented": True},
+        "add_kois": {"add_kois": True},
         "reasoning": {"use_reasoning": True},
         "system_prompt": {"use_system_prompt": True},
         # Add more approaches here as needed
diff --git a/src/utils/intern_vl_image_utils.py b/src/utils/intern_vl_image_utils.py
index 646cb34..e805423 100644
--- a/src/utils/intern_vl_image_utils.py
+++ b/src/utils/intern_vl_image_utils.py
@@ -11,9 +11,7 @@
 def build_transform(input_size: int):
     return T.Compose(
         [
-            T.Lambda(
-                lambda img: img.convert("RGB") if img.mode != "RGB" else img
-            ),
+            T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
             T.Resize(
                 (input_size, input_size),
                 interpolation=InterpolationMode.BICUBIC,
@@ -24,9 +22,7 @@ def build_transform(input_size: int):
     )
 
 
-def find_closest_aspect_ratio(
-    aspect_ratio, target_ratios, width, height, image_size
-):
+def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
     best_ratio_diff = float("inf")
     best_ratio = (1, 1)
     area = width * height
diff --git a/src/utils/logger.py b/src/utils/logger.py
index 0071fa3..2caa869 100644
--- a/src/utils/logger.py
+++ b/src/utils/logger.py
@@ -11,9 +11,7 @@
 root_logger = logging.getLogger()
 root_logger.setLevel(logging.INFO)
 
-formatter = logging.Formatter(
-    "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-)
+formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 
 stream_handler = logging.StreamHandler(sys.stdout)
 stream_handler.setFormatter(formatter)
diff --git a/tests/test_message_format.py b/tests/test_message_format.py
index f9fa58c..e9debe8 100644
--- a/tests/test_message_format.py
+++ b/tests/test_message_format.py
@@ -28,14 +28,14 @@ def test_format_of_qwen_message(self):
             "role": "user",
             "content": [
                 {"type": "text", "text": system_prompt},
-                {"type": "text", "text": "Question: " + question},
-                {"type": "image", "image": "file:///path/to/your/image.jpg"},
                 {
                     "type": "text",
-                    "text": "Key object infos:\n{'object': 'car', 'color': 'red'}",
+                    "text": "List of objects in the scene:\n{'object': 'car', 'color': 'red'}",
                 },
                 {"type": "text", "text": "Context Question: What is this?"},
                 {"type": "text", "text": "Context Answer: This is a car."},
+                {"type": "text", "text": "Question: " + question},
+                {"type": "image", "image": "file:///path/to/your/image.jpg"},
             ],
         }
         self.assertEqual(

From 432ce688aeedc01bffc7ea6ba5176a821c828b6d Mon Sep 17 00:00:00 2001
From: Veit Laule <veit.laule@gmail.com>
Date: Wed, 30 Jul 2025 14:22:21 +0200
Subject: [PATCH 05/16] initial (convoluted) bev generation

---
 main.py                            |   2 +-
 src/constants.py                   |   1 +
 src/data/basic_dataset.py          |   9 +-
 src/data/generate_bev.py           | 361 +++++++++++++++++++++++++++++
 src/data/generate_yolo_kois.py     |  11 +-
 src/data/get_sensor_calibration.py |   9 +-
 src/data/load_dataset.py           |   7 +
 7 files changed, 389 insertions(+), 11 deletions(-)
 create mode 100644 src/data/generate_bev.py

diff --git a/main.py b/main.py
index 6463c1a..2baf380 100644
--- a/main.py
+++ b/main.py
@@ -44,7 +44,7 @@
         "--dataset_split",
         help="The dataset split to use for training / evaluation.",
         type=str,
-        choices=["train", "val"],
+        choices=["train", "val", "test"],
         default="val",
     )
     parser.add_argument(
diff --git a/src/constants.py b/src/constants.py
index 770ee4b..737b39d 100644
--- a/src/constants.py
+++ b/src/constants.py
@@ -5,6 +5,7 @@
 drivelm_dir = data_dir / "drivelm"
 nuscenes_dir = data_dir / "nuscenes"
 grid_dir = nuscenes_dir / "samples" / "GRID"
+bev_dir = nuscenes_dir / "samples" / "BEV"
 drivelm_train_json = drivelm_dir / "v1_1_train_nus.json"
 drivelm_val_json = drivelm_dir / "v1_1_val_nus_q_only.json"
 drivelm_test_json = drivelm_dir / "v1_1_test_nus.json"
diff --git a/src/data/basic_dataset.py b/src/data/basic_dataset.py
index 74c555a..9d24fc8 100644
--- a/src/data/basic_dataset.py
+++ b/src/data/basic_dataset.py
@@ -5,7 +5,6 @@
 
 from src.constants import drivelm_dir
 from src.data.generate_reasoning_context import generate_reasoning_context
-from src.data.get_sensor_calibration import get_sample_data_and_calibrated_camera_lf, get_camera_calibration
 from src.data.load_dataset import load_dataset
 from src.data.message_formats import MessageFormat
 from src.data.query_item import QueryItem
@@ -50,7 +49,6 @@ def __init__(
         self.use_reasoning = use_reasoning
         self.use_grid = use_grid
         self.use_system_prompt = use_system_prompt
-        self.calibration_lf = get_sample_data_and_calibrated_camera_lf()
 
         data = load_dataset(
             split,
@@ -89,6 +87,8 @@ def __init__(
                     else None
                 )
 
+                camera_calibration = scene_obj[key_frame_id]["camera_calibration"]
+
                 qas = scene_obj[key_frame_id]["QA"]
 
                 qas_perception = qas["perception"]
@@ -119,6 +119,7 @@ def __init__(
                             "qa_type": qa_types[i],
                             "id": scene_id + "_" + key_frame_id + "_" + str(i),
                             "key_frame_id": key_frame_id,
+                            "camera_calibration": camera_calibration,
                             "key_object_info": key_object_infos
                             if qa_types[i] != "perception"
                             else None,
@@ -137,10 +138,10 @@ def __len__(self):
 
     def __getitem__(self, idx):
         qa = self.qas[idx]
-        key_frame_id = qa["key_frame_id"]
         question = qa["qa"]["Q"]
         answer = qa["qa"]["A"]
         tags = qa["qa"].get("tag", [])
+        camera_calibration = qa["camera_calibration"]
         key_object_info = qa["key_object_info"]
         image_path = qa["image_path"]
         system_prompt = (
@@ -153,8 +154,6 @@ def __getitem__(self, idx):
             else None
         )
 
-        camera_calibration = get_camera_calibration(self.calibration_lf, key_frame_id)
-
         query_item = QueryItem(
             question=question,
             image_path=image_path,
diff --git a/src/data/generate_bev.py b/src/data/generate_bev.py
new file mode 100644
index 0000000..6ae92c2
--- /dev/null
+++ b/src/data/generate_bev.py
@@ -0,0 +1,361 @@
+import cv2
+import os
+import numpy as np
+from scipy.spatial.transform import Rotation as R_scipy
+from tqdm import tqdm
+
+from src.constants import bev_dir, drivelm_dir
+from src.data.get_sensor_calibration import CameraCalibration
+from src.utils.utils import get_logger
+
+
+logger = get_logger(__name__)
+
+
+def generate_bev_from_detections(
+        calibration: dict[str, CameraCalibration], 
+        kois: dict,
+    ) -> np.ndarray:
+    """
+    Generates a Bird's-Eye View (BEV) map from detected objects for a keyframe,
+    using nuScenes camera calibration information.
+
+    Args:
+        calibration: A dictionary where keys are camera names (e.g., 'CAM_FRONT')
+                     and values are CameraCalibration objects.
+        kois: A dictionary where keys contain camera names and values contain
+              object detection information including 2d_bbox and Category.
+    Returns:
+        A NumPy array representing the BEV map (H, W, 3).
+    """
+    bev_map_res_m_per_pixel = 0.1
+    bev_map_x_range = 50.0
+    bev_map_y_range = 50.0
+
+    # --- BEV Map Initialization ---
+    # Calculate min/max extents based on ranges to center ego (0,0)
+    x_min_m = -bev_map_x_range / 2.0
+    y_min_m = -bev_map_y_range / 2.0
+
+    bev_map_width_pixels = int(bev_map_x_range / bev_map_res_m_per_pixel)
+    bev_map_height_pixels = int(bev_map_y_range / bev_map_res_m_per_pixel)
+
+    bev_map = np.zeros((bev_map_height_pixels, bev_map_width_pixels, 3), dtype=np.uint8)
+    bev_map.fill(20)  # Dark background for the BEV map
+
+    all_projected_objects = []
+
+    total_items = 0
+    for camera_name, cam_calib in calibration.items():
+        current_camera_kois = [koi_val for koi_key, koi_val in kois.items() if camera_name in koi_key]
+        current_camera_boxes = [koi["2d_bbox"] for koi in current_camera_kois]
+        total_items += len(current_camera_boxes)
+        current_camera_names = [koi["Category"] for koi in current_camera_kois]
+
+        K = np.array(cam_calib.camera_intrinsic, dtype=np.float64)
+        t_camera_to_ego = np.array(cam_calib.translation, dtype=np.float64) # (x, y, z)
+        q_camera_to_ego = np.array(cam_calib.rotation, dtype=np.float64) # (w, x, y, z)
+
+        # 1. Convert quaternion to rotation matrix: R_ego_from_camera
+        # nuScenes quaternion is (w, x, y, z) -> scipy Rotation.from_quat expects (x, y, z, w)
+        r_ego_from_camera_scipy = R_scipy.from_quat([q_camera_to_ego[1], q_camera_to_ego[2], q_camera_to_ego[3], q_camera_to_ego[0]])
+        R_ego_from_camera = r_ego_from_camera_scipy.as_matrix() # 3x3 rotation matrix from camera to ego
+
+        for i in range(len(current_camera_boxes)):
+            bbox = current_camera_boxes[i]
+            obj_name = current_camera_names[i]
+
+            # Use the bottom-center of the 2D bounding box as the ground contact point heuristic.
+            x1, y1, x2, y2 = bbox
+            bottom_center_2d = np.array([(x1 + x2) / 2, y2], dtype=np.float64)
+
+            # --- Project 2D image point back to 3D on the ground plane (Z=0 in ego frame) ---
+
+            # Convert 2D image point to a 3D ray direction in the camera frame (normalized coordinates).
+            uv_hom = np.array([bottom_center_2d[0], bottom_center_2d[1], 1.0], dtype=np.float64).reshape(3, 1)
+            K_inv = np.linalg.inv(K)
+            ray_direction_camera_frame = np.dot(K_inv, uv_hom).flatten()
+
+            # Transform the ray from the camera frame to the ego vehicle frame.
+            ray_origin_ego = t_camera_to_ego
+            ray_direction_ego = np.dot(R_ego_from_camera, ray_direction_camera_frame)
+
+            # Intersect the ray with the ground plane (Z_ego = 0).
+            if np.isclose(ray_direction_ego[2], 0.0):
+                continue # Ray is parallel or near-parallel to ground plane
+
+            lam = -ray_origin_ego[2] / ray_direction_ego[2]
+
+            # Ensure the intersection point is in front of the camera (positive lambda).
+            if lam < 0:
+                continue
+
+            point_3d_ego = ray_origin_ego + lam * ray_direction_ego
+
+            # Store the projected object's information
+            projected_object_info = {
+                'class': obj_name,
+                'x_ego': point_3d_ego[0],
+                'y_ego': point_3d_ego[1],
+                'z_ego': point_3d_ego[2], # Should be close to 0
+                'camera_name': camera_name,
+                'original_bbox': bbox
+            }
+            all_projected_objects.append(projected_object_info)
+
+    logger.debug(f"Total objects detected across all cameras: {total_items}")
+    logger.debug(f"Total objects after initial projection: {len(all_projected_objects)}")
+    
+    # --- Remove Duplicate Objects ---
+    # Group objects by spatial proximity and class, keep the one with best visibility
+    unique_objects = []
+    proximity_threshold = 2.0  # meters - objects within this distance are considered duplicates
+    duplicates_removed = 0
+    
+    for obj in all_projected_objects:
+        is_duplicate = False
+        for unique_obj in unique_objects:
+            # Check if objects are of same class and spatially close
+            if (obj['class'] == unique_obj['class'] and
+                np.sqrt((obj['x_ego'] - unique_obj['x_ego'])**2 + 
+                       (obj['y_ego'] - unique_obj['y_ego'])**2) < proximity_threshold):
+                
+                # Keep the object from the camera that provides better view
+                # Prefer front cameras for forward objects, side cameras for side objects, etc.
+                current_distance = np.sqrt(obj['x_ego']**2 + obj['y_ego']**2)
+                unique_distance = np.sqrt(unique_obj['x_ego']**2 + unique_obj['y_ego']**2)
+                
+                # Replace if current object is closer or from a more appropriate camera
+                if (current_distance < unique_distance or 
+                    _is_better_camera_view(obj, unique_obj)):
+                    unique_objects.remove(unique_obj)
+                    unique_objects.append(obj)
+                else:
+                    duplicates_removed += 1
+                is_duplicate = True
+                break
+        
+        if not is_duplicate:
+            unique_objects.append(obj)
+
+    logger.debug(f"Total objects after duplicate removal: {len(unique_objects)} (removed {duplicates_removed} duplicates)")
+
+    # --- Render Projected Objects onto the BEV Map ---
+    for obj_info in unique_objects:
+        x_ego = obj_info['x_ego']
+        y_ego = obj_info['y_ego']
+        obj_class = obj_info['class']
+
+        # Convert ego coordinates (meters) to BEV map pixel coordinates.
+        # In nuScenes coordinate system: X+ is right, Y+ is forward, Z+ is up
+        # BEV map: columns represent X (left-right), rows represent Y (forward-back)
+        
+        # Ego X range: [x_min_m, x_max_m] -> BEV columns: [0, bev_map_width_pixels-1]
+        col_bev = int((x_ego - x_min_m) / bev_map_res_m_per_pixel)
+        # Ego Y range: [y_min_m, y_max_m] -> BEV rows: [bev_map_height_pixels-1, 0] (inverted)
+        # Y+ (forward) should appear at top of image (lower row indices)
+        row_bev = int(bev_map_height_pixels - 1 - ((y_ego - y_min_m) / bev_map_res_m_per_pixel))
+
+        # Ensure projected point is within the defined BEV map boundaries
+        if 0 <= col_bev < bev_map_width_pixels and 0 <= row_bev < bev_map_height_pixels:
+            if 'car' in obj_class.lower() or 'vehicle' in obj_class.lower() or \
+               'truck' in obj_class.lower() or 'bus' in obj_class.lower() or \
+               'trailer' in obj_class.lower() or 'construction_vehicle' in obj_class.lower():
+                car_width_bev = int(2.0 / bev_map_res_m_per_pixel)
+                car_length_bev = int(4.5 / bev_map_res_m_per_pixel)
+                color = (0, 255, 255) # Yellow (BGR)
+
+                cv2.rectangle(bev_map,
+                              (col_bev - car_width_bev // 2, row_bev - car_length_bev // 2),
+                              (col_bev + car_width_bev // 2, row_bev + car_length_bev // 2),
+                              color, -1)
+                cv2.putText(bev_map, 'Car', (col_bev - car_width_bev // 2, row_bev - car_length_bev // 2 - 5),
+                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
+
+            elif 'pedestrian' in obj_class.lower() or 'person' in obj_class.lower():
+                ped_width_bev = int(0.6 / bev_map_res_m_per_pixel) # Approx 0.6m wide
+                ped_length_bev = int(0.6 / bev_map_res_m_per_pixel) # Approx 0.6m long
+                color = (255, 0, 0) # Blue (BGR)
+
+                cv2.rectangle(bev_map,
+                              (col_bev - ped_width_bev // 2, row_bev - ped_length_bev // 2),
+                              (col_bev + ped_width_bev // 2, row_bev + ped_length_bev // 2),
+                              color, -1)
+                cv2.putText(bev_map, 'Ped', (col_bev - ped_width_bev // 2, row_bev - ped_length_bev // 2 - 5),
+                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
+
+            elif 'traffic_cone' in obj_class.lower():
+                cone_radius_bev = int(0.3 / bev_map_res_m_per_pixel / 2)
+                color = (0, 0, 255) # Red (BGR)
+                cv2.circle(bev_map, (col_bev, row_bev), cone_radius_bev, color, -1)
+                cv2.putText(bev_map, 'Cone', (col_bev - cone_radius_bev, row_bev - cone_radius_bev - 5),
+                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
+
+            elif 'barrier' in obj_class.lower():
+                barrier_width_bev = int(0.2 / bev_map_res_m_per_pixel)
+                barrier_length_bev = int(1.5 / bev_map_res_m_per_pixel)
+                color = (128, 128, 128) # Grey (BGR)
+                cv2.rectangle(bev_map,
+                              (col_bev - barrier_width_bev // 2, row_bev - barrier_length_bev // 2),
+                              (col_bev + barrier_width_bev // 2, row_bev + barrier_length_bev // 2),
+                              color, -1)
+                cv2.putText(bev_map, 'Barrier', (col_bev - barrier_width_bev // 2, row_bev - barrier_length_bev // 2 - 5),
+                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
+
+    # --- Draw Ego Vehicle ---
+    ego_x_m = 0.0 # Ego vehicle is at (0,0) in its own frame
+    ego_y_m = 0.0
+
+    # Convert ego (0,0) to BEV map pixel coordinates
+    ego_col_bev = int((ego_x_m - x_min_m) / bev_map_res_m_per_pixel)
+    ego_row_bev = int(bev_map_height_pixels - 1 - ((ego_y_m - y_min_m) / bev_map_res_m_per_pixel))
+    
+    # Ego vehicle dimensions (approximate typical car size)
+    ego_width_m = 2.0
+    ego_length_m = 5.0
+    ego_width_pixels = int(ego_width_m / bev_map_res_m_per_pixel)
+    ego_length_pixels = int(ego_length_m / bev_map_res_m_per_pixel)
+    
+    ego_color = (0, 0, 255) # Red (BGR)
+    cv2.rectangle(bev_map,
+                  (ego_col_bev - ego_width_pixels // 2, ego_row_bev - ego_length_pixels // 2),
+                  (ego_col_bev + ego_width_pixels // 2, ego_row_bev + ego_length_pixels // 2),
+                  ego_color, -1)
+    cv2.putText(bev_map, 'Ego', (ego_col_bev - ego_width_pixels // 2, ego_row_bev - ego_length_pixels // 2 - 5),
+                cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
+    
+    # Draw a forward arrow for ego vehicle (Y+ is forward, should point towards top of image)
+    # Arrow points from center towards smaller row index (upward in image = forward in world)
+    arrow_end_y = ego_row_bev - ego_length_pixels // 2 - 10
+    cv2.arrowedLine(bev_map, (ego_col_bev, ego_row_bev), (ego_col_bev, arrow_end_y), (0, 255, 0), 2)
+
+    # Add orientation verification markers
+    _add_orientation_markers(bev_map, bev_map_width_pixels, bev_map_height_pixels)
+
+    # Validate BEV orientation with front camera objects
+    _validate_bev_orientation(unique_objects, bev_map_height_pixels, bev_map_res_m_per_pixel, y_min_m)
+
+    return bev_map
+
+
+def _add_orientation_markers(bev_map, width, height):
+    """
+    Add orientation markers to verify BEV coordinate system.
+    Front should be at top, back at bottom, left on left side, right on right side.
+    """
+    marker_color = (255, 255, 255)  # White
+    font = cv2.FONT_HERSHEY_SIMPLEX
+    font_scale = 0.6
+    thickness = 2
+    
+    # Add directional labels
+    cv2.putText(bev_map, 'FRONT', (width//2 - 30, 25), font, font_scale, marker_color, thickness)
+    cv2.putText(bev_map, 'BACK', (width//2 - 25, height - 10), font, font_scale, marker_color, thickness)
+    cv2.putText(bev_map, 'LEFT', (10, height//2), font, font_scale, marker_color, thickness)
+    cv2.putText(bev_map, 'RIGHT', (width - 60, height//2), font, font_scale, marker_color, thickness)
+    
+    # Add coordinate axes
+    center_x, center_y = width//2, height//2
+    axis_length = 30
+    
+    # X-axis (horizontal, positive to the right)
+    cv2.arrowedLine(bev_map, (center_x, center_y), (center_x + axis_length, center_y), (0, 255, 255), 2)
+    cv2.putText(bev_map, 'X+', (center_x + axis_length + 5, center_y + 5), font, 0.4, (0, 255, 255), 1)
+    
+    # Y-axis (vertical, positive upward/forward)
+    cv2.arrowedLine(bev_map, (center_x, center_y), (center_x, center_y - axis_length), (255, 0, 255), 2)
+    cv2.putText(bev_map, 'Y+', (center_x + 5, center_y - axis_length - 5), font, 0.4, (255, 0, 255), 1)
+
+
+def _validate_bev_orientation(objects, bev_height_pixels, resolution, y_min_m):
+    """
+    Validate that the BEV orientation is correct by checking if CAM_FRONT objects 
+    appear in the upper part of the image (smaller row indices).
+    """
+    front_objects = [obj for obj in objects if 'FRONT' in obj['camera_name'] and obj['y_ego'] > 0]
+    back_objects = [obj for obj in objects if 'BACK' in obj['camera_name'] and obj['y_ego'] < 0]
+    
+    if front_objects:
+        front_rows = []
+        for obj in front_objects:
+            y_ego = obj['y_ego']
+            row_bev = int(bev_height_pixels - 1 - ((y_ego - y_min_m) / resolution))
+            front_rows.append(row_bev)
+        
+        avg_front_row = np.mean(front_rows)
+        logger.debug(f"CAM_FRONT objects average row: {avg_front_row:.1f} (should be < {bev_height_pixels/2} for upper half)")
+        
+        if avg_front_row > bev_height_pixels / 2:
+            logger.warning("CAM_FRONT objects appear in lower half of BEV - check coordinate system!")
+    
+    if back_objects:
+        back_rows = []
+        for obj in back_objects:
+            y_ego = obj['y_ego']
+            row_bev = int(bev_height_pixels - 1 - ((y_ego - y_min_m) / resolution))
+            back_rows.append(row_bev)
+        
+        avg_back_row = np.mean(back_rows)
+        logger.debug(f"CAM_BACK objects average row: {avg_back_row:.1f} (should be > {bev_height_pixels/2} for lower half)")
+        
+        if avg_back_row < bev_height_pixels / 2:
+            logger.warning("CAM_BACK objects appear in upper half of BEV - check coordinate system!")
+
+
+def _is_better_camera_view(obj1, obj2):
+    """
+    Return True only if obj1's camera scores strictly higher for obj1's position than obj2's camera does for obj2's position; a tie returns False.
+    """
+    x1, y1 = obj1['x_ego'], obj1['y_ego']
+    x2, y2 = obj2['x_ego'], obj2['y_ego']
+    cam1 = obj1['camera_name']
+    cam2 = obj2['camera_name']
+    
+    # Heuristic score: how well a camera's nominal viewing direction matches the object's ego-frame (x, y)
+    def get_camera_score(x, y, camera_name):
+        score = 0
+        # +3 when a camera whose name contains 'FRONT' sees an object ahead (y > 0); this is a substring match
+        if 'FRONT' in camera_name and y > 0:
+            score += 3
+        # +3 when a camera whose name contains 'BACK' sees an object behind (y < 0); elif, so never added on top of the front bonus
+        elif 'BACK' in camera_name and y < 0:
+            score += 3
+        # +2 when a camera whose name contains 'LEFT' sees an object with x < 0; evaluated independently of the front/back bonus
+        if 'LEFT' in camera_name and x < 0:
+            score += 2
+        # +2 when a camera whose name contains 'RIGHT' (e.g. CAM_BACK_RIGHT) sees an object with x > 0
+        elif 'RIGHT' in camera_name and x > 0:
+            score += 2
+        # +1 only for the exact names CAM_FRONT / CAM_BACK when the object is near the centreline (|x| < 5)
+        if camera_name in ['CAM_FRONT', 'CAM_BACK'] and abs(x) < 5:
+            score += 1
+        return score
+    
+    score1 = get_camera_score(x1, y1, cam1)
+    score2 = get_camera_score(x2, y2, cam2)
+    
+    return score1 > score2
+
+
+def generate_bevs(data):
+    """Render and save a BEV image for every key frame; return ``data`` with each "BEV" path added.
+
+    Requires "key_object_infos" and "camera_calibration" on every key frame."""
+    for scene_id, scene_obj in tqdm(data.items(), desc="Generating BEVs"):
+        for key_frame_id, key_frame in scene_obj["key_frames"].items():
+            image_paths = key_frame["image_paths"]
+            image_name = f"{scene_id}_{key_frame_id}__BEV.jpg"
+            bev_path = bev_dir / image_name
+            image_paths["BEV"] = "../nuscenes/samples/BEV/" + image_name
+
+            #if not bev_path.exists():
+            image_paths = {
+                key: os.path.join(drivelm_dir, path)
+                for key, path in image_paths.items()
+            }
+            bev_img = generate_bev_from_detections(
+                kois=key_frame["key_object_infos"], calibration=key_frame["camera_calibration"]
+            )
+            cv2.imwrite(str(bev_path), bev_img)  # str(): some OpenCV builds reject path objects as filename
+            logger.debug(f"Saved bev image: {bev_path}")  # log the destination, not the pixel array
+    return data
diff --git a/src/data/generate_yolo_kois.py b/src/data/generate_yolo_kois.py
index 4a2562a..b70669c 100644
--- a/src/data/generate_yolo_kois.py
+++ b/src/data/generate_yolo_kois.py
@@ -5,7 +5,7 @@
 from src.constants import drivelm_dir
 
 
-def generate_yolo_kois(data):
+def generate_yolo_kois(data, max_results_per_camera:int = 5):
     model = YOLO("yolo11n.pt")
     for _, scene_obj in data.items():
         for _, key_frame in scene_obj["key_frames"].items():
@@ -13,9 +13,10 @@ def generate_yolo_kois(data):
             i = 0
             kois = []
             for camera, image_path in image_paths_raw.items():
-                results = model(os.path.join(drivelm_dir, image_path))
+                results = model(os.path.join(drivelm_dir, image_path))[:max_results_per_camera]
+                bbox = [xyxy for res in results for xyxy in res.boxes.xyxy.cpu().tolist()]
                 center_points = [
-                    (xywh[0], xywh[1]) for res in results for xywh in res.boxes.xywh
+                    (xywh[0], xywh[1]) for res in results for xywh in res.boxes.xywh.cpu()
                 ]
                 categories = [
                     res.names[cls.item()]
@@ -28,13 +29,15 @@ def generate_yolo_kois(data):
                         (
                             f"<c{i},{camera},{center_points[j][0]},{center_points[j][1]}>",
                             categories[j],
+                            bbox[j]
                         )
                     )
             key_frame["key_object_infos"] = {
                 descriptor: {
                     "Category": category,
+                    "2d_bbox": bbox,
                 }
-                for descriptor, category in kois
+                for descriptor, category, bbox in kois
             }
 
     return data
diff --git a/src/data/get_sensor_calibration.py b/src/data/get_sensor_calibration.py
index 07916b3..18fffbb 100644
--- a/src/data/get_sensor_calibration.py
+++ b/src/data/get_sensor_calibration.py
@@ -1,4 +1,5 @@
 import polars as pl
+from tqdm import tqdm
 
 from src.constants import nuscenes_dir
 
@@ -40,7 +41,6 @@ def get_sample_data_and_calibrated_camera_lf() -> pl.LazyFrame:
     "CAM_BACK_RIGHT",
 ]
 
-# TODO: Think about using np arrays here instead
 class CameraCalibration:
     camera_intrinsic: list[list[float]]
     translation: list[float]
@@ -70,3 +70,10 @@ def get_camera_calibration(lf: pl.LazyFrame, key_frame_id) -> dict[str, CameraCa
             rotation=calibration["rotation"][0].to_list(),
         )
     return calibration_per_camera
+
+def get_calibration(data: dict):
+    lf = get_sample_data_and_calibrated_camera_lf()
+    for _, scene in tqdm(data.items(), desc="Fetching camera calibration data"):
+        for key_frame_id, key_frame in scene["key_frames"].items():
+            key_frame["camera_calibration"] = get_camera_calibration(lf, key_frame_id)
+    return data
diff --git a/src/data/load_dataset.py b/src/data/load_dataset.py
index 9f44e0c..6249833 100644
--- a/src/data/load_dataset.py
+++ b/src/data/load_dataset.py
@@ -1,4 +1,5 @@
 import os
+import json
 from json import load
 from typing import List
 
@@ -13,10 +14,12 @@
 )
 from src.data.create_image_grid_dataset import create_image_grid_dataset
 from src.data.extract_test_dataset import extract_data
+from src.data.generate_bev import generate_bevs
 from src.data.generate_descriptor_qas import (
     generate_descriptor_qas,
 )
 from src.data.generate_yolo_kois import generate_yolo_kois
+from src.data.get_sensor_calibration import get_calibration
 from src.utils.logger import get_logger
 from src.utils.utils import extract_children
 
@@ -95,4 +98,8 @@ def load_dataset(
     if use_grid:
         data = create_image_grid_dataset(data)
 
+    # TODO: We should add a switch for this.
+    data = get_calibration(data)
+    data = generate_bevs(data)
+
     return data

From 6c157c3ec4f299b4c9953255a3ce0efbe923edb1 Mon Sep 17 00:00:00 2001
From: Veit Laule <veit.laule@gmail.com>
Date: Wed, 30 Jul 2025 16:59:32 +0200
Subject: [PATCH 06/16] Ensure correct axes, increase threshold for similarity
 detection, add TODOs

---
 src/data/basic_dataset.py      |  5 +++++
 src/data/generate_bev.py       | 25 +++++++++++++++----------
 src/data/generate_yolo_kois.py |  2 +-
 src/data/load_dataset.py       |  3 +--
 4 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/src/data/basic_dataset.py b/src/data/basic_dataset.py
index e44c07b..78b5c7a 100644
--- a/src/data/basic_dataset.py
+++ b/src/data/basic_dataset.py
@@ -5,10 +5,12 @@
 
 from src.constants import drivelm_dir
 from src.data.create_image_grid_dataset import create_image_grid_dataset
+from src.data.generate_bev import generate_bevs
 from src.data.generate_descriptor_qas import (
     generate_descriptor_qas,
 )
 from src.data.generate_reasoning_context import generate_reasoning_context
+from src.data.get_sensor_calibration import get_calibration
 from src.data.load_dataset import load_dataset
 from src.data.message_formats import MessageFormat
 from src.data.query_item import QueryItem
@@ -77,6 +79,9 @@ def __init__(
 
         if split == "val" and add_kois:
             data = generate_yolo_kois(data)
+            data = get_calibration(data)
+            data = generate_bevs(data)
+            # NOTE: We need to make sure this is executed AFTER we need actual image locations
             data = normalise_key_object_infos(data, resize_factor, use_grid)
 
         if use_grid:
diff --git a/src/data/generate_bev.py b/src/data/generate_bev.py
index 6ae92c2..59469fc 100644
--- a/src/data/generate_bev.py
+++ b/src/data/generate_bev.py
@@ -6,7 +6,7 @@
 
 from src.constants import bev_dir, drivelm_dir
 from src.data.get_sensor_calibration import CameraCalibration
-from src.utils.utils import get_logger
+from src.utils.utils import get_logger, key_object_str_to_dict
 
 
 logger = get_logger(__name__)
@@ -47,7 +47,7 @@ def generate_bev_from_detections(
 
     total_items = 0
     for camera_name, cam_calib in calibration.items():
-        current_camera_kois = [koi_val for koi_key, koi_val in kois.items() if camera_name in koi_key]
+        current_camera_kois = [koi_val for koi_key, koi_val in kois.items() if camera_name == key_object_str_to_dict(koi_key)["camera"]]
         current_camera_boxes = [koi["2d_bbox"] for koi in current_camera_kois]
         total_items += len(current_camera_boxes)
         current_camera_names = [koi["Category"] for koi in current_camera_kois]
@@ -95,8 +95,8 @@ def generate_bev_from_detections(
             # Store the projected object's information
             projected_object_info = {
                 'class': obj_name,
-                'x_ego': point_3d_ego[0],
-                'y_ego': point_3d_ego[1],
+                'x_ego': point_3d_ego[1],
+                'y_ego': point_3d_ego[0],
                 'z_ego': point_3d_ego[2], # Should be close to 0
                 'camera_name': camera_name,
                 'original_bbox': bbox
@@ -109,7 +109,7 @@ def generate_bev_from_detections(
     # --- Remove Duplicate Objects ---
     # Group objects by spatial proximity and class, keep the one with best visibility
     unique_objects = []
-    proximity_threshold = 2.0  # meters - objects within this distance are considered duplicates
+    proximity_threshold = 10.0  # meters - objects within this distance are considered duplicates
     duplicates_removed = 0
     
     for obj in all_projected_objects:
@@ -117,6 +117,7 @@ def generate_bev_from_detections(
         for unique_obj in unique_objects:
             # Check if objects are of same class and spatially close
             if (obj['class'] == unique_obj['class'] and
+                # TODO: Tune the prox threshold
                 np.sqrt((obj['x_ego'] - unique_obj['x_ego'])**2 + 
                        (obj['y_ego'] - unique_obj['y_ego'])**2) < proximity_threshold):
                 
@@ -142,7 +143,7 @@ def generate_bev_from_detections(
 
     # --- Render Projected Objects onto the BEV Map ---
     for obj_info in unique_objects:
-        x_ego = obj_info['x_ego']
+        x_ego = -obj_info['x_ego']
         y_ego = obj_info['y_ego']
         obj_class = obj_info['class']
 
@@ -156,13 +157,16 @@ def generate_bev_from_detections(
         # Y+ (forward) should appear at top of image (lower row indices)
         row_bev = int(bev_map_height_pixels - 1 - ((y_ego - y_min_m) / bev_map_res_m_per_pixel))
 
+        # TODO: Chekc the classes generated by YOLO against this so we cover everything we need
         # Ensure projected point is within the defined BEV map boundaries
         if 0 <= col_bev < bev_map_width_pixels and 0 <= row_bev < bev_map_height_pixels:
             if 'car' in obj_class.lower() or 'vehicle' in obj_class.lower() or \
                'truck' in obj_class.lower() or 'bus' in obj_class.lower() or \
                'trailer' in obj_class.lower() or 'construction_vehicle' in obj_class.lower():
-                car_width_bev = int(2.0 / bev_map_res_m_per_pixel)
-                car_length_bev = int(4.5 / bev_map_res_m_per_pixel)
+                # TODO: Adapt the size of the objects relative to the BEV size
+                #       -> The distance currently looks to small in relation to the size of the cars
+                car_width_bev = int(1.5 / bev_map_res_m_per_pixel)
+                car_length_bev = int(3.0 / bev_map_res_m_per_pixel)
                 color = (0, 255, 255) # Yellow (BGR)
 
                 cv2.rectangle(bev_map,
@@ -173,8 +177,8 @@ def generate_bev_from_detections(
                             cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
 
             elif 'pedestrian' in obj_class.lower() or 'person' in obj_class.lower():
-                ped_width_bev = int(0.6 / bev_map_res_m_per_pixel) # Approx 0.6m wide
-                ped_length_bev = int(0.6 / bev_map_res_m_per_pixel) # Approx 0.6m long
+                ped_width_bev = int(1.0 / bev_map_res_m_per_pixel)
+                ped_length_bev = int(1.0 / bev_map_res_m_per_pixel)
                 color = (255, 0, 0) # Blue (BGR)
 
                 cv2.rectangle(bev_map,
@@ -345,6 +349,7 @@ def generate_bevs(data):
             bev_path = bev_dir / image_name
             image_paths["BEV"] = "../nuscenes/samples/BEV/" + image_name
 
+            # TODO: Uncomment once done
             #if not bev_path.exists():
             image_paths = {
                 key: os.path.join(drivelm_dir, path)
diff --git a/src/data/generate_yolo_kois.py b/src/data/generate_yolo_kois.py
index 0dcc77c..3fcd571 100644
--- a/src/data/generate_yolo_kois.py
+++ b/src/data/generate_yolo_kois.py
@@ -13,7 +13,7 @@ def generate_yolo_kois(data, max_results_per_cam: int = 5):
             i = 0
             kois = []
             for camera, image_path in image_paths_raw.items():
-                results = model(os.path.join(drivelm_dir, image_path))[:max_results_per_camera]
+                results = model(os.path.join(drivelm_dir, image_path))[:max_results_per_cam]
                 bbox = [xyxy for res in results for xyxy in res.boxes.xyxy.cpu().tolist()]
                 center_points = [
                     (xywh[0], xywh[1]) for res in results for xywh in res.boxes.xywh.cpu()
diff --git a/src/data/load_dataset.py b/src/data/load_dataset.py
index 08843f2..b1e8509 100644
--- a/src/data/load_dataset.py
+++ b/src/data/load_dataset.py
@@ -1,5 +1,4 @@
 import os
-import json
 from json import load
 
 import gdown
@@ -51,7 +50,7 @@ def get_ds(split: str) -> None:
     extract_children(out_name, nuscenes_dir)
 
 
-def load_dataset(split: str):
+def load_dataset(split: str) -> dict:
     dataset_paths = {
         "train": drivelm_train_json,
         "val": drivelm_val_json,

From 18d8b9aefb5e94f5029659d0ace18986084d16ab Mon Sep 17 00:00:00 2001
From: Veit Laule <veit.laule@gmail.com>
Date: Sat, 2 Aug 2025 14:14:47 +0200
Subject: [PATCH 07/16] Restrict YOLO classes, mute YOLO logging

---
 src/data/generate_yolo_kois.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/data/generate_yolo_kois.py b/src/data/generate_yolo_kois.py
index 3fcd571..cc89cbf 100644
--- a/src/data/generate_yolo_kois.py
+++ b/src/data/generate_yolo_kois.py
@@ -1,5 +1,6 @@
 import os
 
+from tqdm import tqdm
 from ultralytics import YOLO
 
 from src.constants import drivelm_dir
@@ -7,13 +8,18 @@
 
 def generate_yolo_kois(data, max_results_per_cam: int = 5):
     model = YOLO("yolo11n.pt")
-    for _, scene_obj in data.items():
+    for _, scene_obj in tqdm(data.items(), desc="Generating KOIs with YOLO"):
         for _, key_frame in scene_obj["key_frames"].items():
             image_paths_raw = key_frame["image_paths"]
             i = 0
             kois = []
             for camera, image_path in image_paths_raw.items():
-                results = model(os.path.join(drivelm_dir, image_path))[:max_results_per_cam]
+                results = model(
+                    os.path.join(drivelm_dir, image_path), 
+                    max_det=max_results_per_cam,
+                    classes=[0, 1, 2, 3, 5, 6, 7, 9, 11], # [person, bicycle, car, motorcycle, bus, train, truck, traffic light, stop sign]
+                    verbose=False
+                )
                 bbox = [xyxy for res in results for xyxy in res.boxes.xyxy.cpu().tolist()]
                 center_points = [
                     (xywh[0], xywh[1]) for res in results for xywh in res.boxes.xywh.cpu()

From 63ed727fb458ca58aa02c4ca564ed74b0b93f800 Mon Sep 17 00:00:00 2001
From: Veit Laule <veit.laule@gmail.com>
Date: Sat, 2 Aug 2025 14:15:25 +0200
Subject: [PATCH 08/16] Use LazyFrames only for initial filtering and collect
 after the join

---
 src/data/get_sensor_calibration.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/data/get_sensor_calibration.py b/src/data/get_sensor_calibration.py
index 18fffbb..5d93703 100644
--- a/src/data/get_sensor_calibration.py
+++ b/src/data/get_sensor_calibration.py
@@ -4,7 +4,7 @@
 from src.constants import nuscenes_dir
 
 
-def get_sample_data_and_calibrated_camera_lf() -> pl.LazyFrame:
+def get_sample_data_and_calibrated_camera_df() -> pl.DataFrame:
     sample_data_lf = pl.read_json(nuscenes_dir / "sample_data.json").lazy()
     sample_data_lf = sample_data_lf.filter(
         pl.col("is_key_frame") == True  # noqa: E712
@@ -29,7 +29,7 @@ def get_sample_data_and_calibrated_camera_lf() -> pl.LazyFrame:
         left_on="calibrated_sensor_token", 
         right_on="token", 
         suffix="_calibrated"
-    )
+    ).collect()
 
 
 cameras = [
@@ -52,7 +52,7 @@ def __init__(self, camera_intrinsic, translation, rotation):
         self.rotation = rotation
 
 
-def get_camera_calibration(lf: pl.LazyFrame, key_frame_id) -> dict[str, CameraCalibration]:
+def get_camera_calibration(lf: pl.DataFrame, key_frame_id) -> dict[str, CameraCalibration]:
     calibration_per_camera = {}
     for cam in cameras:
         calibration = lf.filter(
@@ -62,7 +62,7 @@ def get_camera_calibration(lf: pl.LazyFrame, key_frame_id) -> dict[str, CameraCa
             "translation",
             "rotation",
             "camera_intrinsic"
-        ).collect().to_dict()
+        ).to_dict()
         assert len(calibration["translation"]) == 1
         calibration_per_camera[cam] = CameraCalibration(
             camera_intrinsic=calibration["camera_intrinsic"][0].to_list(),
@@ -72,7 +72,7 @@ def get_camera_calibration(lf: pl.LazyFrame, key_frame_id) -> dict[str, CameraCa
     return calibration_per_camera
 
 def get_calibration(data: dict):
-    lf = get_sample_data_and_calibrated_camera_lf()
+    lf = get_sample_data_and_calibrated_camera_df()
     for _, scene in tqdm(data.items(), desc="Fetching camera calibration data"):
         for key_frame_id, key_frame in scene["key_frames"].items():
             key_frame["camera_calibration"] = get_camera_calibration(lf, key_frame_id)

From d00cf65f58da445123d60a2d334d445e9c6877e6 Mon Sep 17 00:00:00 2001
From: Veit Laule <veit.laule@gmail.com>
Date: Sat, 2 Aug 2025 14:16:25 +0200
Subject: [PATCH 09/16] Clean up, label BEV objects with their KOI ids

---
 src/data/generate_bev.py | 167 +++++++++------------------------------
 1 file changed, 36 insertions(+), 131 deletions(-)

diff --git a/src/data/generate_bev.py b/src/data/generate_bev.py
index 59469fc..cce01df 100644
--- a/src/data/generate_bev.py
+++ b/src/data/generate_bev.py
@@ -41,13 +41,15 @@ def generate_bev_from_detections(
     bev_map_height_pixels = int(bev_map_y_range / bev_map_res_m_per_pixel)
 
     bev_map = np.zeros((bev_map_height_pixels, bev_map_width_pixels, 3), dtype=np.uint8)
-    bev_map.fill(20)  # Dark background for the BEV map
+    bev_map.fill(20)
 
     all_projected_objects = []
 
     total_items = 0
     for camera_name, cam_calib in calibration.items():
-        current_camera_kois = [koi_val for koi_key, koi_val in kois.items() if camera_name == key_object_str_to_dict(koi_key)["camera"]]
+        current_keys = [koi_key for koi_key in kois.keys() if camera_name == key_object_str_to_dict(koi_key)["camera"]]
+        current_identifiers = [key_object_str_to_dict(k)["id"] for k in current_keys]
+        current_camera_kois = [kois[k] for k in current_keys]
         current_camera_boxes = [koi["2d_bbox"] for koi in current_camera_kois]
         total_items += len(current_camera_boxes)
         current_camera_names = [koi["Category"] for koi in current_camera_kois]
@@ -95,21 +97,19 @@ def generate_bev_from_detections(
             # Store the projected object's information
             projected_object_info = {
                 'class': obj_name,
-                'x_ego': point_3d_ego[1],
+                'x_ego': -point_3d_ego[1],
                 'y_ego': point_3d_ego[0],
                 'z_ego': point_3d_ego[2], # Should be close to 0
                 'camera_name': camera_name,
-                'original_bbox': bbox
+                'original_bbox': bbox,
+                'identifier': current_identifiers[i],
             }
             all_projected_objects.append(projected_object_info)
 
-    logger.debug(f"Total objects detected across all cameras: {total_items}")
-    logger.debug(f"Total objects after initial projection: {len(all_projected_objects)}")
-    
     # --- Remove Duplicate Objects ---
     # Group objects by spatial proximity and class, keep the one with best visibility
     unique_objects = []
-    proximity_threshold = 10.0  # meters - objects within this distance are considered duplicates
+    proximity_threshold = 10.0
     duplicates_removed = 0
     
     for obj in all_projected_objects:
@@ -139,72 +139,45 @@ def generate_bev_from_detections(
         if not is_duplicate:
             unique_objects.append(obj)
 
-    logger.debug(f"Total objects after duplicate removal: {len(unique_objects)} (removed {duplicates_removed} duplicates)")
-
     # --- Render Projected Objects onto the BEV Map ---
     for obj_info in unique_objects:
-        x_ego = -obj_info['x_ego']
+        x_ego = obj_info['x_ego']
         y_ego = obj_info['y_ego']
         obj_class = obj_info['class']
+        identifier = obj_info['identifier']
 
-        # Convert ego coordinates (meters) to BEV map pixel coordinates.
-        # In nuScenes coordinate system: X+ is right, Y+ is forward, Z+ is up
-        # BEV map: columns represent X (left-right), rows represent Y (forward-back)
-        
-        # Ego X range: [x_min_m, x_max_m] -> BEV columns: [0, bev_map_width_pixels-1]
         col_bev = int((x_ego - x_min_m) / bev_map_res_m_per_pixel)
-        # Ego Y range: [y_min_m, y_max_m] -> BEV rows: [bev_map_height_pixels-1, 0] (inverted)
-        # Y+ (forward) should appear at top of image (lower row indices)
         row_bev = int(bev_map_height_pixels - 1 - ((y_ego - y_min_m) / bev_map_res_m_per_pixel))
 
-        # TODO: Chekc the classes generated by YOLO against this so we cover everything we need
-        # Ensure projected point is within the defined BEV map boundaries
         if 0 <= col_bev < bev_map_width_pixels and 0 <= row_bev < bev_map_height_pixels:
-            if 'car' in obj_class.lower() or 'vehicle' in obj_class.lower() or \
-               'truck' in obj_class.lower() or 'bus' in obj_class.lower() or \
-               'trailer' in obj_class.lower() or 'construction_vehicle' in obj_class.lower():
-                # TODO: Adapt the size of the objects relative to the BEV size
-                #       -> The distance currently looks to small in relation to the size of the cars
+            if 'car' in obj_class.lower() or 'truck' in obj_class.lower() or 'bus' in obj_class.lower() or \
+               'bicycle' in obj_class.lower() or 'motorcycle' in obj_class.lower():
                 car_width_bev = int(1.5 / bev_map_res_m_per_pixel)
                 car_length_bev = int(3.0 / bev_map_res_m_per_pixel)
                 color = (0, 255, 255) # Yellow (BGR)
 
-                cv2.rectangle(bev_map,
-                              (col_bev - car_width_bev // 2, row_bev - car_length_bev // 2),
-                              (col_bev + car_width_bev // 2, row_bev + car_length_bev // 2),
-                              color, -1)
-                cv2.putText(bev_map, 'Car', (col_bev - car_width_bev // 2, row_bev - car_length_bev // 2 - 5),
-                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
+                if (0 <= (col_bev - car_width_bev) and (col_bev + car_width_bev) < bev_map_width_pixels
+                    and 0 <= (row_bev - car_length_bev) and (row_bev + car_length_bev) < bev_map_height_pixels):
+                    cv2.rectangle(bev_map,
+                                (col_bev - car_width_bev // 2, row_bev - car_length_bev // 2),
+                                (col_bev + car_width_bev // 2, row_bev + car_length_bev // 2),
+                                color, -1)
+                    cv2.putText(bev_map, identifier, (col_bev - car_width_bev // 2, row_bev - car_length_bev // 2 - 5),
+                                cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
 
-            elif 'pedestrian' in obj_class.lower() or 'person' in obj_class.lower():
+            elif 'person' in obj_class.lower():
                 ped_width_bev = int(1.0 / bev_map_res_m_per_pixel)
                 ped_length_bev = int(1.0 / bev_map_res_m_per_pixel)
                 color = (255, 0, 0) # Blue (BGR)
 
-                cv2.rectangle(bev_map,
-                              (col_bev - ped_width_bev // 2, row_bev - ped_length_bev // 2),
-                              (col_bev + ped_width_bev // 2, row_bev + ped_length_bev // 2),
-                              color, -1)
-                cv2.putText(bev_map, 'Ped', (col_bev - ped_width_bev // 2, row_bev - ped_length_bev // 2 - 5),
-                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
-
-            elif 'traffic_cone' in obj_class.lower():
-                cone_radius_bev = int(0.3 / bev_map_res_m_per_pixel / 2)
-                color = (0, 0, 255) # Red (BGR)
-                cv2.circle(bev_map, (col_bev, row_bev), cone_radius_bev, color, -1)
-                cv2.putText(bev_map, 'Cone', (col_bev - cone_radius_bev, row_bev - cone_radius_bev - 5),
-                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
-
-            elif 'barrier' in obj_class.lower():
-                barrier_width_bev = int(0.2 / bev_map_res_m_per_pixel)
-                barrier_length_bev = int(1.5 / bev_map_res_m_per_pixel)
-                color = (128, 128, 128) # Grey (BGR)
-                cv2.rectangle(bev_map,
-                              (col_bev - barrier_width_bev // 2, row_bev - barrier_length_bev // 2),
-                              (col_bev + barrier_width_bev // 2, row_bev + barrier_length_bev // 2),
-                              color, -1)
-                cv2.putText(bev_map, 'Barrier', (col_bev - barrier_width_bev // 2, row_bev - barrier_length_bev // 2 - 5),
-                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
+                if (0 <= (col_bev - ped_width_bev) and (col_bev + ped_width_bev) < bev_map_width_pixels
+                    and 0 <= (row_bev - ped_length_bev) and (row_bev + ped_length_bev) < bev_map_height_pixels):
+                    cv2.rectangle(bev_map,
+                                    (col_bev - ped_width_bev // 2, row_bev - ped_length_bev // 2),
+                                    (col_bev + ped_width_bev // 2, row_bev + ped_length_bev // 2),
+                                    color, -1)
+                    cv2.putText(bev_map, identifier, (col_bev - ped_width_bev // 2, row_bev - ped_length_bev // 2 - 5),
+                                cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
 
     # --- Draw Ego Vehicle ---
     ego_x_m = 0.0 # Ego vehicle is at (0,0) in its own frame
@@ -215,10 +188,8 @@ def generate_bev_from_detections(
     ego_row_bev = int(bev_map_height_pixels - 1 - ((ego_y_m - y_min_m) / bev_map_res_m_per_pixel))
     
     # Ego vehicle dimensions (approximate typical car size)
-    ego_width_m = 2.0
-    ego_length_m = 5.0
-    ego_width_pixels = int(ego_width_m / bev_map_res_m_per_pixel)
-    ego_length_pixels = int(ego_length_m / bev_map_res_m_per_pixel)
+    ego_width_pixels = int(1.5 / bev_map_res_m_per_pixel)
+    ego_length_pixels = int(3.0 / bev_map_res_m_per_pixel)
     
     ego_color = (0, 0, 255) # Red (BGR)
     cv2.rectangle(bev_map,
@@ -228,82 +199,17 @@ def generate_bev_from_detections(
     cv2.putText(bev_map, 'Ego', (ego_col_bev - ego_width_pixels // 2, ego_row_bev - ego_length_pixels // 2 - 5),
                 cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
     
-    # Draw a forward arrow for ego vehicle (Y+ is forward, should point towards top of image)
-    # Arrow points from center towards smaller row index (upward in image = forward in world)
-    arrow_end_y = ego_row_bev - ego_length_pixels // 2 - 10
-    cv2.arrowedLine(bev_map, (ego_col_bev, ego_row_bev), (ego_col_bev, arrow_end_y), (0, 255, 0), 2)
-
-    # Add orientation verification markers
-    _add_orientation_markers(bev_map, bev_map_width_pixels, bev_map_height_pixels)
-
-    # Validate BEV orientation with front camera objects
-    _validate_bev_orientation(unique_objects, bev_map_height_pixels, bev_map_res_m_per_pixel, y_min_m)
-
-    return bev_map
-
-
-def _add_orientation_markers(bev_map, width, height):
-    """
-    Add orientation markers to verify BEV coordinate system.
-    Front should be at top, back at bottom, left on left side, right on right side.
-    """
     marker_color = (255, 255, 255)  # White
     font = cv2.FONT_HERSHEY_SIMPLEX
     font_scale = 0.6
     thickness = 2
     
-    # Add directional labels
-    cv2.putText(bev_map, 'FRONT', (width//2 - 30, 25), font, font_scale, marker_color, thickness)
-    cv2.putText(bev_map, 'BACK', (width//2 - 25, height - 10), font, font_scale, marker_color, thickness)
-    cv2.putText(bev_map, 'LEFT', (10, height//2), font, font_scale, marker_color, thickness)
-    cv2.putText(bev_map, 'RIGHT', (width - 60, height//2), font, font_scale, marker_color, thickness)
-    
-    # Add coordinate axes
-    center_x, center_y = width//2, height//2
-    axis_length = 30
-    
-    # X-axis (horizontal, positive to the right)
-    cv2.arrowedLine(bev_map, (center_x, center_y), (center_x + axis_length, center_y), (0, 255, 255), 2)
-    cv2.putText(bev_map, 'X+', (center_x + axis_length + 5, center_y + 5), font, 0.4, (0, 255, 255), 1)
-    
-    # Y-axis (vertical, positive upward/forward)
-    cv2.arrowedLine(bev_map, (center_x, center_y), (center_x, center_y - axis_length), (255, 0, 255), 2)
-    cv2.putText(bev_map, 'Y+', (center_x + 5, center_y - axis_length - 5), font, 0.4, (255, 0, 255), 1)
-
+    cv2.putText(bev_map, 'FRONT', (bev_map_width_pixels//2 - 30, 25), font, font_scale, marker_color, thickness)
+    cv2.putText(bev_map, 'BACK', (bev_map_width_pixels//2 - 25, bev_map_height_pixels - 10), font, font_scale, marker_color, thickness)
+    cv2.putText(bev_map, 'LEFT', (10, bev_map_height_pixels//2), font, font_scale, marker_color, thickness)
+    cv2.putText(bev_map, 'RIGHT', (bev_map_width_pixels - 60, bev_map_height_pixels//2), font, font_scale, marker_color, thickness)
 
-def _validate_bev_orientation(objects, bev_height_pixels, resolution, y_min_m):
-    """
-    Validate that the BEV orientation is correct by checking if CAM_FRONT objects 
-    appear in the upper part of the image (smaller row indices).
-    """
-    front_objects = [obj for obj in objects if 'FRONT' in obj['camera_name'] and obj['y_ego'] > 0]
-    back_objects = [obj for obj in objects if 'BACK' in obj['camera_name'] and obj['y_ego'] < 0]
-    
-    if front_objects:
-        front_rows = []
-        for obj in front_objects:
-            y_ego = obj['y_ego']
-            row_bev = int(bev_height_pixels - 1 - ((y_ego - y_min_m) / resolution))
-            front_rows.append(row_bev)
-        
-        avg_front_row = np.mean(front_rows)
-        logger.debug(f"CAM_FRONT objects average row: {avg_front_row:.1f} (should be < {bev_height_pixels/2} for upper half)")
-        
-        if avg_front_row > bev_height_pixels / 2:
-            logger.warning("CAM_FRONT objects appear in lower half of BEV - check coordinate system!")
-    
-    if back_objects:
-        back_rows = []
-        for obj in back_objects:
-            y_ego = obj['y_ego']
-            row_bev = int(bev_height_pixels - 1 - ((y_ego - y_min_m) / resolution))
-            back_rows.append(row_bev)
-        
-        avg_back_row = np.mean(back_rows)
-        logger.debug(f"CAM_BACK objects average row: {avg_back_row:.1f} (should be > {bev_height_pixels/2} for lower half)")
-        
-        if avg_back_row < bev_height_pixels / 2:
-            logger.warning("CAM_BACK objects appear in upper half of BEV - check coordinate system!")
+    return bev_map
 
 
 def _is_better_camera_view(obj1, obj2):
@@ -315,7 +221,6 @@ def _is_better_camera_view(obj1, obj2):
     cam1 = obj1['camera_name']
     cam2 = obj2['camera_name']
     
-    # Score cameras based on how well they align with object position
     def get_camera_score(x, y, camera_name):
         score = 0
         # Front cameras are best for forward objects (y > 0)

From 4464c45fa0a4581258fc34b173907a28b8563a28 Mon Sep 17 00:00:00 2001
From: Veit Laule <veit.laule@gmail.com>
Date: Sat, 2 Aug 2025 14:46:53 +0200
Subject: [PATCH 10/16] add switch for bev only inference + basic system prompt

---
 main.py                    |  1 +
 src/constants.py           |  1 +
 src/data/basic_dataset.py  | 15 +++++++++++----
 src/data/generate_bev.py   | 27 +++++++++++++--------------
 src/data/system_prompts.py |  9 ++++++---
 src/eval/eval_models.py    |  2 ++
 src/utils/approach.py      |  1 +
 src/utils/utils.py         |  6 ++++--
 8 files changed, 39 insertions(+), 23 deletions(-)

diff --git a/main.py b/main.py
index 39f978f..fef943d 100644
--- a/main.py
+++ b/main.py
@@ -29,6 +29,7 @@
             "image_grid",
             "descriptor_qas",
             "add_kois",
+            "add_bev",
             "reasoning",
             "system_prompt",
         ],
diff --git a/src/constants.py b/src/constants.py
index ffde349..542ed0f 100644
--- a/src/constants.py
+++ b/src/constants.py
@@ -21,6 +21,7 @@
     IMAGE_SIZE[0] * GRID[0],
     IMAGE_SIZE[1] * GRID[1],
 )  # (height, width)
+BEV_IMG_SIZE = (500, 500)
 
 GRID_POSITIONS = {
     "CAM_FRONT_LEFT": (0, 0),
diff --git a/src/data/basic_dataset.py b/src/data/basic_dataset.py
index 78b5c7a..f436148 100644
--- a/src/data/basic_dataset.py
+++ b/src/data/basic_dataset.py
@@ -50,6 +50,7 @@ def __init__(
         split="train",
         add_augmented=False,
         add_kois=False,
+        add_bev=False,
         use_grid=False,
         use_reasoning=False,
         use_system_prompt=False,
@@ -62,6 +63,7 @@ def __init__(
         self.split = split
         self.use_reasoning = use_reasoning
         self.use_grid = use_grid
+        self.add_bev = add_bev
         self.resize_factor = resize_factor
         self.system_prompt_provider = (
             SystemPromptProvider(config_path=system_prompt_config_path)
@@ -79,9 +81,9 @@ def __init__(
 
         if split == "val" and add_kois:
             data = generate_yolo_kois(data)
-            data = get_calibration(data)
-            data = generate_bevs(data)
-            # NOTE: We need to make sure this is executed AFTER we need actual image locations
+            if add_bev:
+                data = get_calibration(data)
+                data = generate_bevs(data)
             data = normalise_key_object_infos(data, resize_factor, use_grid)
 
         if use_grid:
@@ -92,13 +94,17 @@ def __init__(
         for scene_id in data.keys():
             scene_obj = data[scene_id]["key_frames"]
             for key_frame_id in scene_obj.keys():
-                # NOTE: Only consider FRONT camera images or GRID images for now
                 image_paths = scene_obj[key_frame_id]["image_paths"]
                 if use_grid:
                     image_path = os.path.join(
                         drivelm_dir,
                         image_paths["GRID"],
                     )
+                elif add_bev:
+                    image_path = os.path.join(
+                        drivelm_dir,
+                        image_paths["BEV"],
+                    )
                 else:
                     image_path = os.path.join(
                         drivelm_dir,
@@ -193,6 +199,7 @@ def __getitem__(self, idx):
                 question=question,
                 resize_factor=self.resize_factor,
                 use_grid=self.use_grid,
+                add_bev=self.add_bev,
                 use_reasoning=self.use_reasoning,
             )
             if self.system_prompt_provider
diff --git a/src/data/generate_bev.py b/src/data/generate_bev.py
index cce01df..1d586f9 100644
--- a/src/data/generate_bev.py
+++ b/src/data/generate_bev.py
@@ -254,18 +254,17 @@ def generate_bevs(data):
             bev_path = bev_dir / image_name
             image_paths["BEV"] = "../nuscenes/samples/BEV/" + image_name
 
-            # TODO: Uncomment once done
-            #if not bev_path.exists():
-            image_paths = {
-                key: os.path.join(drivelm_dir, path)
-                for key, path in image_paths.items()
-            }
-            kois = key_frame["key_object_infos"]
-            calibration = key_frame["camera_calibration"]
-            bev_img = generate_bev_from_detections(
-                kois=kois,
-                calibration=calibration,
-            )
-            cv2.imwrite(bev_path, bev_img)
-            logger.debug(f"Saved bev image: {bev_img}")
+            if not bev_path.exists():
+                image_paths = {
+                    key: os.path.join(drivelm_dir, path)
+                    for key, path in image_paths.items()
+                }
+                kois = key_frame["key_object_infos"]
+                calibration = key_frame["camera_calibration"]
+                bev_img = generate_bev_from_detections(
+                    kois=kois,
+                    calibration=calibration,
+                )
+                cv2.imwrite(bev_path, bev_img)
+                logger.debug(f"Saved bev image: {bev_img}")
     return data
diff --git a/src/data/system_prompts.py b/src/data/system_prompts.py
index 41fc68c..c0de28b 100644
--- a/src/data/system_prompts.py
+++ b/src/data/system_prompts.py
@@ -13,20 +13,23 @@ def __init__(self, config_path=None):
                 self.prompts = yaml.safe_load(f)
 
     def get_approach_prompt(
-        self, resize_factor: float, use_grid: bool = False, use_reasoning: bool = False, 
+        self, resize_factor: float, use_grid: bool = False, use_reasoning: bool = False, add_bev: bool = False
     ) -> str:
         approach = self.prompts.get("approach_prompt", {})
         prompt = approach.get("base", "You are an autonomous driving assistant. ")
 
         grid_prompts = approach.get("use_grid", {})
         if use_grid:
-            im_size = get_resize_image_size(resize_factor, True)
+            im_size = get_resize_image_size(resize_factor, grid=True)
             prompt += grid_prompts.get(
                 "enabled",
                 f"You are provided with a grid of images with size {im_size[1], im_size[0]} of the current situation. Starting from the upper left, the upper row shows images from the 'FRONT_LEFT', 'FRONT' and 'FRONT_RIGHT' cameras respectively. Starting from the bottom left, the lower row shows images from the 'BACK_LEFT', 'BACK' and 'BACK_RIGHT' cameras respectively. ",
             )
+        elif add_bev:
+            im_size = get_resize_image_size(resize_factor, bev=True)
+            prompt += f"You are provided with a birds eye view image with size {im_size[1], im_size[0]} of the vehicle and the sorrounding objects. The ego vehicle is marked in red, vehicles are marked in yellow and predestrians are marked in blue. Each of the objects is associated with an id, that corresponds to the id given in the list of key object infos. E.g. a vehicle with the id 'c1' would correspond to a key object '<c1,CAM_FRONT,200,400>'"
         else:
-            im_size = get_resize_image_size(resize_factor, False)
+            im_size = get_resize_image_size(resize_factor)
             prompt += grid_prompts.get(
                 "disabled",
                 f"You receive a single image with size {im_size[1], im_size[0]} from the front camera. ",
diff --git a/src/eval/eval_models.py b/src/eval/eval_models.py
index 86b40b4..d2eb361 100644
--- a/src/eval/eval_models.py
+++ b/src/eval/eval_models.py
@@ -30,6 +30,7 @@ def evaluate_model(
     system_prompt_config_path: Optional[str] = None,
     use_reasoning: bool = False,
     add_kois: bool = False,
+    add_bev: bool = False,
     approach_name: Optional[str] = None,
     exclude_question_tags: List[int] = [],
     exclude_question_types: List[str] = [],
@@ -39,6 +40,7 @@ def evaluate_model(
         message_format=engine.message_formatter,
         split=dataset_split,
         add_kois=add_kois,
+        add_bev=add_bev,
         use_grid=use_grid,
         use_reasoning=use_reasoning,
         use_system_prompt=use_system_prompt,
diff --git a/src/utils/approach.py b/src/utils/approach.py
index 651ebf2..c2075d0 100644
--- a/src/utils/approach.py
+++ b/src/utils/approach.py
@@ -6,6 +6,7 @@ def get_approach_kwargs(approaches: List[str]) -> Dict[str, Any]:
         "image_grid": {"use_grid": True},
         "descriptor_qas": {"use_augmented": True},
         "add_kois": {"add_kois": True},
+        "add_bev": {"add_bev": True},
         "reasoning": {"use_reasoning": True},
         "system_prompt": {"use_system_prompt": True},
         # Add more approaches here as needed
diff --git a/src/utils/utils.py b/src/utils/utils.py
index df46d2b..2a58633 100644
--- a/src/utils/utils.py
+++ b/src/utils/utils.py
@@ -6,7 +6,7 @@
 import torch
 from torch.utils.data import Dataset, Subset
 
-from src.constants import GRID_IMG_SIZE, IMAGE_SIZE, GRID_POSITIONS
+from src.constants import BEV_IMG_SIZE, GRID_IMG_SIZE, IMAGE_SIZE, GRID_POSITIONS
 from src.data.query_item import QueryItem
 from src.utils.logger import get_logger
 
@@ -136,9 +136,11 @@ def tuple_mul(t: Tuple[float, float], scalar: float) -> Tuple[float, float]:
     return (t[0] * scalar, t[1] * scalar)
 
 
-def get_resize_image_size(resize_factor: float, grid: bool = False) -> Tuple[int, int]:
+def get_resize_image_size(resize_factor: float, grid: bool = False, bev: bool = False) -> Tuple[int, int]:
     if grid:
         size = tuple_mul(GRID_IMG_SIZE, resize_factor)
+    elif bev:
+        size = tuple_mul(BEV_IMG_SIZE, resize_factor)
     else:
         size = tuple_mul(IMAGE_SIZE, resize_factor)
     return tuple_cast(size, int)

From ab80b9ef55b6cdffc0a4a5af9064301bc41ff97c Mon Sep 17 00:00:00 2001
From: Veit Laule <veit.laule@gmail.com>
Date: Sat, 2 Aug 2025 15:07:10 +0200
Subject: [PATCH 11/16] fix extraction of nuscenes data

---
 src/data/load_dataset.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/data/load_dataset.py b/src/data/load_dataset.py
index b1e8509..46f37ef 100644
--- a/src/data/load_dataset.py
+++ b/src/data/load_dataset.py
@@ -1,4 +1,5 @@
 import os
+import shutil
 from json import load
 
 import gdown
@@ -47,7 +48,7 @@ def get_ds(split: str) -> None:
         id="1sqW1y2k346mtLCQnO0NAab3sEzxUyQ_d",
         output=out_name,
     )
-    extract_children(out_name, nuscenes_dir)
+    shutil.unpack_archive(out_name, nuscenes_dir)
 
 
 def load_dataset(split: str) -> dict:

From 8a9beebccdc0d43289b8ef186b4a80370acec8df Mon Sep 17 00:00:00 2001
From: Veit Laule <veit.laule@gmail.com>
Date: Sun, 3 Aug 2025 01:49:08 +0200
Subject: [PATCH 12/16] Ensure BEV dir creation

---
 src/data/generate_bev.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/data/generate_bev.py b/src/data/generate_bev.py
index 1d586f9..fd2fba1 100644
--- a/src/data/generate_bev.py
+++ b/src/data/generate_bev.py
@@ -247,6 +247,8 @@ def get_camera_score(x, y, camera_name):
 
 
 def generate_bevs(data):
+    bev_dir.mkdir(parents=True, exist_ok=True)
+
     for scene_id, scene_obj in tqdm(data.items(), desc="Generating BEVs"):
         for key_frame_id, key_frame in scene_obj["key_frames"].items():
             image_paths = key_frame["image_paths"]

From efeb12ac12b5e4aacc2c80557c547f03a5e8c5ec Mon Sep 17 00:00:00 2001
From: Veit Laule <veit.laule@gmail.com>
Date: Mon, 4 Aug 2025 19:22:26 +0200
Subject: [PATCH 13/16] add option to combine front cam and bev + corresponding
 prompt

---
 main.py                    |  2 +-
 src/constants.py           |  1 +
 src/data/basic_dataset.py  |  5 ++++-
 src/data/generate_bev.py   | 19 ++++++++++++++++---
 src/data/system_prompts.py |  9 ++++++---
 src/eval/eval_models.py    |  2 ++
 src/utils/approach.py      |  1 +
 src/utils/utils.py         |  8 +++++---
 8 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/main.py b/main.py
index fef943d..c19750e 100644
--- a/main.py
+++ b/main.py
@@ -90,7 +90,7 @@
         )
     elif args.eval:
         resize_image_size = get_resize_image_size(
-            resize_factor=resize_factor, grid="image_grid" in args.approach
+            resize_factor=resize_factor, grid="image_grid" in args.approach, bev="add_bev" in args.approach, front_cam="front_cam" in args.approach,
         )
         logger.debug(f"Using resize image size: {resize_image_size}")
         if is_cuda():
diff --git a/src/constants.py b/src/constants.py
index 542ed0f..c3705ef 100644
--- a/src/constants.py
+++ b/src/constants.py
@@ -22,6 +22,7 @@
     IMAGE_SIZE[1] * GRID[1],
 )  # (height, width)
 BEV_IMG_SIZE = (500, 500)
+BEV_AND_FRONT_CAM_IMG_SIZE = (500, 1388)
 
 GRID_POSITIONS = {
     "CAM_FRONT_LEFT": (0, 0),
diff --git a/src/data/basic_dataset.py b/src/data/basic_dataset.py
index f436148..a7a6cc4 100644
--- a/src/data/basic_dataset.py
+++ b/src/data/basic_dataset.py
@@ -49,6 +49,7 @@ def __init__(
         message_format: MessageFormat,
         split="train",
         add_augmented=False,
+        front_cam=False,
         add_kois=False,
         add_bev=False,
         use_grid=False,
@@ -61,6 +62,7 @@ def __init__(
     ):
         self.message_format = message_format
         self.split = split
+        self.front_cam = front_cam
         self.use_reasoning = use_reasoning
         self.use_grid = use_grid
         self.add_bev = add_bev
@@ -83,7 +85,7 @@ def __init__(
             data = generate_yolo_kois(data)
             if add_bev:
                 data = get_calibration(data)
-                data = generate_bevs(data)
+                data = generate_bevs(data, front_cam=front_cam)
             data = normalise_key_object_infos(data, resize_factor, use_grid)
 
         if use_grid:
@@ -200,6 +202,7 @@ def __getitem__(self, idx):
                 resize_factor=self.resize_factor,
                 use_grid=self.use_grid,
                 add_bev=self.add_bev,
+                front_cam=self.front_cam,
                 use_reasoning=self.use_reasoning,
             )
             if self.system_prompt_provider
diff --git a/src/data/generate_bev.py b/src/data/generate_bev.py
index fd2fba1..98506d8 100644
--- a/src/data/generate_bev.py
+++ b/src/data/generate_bev.py
@@ -246,13 +246,16 @@ def get_camera_score(x, y, camera_name):
     return score1 > score2
 
 
-def generate_bevs(data):
+def generate_bevs(data, front_cam: bool = False):
     bev_dir.mkdir(parents=True, exist_ok=True)
 
     for scene_id, scene_obj in tqdm(data.items(), desc="Generating BEVs"):
         for key_frame_id, key_frame in scene_obj["key_frames"].items():
             image_paths = key_frame["image_paths"]
-            image_name = f"{scene_id}_{key_frame_id}__BEV.jpg"
+            if front_cam:
+                image_name = f"{scene_id}_{key_frame_id}__BEV_FRONT_CAM.jpg"
+            else:
+                image_name = f"{scene_id}_{key_frame_id}__BEV.jpg"
             bev_path = bev_dir / image_name
             image_paths["BEV"] = "../nuscenes/samples/BEV/" + image_name
 
@@ -267,6 +270,16 @@ def generate_bevs(data):
                     kois=kois,
                     calibration=calibration,
                 )
-                cv2.imwrite(bev_path, bev_img)
+                if front_cam:
+                    front_image = cv2.imread(image_paths["CAM_FRONT"])
+                    target_height = min(front_image.shape[0], bev_img.shape[0])
+                    front_aspect = front_image.shape[1] / front_image.shape[0]
+                    front_width = int(target_height * front_aspect)
+                    front_resized = cv2.resize(front_image, (front_width, target_height))
+                    bev_resized = cv2.resize(bev_img, (bev_img.shape[1], target_height))
+                    combined_img = np.hstack([front_resized, bev_resized])
+                    cv2.imwrite(bev_path, combined_img)
+                else:
+                    cv2.imwrite(bev_path, bev_img)
                 logger.debug(f"Saved bev image: {bev_img}")
     return data
diff --git a/src/data/system_prompts.py b/src/data/system_prompts.py
index c0de28b..85b070d 100644
--- a/src/data/system_prompts.py
+++ b/src/data/system_prompts.py
@@ -13,7 +13,7 @@ def __init__(self, config_path=None):
                 self.prompts = yaml.safe_load(f)
 
     def get_approach_prompt(
-        self, resize_factor: float, use_grid: bool = False, use_reasoning: bool = False, add_bev: bool = False
+        self, resize_factor: float, use_grid: bool = False, use_reasoning: bool = False, add_bev: bool = False, front_cam: bool = False
     ) -> str:
         approach = self.prompts.get("approach_prompt", {})
         prompt = approach.get("base", "You are an autonomous driving assistant. ")
@@ -25,9 +25,12 @@ def get_approach_prompt(
                 "enabled",
                 f"You are provided with a grid of images with size {im_size[1], im_size[0]} of the current situation. Starting from the upper left, the upper row shows images from the 'FRONT_LEFT', 'FRONT' and 'FRONT_RIGHT' cameras respectively. Starting from the bottom left, the lower row shows images from the 'BACK_LEFT', 'BACK' and 'BACK_RIGHT' cameras respectively. ",
             )
-        elif add_bev:
+        elif add_bev and not front_cam:
             im_size = get_resize_image_size(resize_factor, bev=True)
-            prompt += f"You are provided with a birds eye view image with size {im_size[1], im_size[0]} of the vehicle and the sorrounding objects. The ego vehicle is marked in red, vehicles are marked in yellow and predestrians are marked in blue. Each of the objects is associated with an id, that corresponds to the id given in the list of key object infos. E.g. a vehicle with the id 'c1' would correspond to a key object '<c1,CAM_FRONT,200,400>'"
+            prompt += f"You are provided with a birds eye view image with size {im_size[1], im_size[0]} of the vehicle and the surrounding objects. The ego vehicle is marked in red, vehicles are marked in yellow and pedestrians are marked in blue. Each of the objects is associated with an id, that corresponds to the id given in the list of key object infos. E.g. a vehicle with the id 'c1' would correspond to a key object '<c1,CAM_FRONT,200,400>'. This view should provide you with a good overview of the objects surrounding the vehicle and their relative distance. "
+        elif add_bev and front_cam:
+            im_size = get_resize_image_size(resize_factor, bev=True, front_cam=True)
+            prompt += f"You are provided with the front view of the car together with a birds eye view image with size {im_size[1], im_size[0]} of the vehicle and the surrounding objects. The ego vehicle is marked in red, vehicles are marked in yellow and pedestrians are marked in blue. Each of the objects is associated with an id, that corresponds to the id given in the list of key object infos. E.g. a vehicle with the id 'c1' would correspond to a key object '<c1,CAM_FRONT,200,400>'. This view should provide you with a good overview of the objects surrounding the vehicle and their relative distance. "
         else:
             im_size = get_resize_image_size(resize_factor)
             prompt += grid_prompts.get(
diff --git a/src/eval/eval_models.py b/src/eval/eval_models.py
index d2eb361..9669b00 100644
--- a/src/eval/eval_models.py
+++ b/src/eval/eval_models.py
@@ -29,6 +29,7 @@ def evaluate_model(
     use_system_prompt: bool = False,
     system_prompt_config_path: Optional[str] = None,
     use_reasoning: bool = False,
+    front_cam: bool = False,
     add_kois: bool = False,
     add_bev: bool = False,
     approach_name: Optional[str] = None,
@@ -39,6 +40,7 @@ def evaluate_model(
     dataset = DriveLMImageDataset(
         message_format=engine.message_formatter,
         split=dataset_split,
+        front_cam=front_cam,
         add_kois=add_kois,
         add_bev=add_bev,
         use_grid=use_grid,
diff --git a/src/utils/approach.py b/src/utils/approach.py
index c2075d0..7cba317 100644
--- a/src/utils/approach.py
+++ b/src/utils/approach.py
@@ -3,6 +3,7 @@
 
 def get_approach_kwargs(approaches: List[str]) -> Dict[str, Any]:
     approach_kwargs_map = {
+        "front_cam": {"front_cam": True},
         "image_grid": {"use_grid": True},
         "descriptor_qas": {"use_augmented": True},
         "add_kois": {"add_kois": True},
diff --git a/src/utils/utils.py b/src/utils/utils.py
index 2a58633..09f4aae 100644
--- a/src/utils/utils.py
+++ b/src/utils/utils.py
@@ -6,7 +6,7 @@
 import torch
 from torch.utils.data import Dataset, Subset
 
-from src.constants import BEV_IMG_SIZE, GRID_IMG_SIZE, IMAGE_SIZE, GRID_POSITIONS
+from src.constants import BEV_IMG_SIZE, BEV_AND_FRONT_CAM_IMG_SIZE, GRID_IMG_SIZE, IMAGE_SIZE, GRID_POSITIONS
 from src.data.query_item import QueryItem
 from src.utils.logger import get_logger
 
@@ -136,11 +136,13 @@ def tuple_mul(t: Tuple[float, float], scalar: float) -> Tuple[float, float]:
     return (t[0] * scalar, t[1] * scalar)
 
 
-def get_resize_image_size(resize_factor: float, grid: bool = False, bev: bool = False) -> Tuple[int, int]:
+def get_resize_image_size(resize_factor: float, grid: bool = False, bev: bool = False, front_cam: bool = False) -> Tuple[int, int]:
     if grid:
         size = tuple_mul(GRID_IMG_SIZE, resize_factor)
-    elif bev:
+    elif bev and not front_cam:
         size = tuple_mul(BEV_IMG_SIZE, resize_factor)
+    elif bev and front_cam:
+        size = tuple_mul(BEV_AND_FRONT_CAM_IMG_SIZE, resize_factor)
     else:
         size = tuple_mul(IMAGE_SIZE, resize_factor)
     return tuple_cast(size, int)

From 926f961d957a384bb00c0a9d8d1a9ecd52738980 Mon Sep 17 00:00:00 2001
From: Veit Laule <83905032+vDawgg@users.noreply.github.com>
Date: Wed, 6 Aug 2025 12:34:24 +0200
Subject: [PATCH 14/16] Larger model (#107)

---
 src/data/basic_dataset.py       | 4 +++-
 src/models/qwen_vl_inference.py | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/data/basic_dataset.py b/src/data/basic_dataset.py
index a7a6cc4..0d8fef4 100644
--- a/src/data/basic_dataset.py
+++ b/src/data/basic_dataset.py
@@ -124,7 +124,9 @@ def __init__(
                     else None
                 )
 
-                camera_calibration = scene_obj[key_frame_id]["camera_calibration"]
+                camera_calibration = None
+                if split=="val" and add_kois and add_bev:
+                    camera_calibration = scene_obj[key_frame_id]["camera_calibration"]
 
                 qas = scene_obj[key_frame_id]["QA"]
 
diff --git a/src/models/qwen_vl_inference.py b/src/models/qwen_vl_inference.py
index 6487604..342c497 100644
--- a/src/models/qwen_vl_inference.py
+++ b/src/models/qwen_vl_inference.py
@@ -17,7 +17,7 @@
 class QwenVLInferenceEngine(BaseInferenceEngine):
     def __init__(
         self,
-        processor_path: str = "Qwen/Qwen2.5-VL-3B-Instruct",
+        processor_path: str = "Qwen/Qwen2.5-VL-7B-Instruct",
         model_path: Optional[str] = None,
         use_4bit: bool = False,
         torch_dtype: Optional[torch.dtype] = None,

From a9774bfd9dcf331abdece5e59e899d1d2d4095b6 Mon Sep 17 00:00:00 2001
From: Veit Laule <83905032+vDawgg@users.noreply.github.com>
Date: Wed, 6 Aug 2025 18:40:40 +0200
Subject: [PATCH 15/16] Apply suggestions from code review

Co-authored-by: csiemssen <100309871+csiemssen@users.noreply.github.com>
---
 src/data/basic_dataset.py | 4 ++--
 src/data/load_dataset.py  | 9 +++++++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/data/basic_dataset.py b/src/data/basic_dataset.py
index 0d8fef4..24d0559 100644
--- a/src/data/basic_dataset.py
+++ b/src/data/basic_dataset.py
@@ -81,7 +81,7 @@ def __init__(
         if split == "train" and add_augmented:
             data = generate_descriptor_qas(data)
 
-        if split == "val" and add_kois:
+        if (split == "val" or split == "test") and add_kois:
             data = generate_yolo_kois(data)
             if add_bev:
                 data = get_calibration(data)
@@ -102,7 +102,7 @@ def __init__(
                         drivelm_dir,
                         image_paths["GRID"],
                     )
-                elif add_bev:
+                elif add_kois and add_bev:
                     image_path = os.path.join(
                         drivelm_dir,
                         image_paths["BEV"],
diff --git a/src/data/load_dataset.py b/src/data/load_dataset.py
index 46f37ef..8aa030e 100644
--- a/src/data/load_dataset.py
+++ b/src/data/load_dataset.py
@@ -43,12 +43,21 @@ def get_ds(split: str) -> None:
             id="1fsVP7jOpvChcpoXVdypaZ4HREX1gA7As",
             output=os.path.join(drivelm_dir, "v1_1_val_nus_q_only.json"),
         )
+    get_nuscenes_ds()
+
+
+def get_nuscenes_ds():
     out_name = os.path.join(nuscenes_dir, "nuscenes_json.zip")
     gdown.download(
         id="1sqW1y2k346mtLCQnO0NAab3sEzxUyQ_d",
         output=out_name,
     )
     shutil.unpack_archive(out_name, nuscenes_dir)
+    gdown.download(
+        id="1sqW1y2k346mtLCQnO0NAab3sEzxUyQ_d",
+        output=out_name,
+    )
+    shutil.unpack_archive(out_name, nuscenes_dir)
 
 
 def load_dataset(split: str) -> dict:

From 1067af05693bb60ce433b32e3624f13cdefc5a4c Mon Sep 17 00:00:00 2001
From: Veit Laule <83905032+vDawgg@users.noreply.github.com>
Date: Fri, 8 Aug 2025 14:25:22 +0200
Subject: [PATCH 16/16] Update training (#108)

---
 src/data/basic_dataset.py         |  3 +++
 src/reasoning/reasoning_engine.py |  2 +-
 src/train/train_qwen.py           | 11 ++++++++---
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/data/basic_dataset.py b/src/data/basic_dataset.py
index 24d0559..fb97071 100644
--- a/src/data/basic_dataset.py
+++ b/src/data/basic_dataset.py
@@ -76,6 +76,9 @@ def __init__(
         data = load_dataset(split)
 
         if split == "train":
+            if add_bev:
+                data = get_calibration(data)
+                data = generate_bevs(data, front_cam=front_cam)
             data = normalise_key_object_infos(data, resize_factor, use_grid)
 
         if split == "train" and add_augmented:
diff --git a/src/reasoning/reasoning_engine.py b/src/reasoning/reasoning_engine.py
index 02f11aa..eb79c85 100644
--- a/src/reasoning/reasoning_engine.py
+++ b/src/reasoning/reasoning_engine.py
@@ -37,7 +37,7 @@ def process_batch(self, batch_items: List[QueryItem]) -> List[QueryItem]:
                     image_path=item.image_path,
                     qa_id=f"{item.qa_id}_reasoning",
                     qa_type=item.qa_type,
-                    key_object_info=item.key_object_info,  # note not available in eval mode
+                    key_object_info=item.key_object_info,
                     system_prompt=item.system_prompt,
                 )
                 desc_item.formatted_message = desc_item.format_message(
diff --git a/src/train/train_qwen.py b/src/train/train_qwen.py
index 36c8e0c..6f9655a 100644
--- a/src/train/train_qwen.py
+++ b/src/train/train_qwen.py
@@ -41,7 +41,7 @@ class TrainingArguments(transformers.TrainingArguments):
     cache_dir: Optional[str] = field(default=None)
     optim: str = field(default="adamw_torch")
     model_max_length: int = field(
-        default=512,
+        default=1028,
         metadata={
             "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
         },
@@ -242,7 +242,6 @@ def create_optimizer(self):
     return self.optimizer
 
 
-# TODO: Look into the deepspeed config
 def train(
     approach_name: str,
     resize_factor: float,
@@ -252,6 +251,9 @@ def train(
     use_augmented: bool = False,
     use_reasoning: bool = False,
     use_system_prompt: bool = False,
+    add_kois: bool = False,
+    add_bev: bool = False,
+    front_cam: bool = False,
     **kwargs,
 ):
     name = approach_name + datetime.now().strftime("%H:%M:%S-%m-%d-%Y%")
@@ -310,6 +312,9 @@ def collator(batch: Any):
     dataset = DriveLMImageDataset(
         engine.training_message_formatter,
         split="train",
+        front_cam=front_cam,
+        add_kois=add_kois,
+        add_bev=add_bev,
         use_grid=use_grid,
         add_augmented=use_augmented,
         use_reasoning=use_reasoning,
@@ -317,7 +322,7 @@ def collator(batch: Any):
         resize_factor=resize_factor,
     )
     if test_set_size is not None:
-        dataset = create_subset(dataset, int(test_set_size))
+        dataset = create_subset(dataset, int(test_set_size), equal_distribution=True)
     dataset = [item.formatted_message for item in dataset]
 
     engine.load_model(flash_attn=False)