From bd712781d485607e67767c7a19a0c3004c14387f Mon Sep 17 00:00:00 2001
From: Veit Laule <veit.laule@gmail.com>
Date: Sat, 26 Jul 2025 14:13:39 +0200
Subject: [PATCH 01/25] get camera calibration for query items

---
 requirements.txt                   |  1 +
 src/data/basic_dataset.py          |  7 +++
 src/data/get_sensor_calibration.py | 72 ++++++++++++++++++++++++++++++
 src/data/query_item.py             |  2 +
 4 files changed, 82 insertions(+)
 create mode 100644 src/data/get_sensor_calibration.py

diff --git a/requirements.txt b/requirements.txt
index 8e0bfb3..d7f6619 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,3 +13,4 @@ gdown~=5.2.0
 pre-commit~=4.2.0
 peft~=0.15.2
 trl~=0.18.1
+polars==1.31.0
diff --git a/src/data/basic_dataset.py b/src/data/basic_dataset.py
index c3c37a8..0ee73c0 100644
--- a/src/data/basic_dataset.py
+++ b/src/data/basic_dataset.py
@@ -5,6 +5,7 @@
 
 from src.constants import drivelm_dir
 from src.data.generate_reasoning_context import generate_reasoning_context
+from src.data.get_sensor_calibration import get_sample_data_and_calibrated_camera_lf, get_camera_calibration
 from src.data.load_dataset import load_dataset
 from src.data.message_formats import MessageFormat
 from src.data.query_item import QueryItem
@@ -48,6 +49,7 @@ def __init__(
         self.use_reasoning = use_reasoning
         self.use_grid = use_grid
         self.use_system_prompt = use_system_prompt
+        self.calibration_lf = get_sample_data_and_calibrated_camera_lf()
 
         data = load_dataset(
             split,
@@ -114,6 +116,7 @@ def __init__(
                             "qa": remove_nones(qa),
                             "qa_type": qa_types[i],
                             "id": scene_id + "_" + key_frame_id + "_" + str(i),
+                            "key_frame_id": key_frame_id,
                             "key_object_info": key_object_infos
                             if qa_types[i] != "perception"
                             else None,
@@ -132,6 +135,7 @@ def __len__(self):
 
     def __getitem__(self, idx):
         qa = self.qas[idx]
+        key_frame_id = qa["key_frame_id"]
         question = qa["qa"]["Q"]
         answer = qa["qa"]["A"]
         tags = qa["qa"].get("tag", [])
@@ -147,6 +151,8 @@ def __getitem__(self, idx):
             else None
         )
 
+        camera_calibration = get_camera_calibration(self.calibration_lf, key_frame_id)
+
         query_item = QueryItem(
             question=question,
             image_path=image_path,
@@ -156,6 +162,7 @@ def __getitem__(self, idx):
             key_object_info=key_object_info,
             system_prompt=system_prompt,
             ground_truth_answer=answer,
+            camera_calibration=camera_calibration,
         )
 
         if self.use_reasoning and self.split == "train":
diff --git a/src/data/get_sensor_calibration.py b/src/data/get_sensor_calibration.py
new file mode 100644
index 0000000..07916b3
--- /dev/null
+++ b/src/data/get_sensor_calibration.py
@@ -0,0 +1,72 @@
+import polars as pl
+
+from src.constants import nuscenes_dir
+
+
+def get_sample_data_and_calibrated_camera_lf() -> pl.LazyFrame:
+    """Return a LazyFrame joining key-frame sample_data rows to camera calibration.
+
+    Calibration rows are first joined with their sensor.json row (suffix "_sensor").
+    """
+    sample_data_lf = (
+        pl.read_json(nuscenes_dir / "sample_data.json")
+        .lazy()
+        .filter(pl.col("is_key_frame") == True)  # noqa: E712
+        .select(["token", "sample_token", "calibrated_sensor_token"])
+    )
+    calibrated_camera_lf = pl.read_json(nuscenes_dir / "calibrated_sensor.json").lazy()
+    calibrated_camera_lf = calibrated_camera_lf.filter(
+        # `.list.len()` is the per-row list length; `Expr.len()` counts the column's rows.
+        pl.col("camera_intrinsic").list.len() != 0
+    )
+    sensor_lf = pl.read_json(nuscenes_dir / "sensor.json").lazy()
+    calibrated_camera_with_sensor_type_lf = calibrated_camera_lf.join(
+        sensor_lf, left_on="sensor_token", right_on="token", suffix="_sensor"
+    )
+    return sample_data_lf.join(
+        calibrated_camera_with_sensor_type_lf,
+        left_on="calibrated_sensor_token",
+        right_on="token",
+        suffix="_calibrated",
+    )
+
+
+cameras = [  # channel names that get_camera_calibration looks up for each key frame
+    "CAM_FRONT",
+    "CAM_FRONT_LEFT",
+    "CAM_FRONT_RIGHT",
+    "CAM_BACK",
+    "CAM_BACK_LEFT",
+    "CAM_BACK_RIGHT",
+]
+
+# TODO: Think about using np arrays here instead
+class CameraCalibration:  # plain holder for one camera's calibration values
+    camera_intrinsic: list[list[float]]  # filled from the `camera_intrinsic` column
+    translation: list[float]  # filled from the `translation` column
+    rotation: list[float]  # filled from the `rotation` column
+
+    def __init__(self, camera_intrinsic, translation, rotation):  # stores the values as given
+        self.camera_intrinsic = camera_intrinsic
+        self.translation = translation
+        self.rotation = rotation
+
+
+def get_camera_calibration(lf: pl.LazyFrame, key_frame_id) -> dict[str, CameraCalibration]:
+    """Return the calibration of every camera in `cameras` for one key frame.
+
+    Raises ValueError unless each camera has exactly one row for `key_frame_id`.
+    """
+    # Collect the key frame's rows once instead of re-running the lazy query per camera.
+    frame_df = lf.filter(pl.col("sample_token") == key_frame_id).collect()
+    calibration_per_camera = {}
+    for cam in cameras:
+        rows = frame_df.filter(pl.col("channel") == cam).to_dicts()
+        if len(rows) != 1:  # explicit raise: a bare assert is stripped under `python -O`
+            raise ValueError(f"{len(rows)} calibration rows for {cam} in {key_frame_id}")
+        calibration_per_camera[cam] = CameraCalibration(
+            camera_intrinsic=rows[0]["camera_intrinsic"],
+            translation=rows[0]["translation"],
+            rotation=rows[0]["rotation"],
+        )
+    return calibration_per_camera
diff --git a/src/data/query_item.py b/src/data/query_item.py
index a138034..a2a644e 100644
--- a/src/data/query_item.py
+++ b/src/data/query_item.py
@@ -1,6 +1,7 @@
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Tuple
 
+from src.data.get_sensor_calibration import CameraCalibration
 from src.data.message_formats import MessageFormat
 
 
@@ -11,6 +12,7 @@ class QueryItem:
     qa_id: str
     qa_type: str
     tags: List[str]
+    camera_calibration: CameraCalibration
     key_object_info: Optional[Dict[str, Any]] = None
     system_prompt: str = None
     ground_truth_answer: Optional[str] = None

From 6104fe99f15edcea3f5d54146c7cbe2af9176c67 Mon Sep 17 00:00:00 2001
From: Veit Laule <veit.laule@gmail.com>
Date: Sat, 26 Jul 2025 14:42:07 +0200
Subject: [PATCH 02/25] add download link to nuscenes json

---
 src/data/load_dataset.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/data/load_dataset.py b/src/data/load_dataset.py
index 1f0afe3..48778dd 100644
--- a/src/data/load_dataset.py
+++ b/src/data/load_dataset.py
@@ -47,6 +47,12 @@ def get_ds(split: str) -> None:
             id="1fsVP7jOpvChcpoXVdypaZ4HREX1gA7As",
             output=os.path.join(drivelm_dir, "v1_1_val_nus_q_only.json"),
         )
+    out_name = os.path.join(nuscenes_dir, "nuscenes_json.zip")
+    gdown.download(
+        id="1sqW1y2k346mtLCQnO0NAab3sEzxUyQ_d",
+        output=out_name,
+    )
+    extract_children(out_name, nuscenes_dir)
 
 
 def load_dataset(

From 82c92963accc55f1f05f3d23bebd9b05a277fc90 Mon Sep 17 00:00:00 2001
From: csiemssen <100309871+csiemssen@users.noreply.github.com>
Date: Fri, 25 Jul 2025 15:00:40 +0200
Subject: [PATCH 03/25] add milestone 3 and report to readme (#94)

---
 README.md | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index d6a709c..38e22a3 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,16 @@
-## ⚒️ Tools
-#### Zotero
-Citation/Research Manager
-[Group](https://www.zotero.org/groups/5975647/app-ras-driving-with-language)
-
-## 📚 DriveLM Challenge
+## 🏆 DriveLM Challenge
 - [Challenge Website](https://opendrivelab.com/challenge2024/#driving_with_language)
 - [GitHub Repository](https://github.com/OpenDriveLab/DriveLM)
 - [Team Google Form](https://docs.google.com/forms/d/e/1FAIpQLSef_L4L9jXV_88pXkuFmaloifhRuFjVARbjsV-8GWETc6aNCA/viewform)
 
-
-## 💬 Milestone Presentations
+## 📄 Presentations and Report
 - [Milestone 1](https://docs.google.com/presentation/d/13reSKMykn5WhVyi5zi5oK5OygVjTZljeMWflJejQZlw/edit?slide=id.g32bc6f01e94_0_43#slide=id.g32bc6f01e94_0_43)
 - [Milestone 2](https://docs.google.com/presentation/d/1suusmSruqXyRdfvViq1NKfDEqTpH5-M9w7zgh7HDCAo/edit?slide=id.g32bc6f01e94_0_74#slide=id.g32bc6f01e94_0_74)
+- [Milestone 3](https://docs.google.com/presentation/d/1Hpav8SiMT5LqfAGE8KdR5SxGAkOWBNx9zaBSD1SdWUQ/edit?slide=id.g32bc6f01e94_0_74#slide=id.g32bc6f01e94_0_74)
+- [Report](https://www.overleaf.com/project/6877602e966f2ddf5c867888)
+
+#### ⚒️ Tools
+ - [Zotero](https://www.zotero.org/groups/5975647/app-ras-driving-with-language)
 
 ## Setup
 - Download the [NuScenes](https://github.com/OpenDriveLab/DriveLM/tree/main/challenge) training and validation datasets, and place them together in the `data/nuscenes` directory

From 094e05908f980869437d404769ccae2bcd861c2f Mon Sep 17 00:00:00 2001
From: Veit Laule <83905032+vDawgg@users.noreply.github.com>
Date: Sun, 27 Jul 2025 13:32:07 +0200
Subject: [PATCH 04/25] yolo test pipeline for KOI generation with quality
 check (#100)

---
 .gitignore                         |  2 ++
 main.py                            |  1 +
 merge_model_and_adapter.py         |  4 ++-
 requirements.txt                   |  1 +
 src/data/basic_dataset.py          |  4 ++-
 src/data/generate_yolo_kois.py     | 40 ++++++++++++++++++++++++++++++
 src/data/load_dataset.py           |  5 ++++
 src/data/message_formats.py        | 17 +++++++------
 src/eval/eval_models.py            |  2 ++
 src/train/train_qwen.py            |  1 +
 src/utils/approach.py              |  1 +
 src/utils/intern_vl_image_utils.py |  8 ++----
 src/utils/logger.py                |  4 +--
 tests/test_message_format.py       |  6 ++---
 14 files changed, 74 insertions(+), 22 deletions(-)
 create mode 100644 src/data/generate_yolo_kois.py

diff --git a/.gitignore b/.gitignore
index 21d1fbb..74149e9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -199,3 +199,5 @@ backup/
 
 # ignore vscode config
 .vscode/
+
+yolo11*
diff --git a/main.py b/main.py
index 59f6df7..6463c1a 100644
--- a/main.py
+++ b/main.py
@@ -28,6 +28,7 @@
             "front_cam",
             "image_grid",
             "descriptor_qas",
+            "add_kois",
             "reasoning",
             "system_prompt",
         ],
diff --git a/merge_model_and_adapter.py b/merge_model_and_adapter.py
index 4615297..2388d01 100644
--- a/merge_model_and_adapter.py
+++ b/merge_model_and_adapter.py
@@ -16,7 +16,9 @@
 )
 args = parser.parse_args()
 
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    "Qwen/Qwen2.5-VL-3B-Instruct"
+)
 
 model = PeftModel.from_pretrained(
     model=model,
diff --git a/requirements.txt b/requirements.txt
index d7f6619..200f3dc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,3 +14,4 @@ pre-commit~=4.2.0
 peft~=0.15.2
 trl~=0.18.1
 polars==1.31.0
+ultralytics==8.3.168
diff --git a/src/data/basic_dataset.py b/src/data/basic_dataset.py
index 0ee73c0..74c555a 100644
--- a/src/data/basic_dataset.py
+++ b/src/data/basic_dataset.py
@@ -39,6 +39,7 @@ def __init__(
         message_format: MessageFormat,
         split="train",
         add_augmented=False,
+        add_kois=False,
         use_grid=False,
         use_reasoning=False,
         use_system_prompt=False,
@@ -54,6 +55,7 @@ def __init__(
         data = load_dataset(
             split,
             add_augmented=add_augmented,
+            add_kois=add_kois,
             use_grid=use_grid,
             exclude_tags=exclude_question_tags,
         )
@@ -83,7 +85,7 @@ def __init__(
 
                 key_object_infos = (
                     scene_obj[key_frame_id]["key_object_infos"]
-                    if split == "train"
+                    if split == "train" or add_kois
                     else None
                 )
 
diff --git a/src/data/generate_yolo_kois.py b/src/data/generate_yolo_kois.py
new file mode 100644
index 0000000..4a2562a
--- /dev/null
+++ b/src/data/generate_yolo_kois.py
@@ -0,0 +1,40 @@
+import os
+
+from ultralytics import YOLO
+
+from src.constants import drivelm_dir
+
+
+def generate_yolo_kois(data):
+    """Attach YOLO detections to every key frame as ``key_object_infos``.
+
+    Each detection is stored under a ``<c{n},{camera},{x},{y}>`` descriptor that
+    maps to ``{"Category": <class name>}``.
+
+    Args:
+        data: Mapping of scenes; each scene's ``key_frames`` entries provide an
+            ``image_paths`` mapping from camera name to image path.
+
+    Returns:
+        The same ``data`` object, updated in place.
+    """
+    # One detector instance is reused for all images.
+    detector = YOLO("yolo11n.pt")
+    for scene in data.values():
+        for frame in scene["key_frames"].values():
+            object_infos = {}
+            # Detections are numbered consecutively across all cameras of a frame.
+            detection_no = 0
+            for camera, rel_path in frame["image_paths"].items():
+                for res in detector(os.path.join(drivelm_dir, rel_path)):
+                    boxes = res.boxes
+                    for xywh, cls in zip(boxes.xywh, boxes.cls.int()):
+                        detection_no += 1
+                        # The first two xywh values are the descriptor coordinates.
+                        descriptor = f"<c{detection_no},{camera},{xywh[0]},{xywh[1]}>"
+                        object_infos[descriptor] = {
+                            "Category": res.names[cls.item()],
+                        }
+            frame["key_object_infos"] = object_infos
+
+    return data
diff --git a/src/data/load_dataset.py b/src/data/load_dataset.py
index 48778dd..9f44e0c 100644
--- a/src/data/load_dataset.py
+++ b/src/data/load_dataset.py
@@ -16,6 +16,7 @@
 from src.data.generate_descriptor_qas import (
     generate_descriptor_qas,
 )
+from src.data.generate_yolo_kois import generate_yolo_kois
 from src.utils.logger import get_logger
 from src.utils.utils import extract_children
 
@@ -58,6 +59,7 @@ def get_ds(split: str) -> None:
 def load_dataset(
     split: str,
     add_augmented: bool = False,
+    add_kois: bool = False,
     use_grid: bool = False,
     exclude_tags: List[int] = [],
 ):
@@ -87,6 +89,9 @@ def load_dataset(
     if split == "train" and add_augmented:
         data = generate_descriptor_qas(data)
 
+    if split == "val" and add_kois:
+        data = generate_yolo_kois(data)
+
     if use_grid:
         data = create_image_grid_dataset(data)
 
diff --git a/src/data/message_formats.py b/src/data/message_formats.py
index f4e7231..6d63cf4 100644
--- a/src/data/message_formats.py
+++ b/src/data/message_formats.py
@@ -29,19 +29,13 @@ def format(
         content = []
         if system_prompt:
             content.append({"type": "text", "text": system_prompt})
-        content.append({"type": "text", "text": "Question: " + question})
-        content.append(
-            {
-                "type": "image",
-                "image": f"file://{image_path}",
-            }
-        )
 
         if key_object_info:
             content.append(
                 {
                     "type": "text",
-                    "text": "Key object infos:\n" + key_object_info.__str__(),
+                    "text": "List of objects in the scene:\n"
+                    + key_object_info.__str__(),
                 }
             )
 
@@ -52,6 +46,13 @@ def format(
                 )
                 content.append({"type": "text", "text": f"Context Answer: {context_a}"})
 
+        content.append({"type": "text", "text": "Question: " + question})
+        content.append(
+            {
+                "type": "image",
+                "image": f"file://{image_path}",
+            }
+        )
         return {
             "role": "user",
             "content": content,
diff --git a/src/eval/eval_models.py b/src/eval/eval_models.py
index 9c203f6..cd22780 100644
--- a/src/eval/eval_models.py
+++ b/src/eval/eval_models.py
@@ -23,11 +23,13 @@ def evaluate_model(
     use_grid: bool = False,
     use_system_prompt: bool = False,
     use_reasoning: bool = False,
+    add_kois: bool = False,
     approach_name: Optional[str] = None,
 ):
     dataset = DriveLMImageDataset(
         message_format=engine.message_formatter,
         split=dataset_split,
+        add_kois=add_kois,
         use_grid=use_grid,
         use_system_prompt=use_system_prompt,
         use_reasoning=use_reasoning,
diff --git a/src/train/train_qwen.py b/src/train/train_qwen.py
index f97c6be..77bb01a 100644
--- a/src/train/train_qwen.py
+++ b/src/train/train_qwen.py
@@ -252,6 +252,7 @@ def train(
     use_augmented: bool = False,
     use_reasoning: bool = False,
     use_system_prompt: bool = False,
+    **kwargs,
 ):
     name = approach_name + datetime.now().strftime("%H:%M:%S-%m-%d-%Y%")
 
diff --git a/src/utils/approach.py b/src/utils/approach.py
index 7aa21ce..651ebf2 100644
--- a/src/utils/approach.py
+++ b/src/utils/approach.py
@@ -5,6 +5,7 @@ def get_approach_kwargs(approaches: List[str]) -> Dict[str, Any]:
     approach_kwargs_map = {
         "image_grid": {"use_grid": True},
         "descriptor_qas": {"use_augmented": True},
+        "add_kois": {"add_kois": True},
         "reasoning": {"use_reasoning": True},
         "system_prompt": {"use_system_prompt": True},
         # Add more approaches here as needed
diff --git a/src/utils/intern_vl_image_utils.py b/src/utils/intern_vl_image_utils.py
index 646cb34..e805423 100644
--- a/src/utils/intern_vl_image_utils.py
+++ b/src/utils/intern_vl_image_utils.py
@@ -11,9 +11,7 @@
 def build_transform(input_size: int):
     return T.Compose(
         [
-            T.Lambda(
-                lambda img: img.convert("RGB") if img.mode != "RGB" else img
-            ),
+            T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
             T.Resize(
                 (input_size, input_size),
                 interpolation=InterpolationMode.BICUBIC,
@@ -24,9 +22,7 @@ def build_transform(input_size: int):
     )
 
 
-def find_closest_aspect_ratio(
-    aspect_ratio, target_ratios, width, height, image_size
-):
+def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
     best_ratio_diff = float("inf")
     best_ratio = (1, 1)
     area = width * height
diff --git a/src/utils/logger.py b/src/utils/logger.py
index 0071fa3..2caa869 100644
--- a/src/utils/logger.py
+++ b/src/utils/logger.py
@@ -11,9 +11,7 @@
 root_logger = logging.getLogger()
 root_logger.setLevel(logging.INFO)
 
-formatter = logging.Formatter(
-    "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-)
+formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 
 stream_handler = logging.StreamHandler(sys.stdout)
 stream_handler.setFormatter(formatter)
diff --git a/tests/test_message_format.py b/tests/test_message_format.py
index f9fa58c..e9debe8 100644
--- a/tests/test_message_format.py
+++ b/tests/test_message_format.py
@@ -28,14 +28,14 @@ def test_format_of_qwen_message(self):
             "role": "user",
             "content": [
                 {"type": "text", "text": system_prompt},
-                {"type": "text", "text": "Question: " + question},
-                {"type": "image", "image": "file:///path/to/your/image.jpg"},
                 {
                     "type": "text",
-                    "text": "Key object infos:\n{'object': 'car', 'color': 'red'}",
+                    "text": "List of objects in the scene:\n{'object': 'car', 'color': 'red'}",
                 },
                 {"type": "text", "text": "Context Question: What is this?"},
                 {"type": "text", "text": "Context Answer: This is a car."},
+                {"type": "text", "text": "Question: " + question},
+                {"type": "image", "image": "file:///path/to/your/image.jpg"},
             ],
         }
         self.assertEqual(

From 432ce688aeedc01bffc7ea6ba5176a821c828b6d Mon Sep 17 00:00:00 2001
From: Veit Laule <veit.laule@gmail.com>
Date: Wed, 30 Jul 2025 14:22:21 +0200
Subject: [PATCH 05/25] initial (convoluted) bev generation

---
 main.py                            |   2 +-
 src/constants.py                   |   1 +
 src/data/basic_dataset.py          |   9 +-
 src/data/generate_bev.py           | 361 +++++++++++++++++++++++++++++
 src/data/generate_yolo_kois.py     |  11 +-
 src/data/get_sensor_calibration.py |   9 +-
 src/data/load_dataset.py           |   7 +
 7 files changed, 389 insertions(+), 11 deletions(-)
 create mode 100644 src/data/generate_bev.py

diff --git a/main.py b/main.py
index 6463c1a..2baf380 100644
--- a/main.py
+++ b/main.py
@@ -44,7 +44,7 @@
         "--dataset_split",
         help="The dataset split to use for training / evaluation.",
         type=str,
-        choices=["train", "val"],
+        choices=["train", "val", "test"],
         default="val",
     )
     parser.add_argument(
diff --git a/src/constants.py b/src/constants.py
index 770ee4b..737b39d 100644
--- a/src/constants.py
+++ b/src/constants.py
@@ -5,6 +5,7 @@
 drivelm_dir = data_dir / "drivelm"
 nuscenes_dir = data_dir / "nuscenes"
 grid_dir = nuscenes_dir / "samples" / "GRID"
+bev_dir = nuscenes_dir / "samples" / "BEV"
 drivelm_train_json = drivelm_dir / "v1_1_train_nus.json"
 drivelm_val_json = drivelm_dir / "v1_1_val_nus_q_only.json"
 drivelm_test_json = drivelm_dir / "v1_1_test_nus.json"
diff --git a/src/data/basic_dataset.py b/src/data/basic_dataset.py
index 74c555a..9d24fc8 100644
--- a/src/data/basic_dataset.py
+++ b/src/data/basic_dataset.py
@@ -5,7 +5,6 @@
 
 from src.constants import drivelm_dir
 from src.data.generate_reasoning_context import generate_reasoning_context
-from src.data.get_sensor_calibration import get_sample_data_and_calibrated_camera_lf, get_camera_calibration
 from src.data.load_dataset import load_dataset
 from src.data.message_formats import MessageFormat
 from src.data.query_item import QueryItem
@@ -50,7 +49,6 @@ def __init__(
         self.use_reasoning = use_reasoning
         self.use_grid = use_grid
         self.use_system_prompt = use_system_prompt
-        self.calibration_lf = get_sample_data_and_calibrated_camera_lf()
 
         data = load_dataset(
             split,
@@ -89,6 +87,8 @@ def __init__(
                     else None
                 )
 
+                camera_calibration = scene_obj[key_frame_id]["camera_calibration"]
+
                 qas = scene_obj[key_frame_id]["QA"]
 
                 qas_perception = qas["perception"]
@@ -119,6 +119,7 @@ def __init__(
                             "qa_type": qa_types[i],
                             "id": scene_id + "_" + key_frame_id + "_" + str(i),
                             "key_frame_id": key_frame_id,
+                            "camera_calibration": camera_calibration,
                             "key_object_info": key_object_infos
                             if qa_types[i] != "perception"
                             else None,
@@ -137,10 +138,10 @@ def __len__(self):
 
     def __getitem__(self, idx):
         qa = self.qas[idx]
-        key_frame_id = qa["key_frame_id"]
         question = qa["qa"]["Q"]
         answer = qa["qa"]["A"]
         tags = qa["qa"].get("tag", [])
+        camera_calibration = qa["camera_calibration"]
         key_object_info = qa["key_object_info"]
         image_path = qa["image_path"]
         system_prompt = (
@@ -153,8 +154,6 @@ def __getitem__(self, idx):
             else None
         )
 
-        camera_calibration = get_camera_calibration(self.calibration_lf, key_frame_id)
-
         query_item = QueryItem(
             question=question,
             image_path=image_path,
diff --git a/src/data/generate_bev.py b/src/data/generate_bev.py
new file mode 100644
index 0000000..6ae92c2
--- /dev/null
+++ b/src/data/generate_bev.py
@@ -0,0 +1,361 @@
+import cv2
+import os
+import numpy as np
+from scipy.spatial.transform import Rotation as R_scipy
+from tqdm import tqdm
+
+from src.constants import bev_dir, drivelm_dir
+from src.data.get_sensor_calibration import CameraCalibration
+from src.utils.utils import get_logger
+
+
+logger = get_logger(__name__)
+
+
+def generate_bev_from_detections(
+        calibration: dict[str, CameraCalibration], 
+        kois: dict,
+    ) -> np.ndarray:
+    """
+    Generates a Bird's-Eye View (BEV) map from detected objects for a keyframe,
+    using nuScenes camera calibration information.
+
+    Args:
+        calibration: A dictionary where keys are camera names (e.g., 'CAM_FRONT')
+                     and values are CameraCalibration objects.
+        kois: A dictionary where keys contain camera names and values contain
+              object detection information including 2d_bbox and Category.
+    Returns:
+        A NumPy array representing the BEV map (H, W, 3).
+    """
+    bev_map_res_m_per_pixel = 0.1
+    bev_map_x_range = 50.0
+    bev_map_y_range = 50.0
+
+    # --- BEV Map Initialization ---
+    # Calculate min/max extents based on ranges to center ego (0,0)
+    x_min_m = -bev_map_x_range / 2.0
+    y_min_m = -bev_map_y_range / 2.0
+
+    bev_map_width_pixels = int(bev_map_x_range / bev_map_res_m_per_pixel)
+    bev_map_height_pixels = int(bev_map_y_range / bev_map_res_m_per_pixel)
+
+    bev_map = np.zeros((bev_map_height_pixels, bev_map_width_pixels, 3), dtype=np.uint8)
+    bev_map.fill(20)  # Dark background for the BEV map
+
+    all_projected_objects = []
+
+    total_items = 0
+    for camera_name, cam_calib in calibration.items():
+        current_camera_kois = [koi_val for koi_key, koi_val in kois.items() if camera_name in koi_key]
+        current_camera_boxes = [koi["2d_bbox"] for koi in current_camera_kois]
+        total_items += len(current_camera_boxes)
+        current_camera_names = [koi["Category"] for koi in current_camera_kois]
+
+        K = np.array(cam_calib.camera_intrinsic, dtype=np.float64)
+        t_camera_to_ego = np.array(cam_calib.translation, dtype=np.float64) # (x, y, z)
+        q_camera_to_ego = np.array(cam_calib.rotation, dtype=np.float64) # (w, x, y, z)
+
+        # 1. Convert quaternion to rotation matrix: R_ego_from_camera
+        # nuScenes quaternion is (w, x, y, z) -> scipy Rotation.from_quat expects (x, y, z, w)
+        r_ego_from_camera_scipy = R_scipy.from_quat([q_camera_to_ego[1], q_camera_to_ego[2], q_camera_to_ego[3], q_camera_to_ego[0]])
+        R_ego_from_camera = r_ego_from_camera_scipy.as_matrix() # 3x3 rotation matrix from camera to ego
+
+        for i in range(len(current_camera_boxes)):
+            bbox = current_camera_boxes[i]
+            obj_name = current_camera_names[i]
+
+            # Use the bottom-center of the 2D bounding box as the ground contact point heuristic.
+            x1, y1, x2, y2 = bbox
+            bottom_center_2d = np.array([(x1 + x2) / 2, y2], dtype=np.float64)
+
+            # --- Project 2D image point back to 3D on the ground plane (Z=0 in ego frame) ---
+
+            # Convert 2D image point to a 3D ray direction in the camera frame (normalized coordinates).
+            uv_hom = np.array([bottom_center_2d[0], bottom_center_2d[1], 1.0], dtype=np.float64).reshape(3, 1)
+            K_inv = np.linalg.inv(K)
+            ray_direction_camera_frame = np.dot(K_inv, uv_hom).flatten()
+
+            # Transform the ray from the camera frame to the ego vehicle frame.
+            ray_origin_ego = t_camera_to_ego
+            ray_direction_ego = np.dot(R_ego_from_camera, ray_direction_camera_frame)
+
+            # Intersect the ray with the ground plane (Z_ego = 0).
+            if np.isclose(ray_direction_ego[2], 0.0):
+                continue # Ray is parallel or near-parallel to ground plane
+
+            lam = -ray_origin_ego[2] / ray_direction_ego[2]
+
+            # Ensure the intersection point is in front of the camera (positive lambda).
+            if lam < 0:
+                continue
+
+            point_3d_ego = ray_origin_ego + lam * ray_direction_ego
+
+            # Store the projected object's information
+            projected_object_info = {
+                'class': obj_name,
+                'x_ego': point_3d_ego[0],
+                'y_ego': point_3d_ego[1],
+                'z_ego': point_3d_ego[2], # Should be close to 0
+                'camera_name': camera_name,
+                'original_bbox': bbox
+            }
+            all_projected_objects.append(projected_object_info)
+
+    logger.debug(f"Total objects detected across all cameras: {total_items}")
+    logger.debug(f"Total objects after initial projection: {len(all_projected_objects)}")
+    
+    # --- Remove Duplicate Objects ---
+    # Group objects by spatial proximity and class, keep the one with best visibility
+    unique_objects = []
+    proximity_threshold = 2.0  # meters - objects within this distance are considered duplicates
+    duplicates_removed = 0
+    
+    for obj in all_projected_objects:
+        is_duplicate = False
+        for unique_obj in unique_objects:
+            # Check if objects are of same class and spatially close
+            if (obj['class'] == unique_obj['class'] and
+                np.sqrt((obj['x_ego'] - unique_obj['x_ego'])**2 + 
+                       (obj['y_ego'] - unique_obj['y_ego'])**2) < proximity_threshold):
+                
+                # Keep the object from the camera that provides better view
+                # Prefer front cameras for forward objects, side cameras for side objects, etc.
+                current_distance = np.sqrt(obj['x_ego']**2 + obj['y_ego']**2)
+                unique_distance = np.sqrt(unique_obj['x_ego']**2 + unique_obj['y_ego']**2)
+                
+                # Replace if current object is closer or from a more appropriate camera
+                if (current_distance < unique_distance or 
+                    _is_better_camera_view(obj, unique_obj)):
+                    unique_objects.remove(unique_obj)
+                    unique_objects.append(obj)
+                else:
+                    duplicates_removed += 1
+                is_duplicate = True
+                break
+        
+        if not is_duplicate:
+            unique_objects.append(obj)
+
+    logger.debug(f"Total objects after duplicate removal: {len(unique_objects)} (removed {duplicates_removed} duplicates)")
+
+    # --- Render Projected Objects onto the BEV Map ---
+    for obj_info in unique_objects:
+        x_ego = obj_info['x_ego']
+        y_ego = obj_info['y_ego']
+        obj_class = obj_info['class']
+
+        # Convert ego coordinates (meters) to BEV map pixel coordinates.
+        # In nuScenes coordinate system: X+ is right, Y+ is forward, Z+ is up
+        # BEV map: columns represent X (left-right), rows represent Y (forward-back)
+        
+        # Ego X range: [x_min_m, x_max_m] -> BEV columns: [0, bev_map_width_pixels-1]
+        col_bev = int((x_ego - x_min_m) / bev_map_res_m_per_pixel)
+        # Ego Y range: [y_min_m, y_max_m] -> BEV rows: [bev_map_height_pixels-1, 0] (inverted)
+        # Y+ (forward) should appear at top of image (lower row indices)
+        row_bev = int(bev_map_height_pixels - 1 - ((y_ego - y_min_m) / bev_map_res_m_per_pixel))
+
+        # Ensure projected point is within the defined BEV map boundaries
+        if 0 <= col_bev < bev_map_width_pixels and 0 <= row_bev < bev_map_height_pixels:
+            if 'car' in obj_class.lower() or 'vehicle' in obj_class.lower() or \
+               'truck' in obj_class.lower() or 'bus' in obj_class.lower() or \
+               'trailer' in obj_class.lower() or 'construction_vehicle' in obj_class.lower():
+                car_width_bev = int(2.0 / bev_map_res_m_per_pixel)
+                car_length_bev = int(4.5 / bev_map_res_m_per_pixel)
+                color = (0, 255, 255) # Yellow (BGR)
+
+                cv2.rectangle(bev_map,
+                              (col_bev - car_width_bev // 2, row_bev - car_length_bev // 2),
+                              (col_bev + car_width_bev // 2, row_bev + car_length_bev // 2),
+                              color, -1)
+                cv2.putText(bev_map, 'Car', (col_bev - car_width_bev // 2, row_bev - car_length_bev // 2 - 5),
+                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
+
+            elif 'pedestrian' in obj_class.lower() or 'person' in obj_class.lower():
+                ped_width_bev = int(0.6 / bev_map_res_m_per_pixel) # Approx 0.6m wide
+                ped_length_bev = int(0.6 / bev_map_res_m_per_pixel) # Approx 0.6m long
+                color = (255, 0, 0) # Blue (BGR)
+
+                cv2.rectangle(bev_map,
+                              (col_bev - ped_width_bev // 2, row_bev - ped_length_bev // 2),
+                              (col_bev + ped_width_bev // 2, row_bev + ped_length_bev // 2),
+                              color, -1)
+                cv2.putText(bev_map, 'Ped', (col_bev - ped_width_bev // 2, row_bev - ped_length_bev // 2 - 5),
+                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
+
+            elif 'traffic_cone' in obj_class.lower():
+                cone_radius_bev = int(0.3 / bev_map_res_m_per_pixel / 2)
+                color = (0, 0, 255) # Red (BGR)
+                cv2.circle(bev_map, (col_bev, row_bev), cone_radius_bev, color, -1)
+                cv2.putText(bev_map, 'Cone', (col_bev - cone_radius_bev, row_bev - cone_radius_bev - 5),
+                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
+
+            elif 'barrier' in obj_class.lower():
+                barrier_width_bev = int(0.2 / bev_map_res_m_per_pixel)
+                barrier_length_bev = int(1.5 / bev_map_res_m_per_pixel)
+                color = (128, 128, 128) # Grey (BGR)
+                cv2.rectangle(bev_map,
+                              (col_bev - barrier_width_bev // 2, row_bev - barrier_length_bev // 2),
+                              (col_bev + barrier_width_bev // 2, row_bev + barrier_length_bev // 2),
+                              color, -1)
+                cv2.putText(bev_map, 'Barrier', (col_bev - barrier_width_bev // 2, row_bev - barrier_length_bev // 2 - 5),
+                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
+
+    # --- Draw Ego Vehicle ---
+    ego_x_m = 0.0 # Ego vehicle is at (0,0) in its own frame
+    ego_y_m = 0.0
+
+    # Convert ego (0,0) to BEV map pixel coordinates
+    ego_col_bev = int((ego_x_m - x_min_m) / bev_map_res_m_per_pixel)
+    ego_row_bev = int(bev_map_height_pixels - 1 - ((ego_y_m - y_min_m) / bev_map_res_m_per_pixel))
+    
+    # Ego vehicle dimensions (approximate typical car size)
+    ego_width_m = 2.0
+    ego_length_m = 5.0
+    ego_width_pixels = int(ego_width_m / bev_map_res_m_per_pixel)
+    ego_length_pixels = int(ego_length_m / bev_map_res_m_per_pixel)
+    
+    ego_color = (0, 0, 255) # Red (BGR)
+    cv2.rectangle(bev_map,
+                  (ego_col_bev - ego_width_pixels // 2, ego_row_bev - ego_length_pixels // 2),
+                  (ego_col_bev + ego_width_pixels // 2, ego_row_bev + ego_length_pixels // 2),
+                  ego_color, -1)
+    cv2.putText(bev_map, 'Ego', (ego_col_bev - ego_width_pixels // 2, ego_row_bev - ego_length_pixels // 2 - 5),
+                cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
+    
+    # Draw a forward arrow for ego vehicle (Y+ is forward, should point towards top of image)
+    # Arrow points from center towards smaller row index (upward in image = forward in world)
+    arrow_end_y = ego_row_bev - ego_length_pixels // 2 - 10
+    cv2.arrowedLine(bev_map, (ego_col_bev, ego_row_bev), (ego_col_bev, arrow_end_y), (0, 255, 0), 2)
+
+    # Add orientation verification markers
+    _add_orientation_markers(bev_map, bev_map_width_pixels, bev_map_height_pixels)
+
+    # Validate BEV orientation with front camera objects
+    _validate_bev_orientation(unique_objects, bev_map_height_pixels, bev_map_res_m_per_pixel, y_min_m)
+
+    return bev_map
+
+
+def _add_orientation_markers(bev_map, width, height):
+    """
+    Add orientation markers to verify BEV coordinate system.
+    Front should be at top, back at bottom, left on left side, right on right side.
+    """
+    marker_color = (255, 255, 255)  # White
+    font = cv2.FONT_HERSHEY_SIMPLEX
+    font_scale = 0.6
+    thickness = 2
+    
+    # Add directional labels
+    cv2.putText(bev_map, 'FRONT', (width//2 - 30, 25), font, font_scale, marker_color, thickness)
+    cv2.putText(bev_map, 'BACK', (width//2 - 25, height - 10), font, font_scale, marker_color, thickness)
+    cv2.putText(bev_map, 'LEFT', (10, height//2), font, font_scale, marker_color, thickness)
+    cv2.putText(bev_map, 'RIGHT', (width - 60, height//2), font, font_scale, marker_color, thickness)
+    
+    # Add coordinate axes
+    center_x, center_y = width//2, height//2
+    axis_length = 30
+    
+    # X-axis (horizontal, positive to the right)
+    cv2.arrowedLine(bev_map, (center_x, center_y), (center_x + axis_length, center_y), (0, 255, 255), 2)
+    cv2.putText(bev_map, 'X+', (center_x + axis_length + 5, center_y + 5), font, 0.4, (0, 255, 255), 1)
+    
+    # Y-axis (vertical, positive upward/forward)
+    cv2.arrowedLine(bev_map, (center_x, center_y), (center_x, center_y - axis_length), (255, 0, 255), 2)
+    cv2.putText(bev_map, 'Y+', (center_x + 5, center_y - axis_length - 5), font, 0.4, (255, 0, 255), 1)
+
+
+def _validate_bev_orientation(objects, bev_height_pixels, resolution, y_min_m):
+    """
+    Validate that the BEV orientation is correct by checking if CAM_FRONT objects 
+    appear in the upper part of the image (smaller row indices).
+    """
+    front_objects = [obj for obj in objects if 'FRONT' in obj['camera_name'] and obj['y_ego'] > 0]
+    back_objects = [obj for obj in objects if 'BACK' in obj['camera_name'] and obj['y_ego'] < 0]
+    
+    if front_objects:
+        front_rows = []
+        for obj in front_objects:
+            y_ego = obj['y_ego']
+            row_bev = int(bev_height_pixels - 1 - ((y_ego - y_min_m) / resolution))
+            front_rows.append(row_bev)
+        
+        avg_front_row = np.mean(front_rows)
+        logger.debug(f"CAM_FRONT objects average row: {avg_front_row:.1f} (should be < {bev_height_pixels/2} for upper half)")
+        
+        if avg_front_row > bev_height_pixels / 2:
+            logger.warning("CAM_FRONT objects appear in lower half of BEV - check coordinate system!")
+    
+    if back_objects:
+        back_rows = []
+        for obj in back_objects:
+            y_ego = obj['y_ego']
+            row_bev = int(bev_height_pixels - 1 - ((y_ego - y_min_m) / resolution))
+            back_rows.append(row_bev)
+        
+        avg_back_row = np.mean(back_rows)
+        logger.debug(f"CAM_BACK objects average row: {avg_back_row:.1f} (should be > {bev_height_pixels/2} for lower half)")
+        
+        if avg_back_row < bev_height_pixels / 2:
+            logger.warning("CAM_BACK objects appear in upper half of BEV - check coordinate system!")
+
+
+def _is_better_camera_view(obj1, obj2):
+    """
+    Determine if obj1 has a better camera view than obj2 based on object position and camera type.
+    """
+    x1, y1 = obj1['x_ego'], obj1['y_ego']
+    x2, y2 = obj2['x_ego'], obj2['y_ego']
+    cam1 = obj1['camera_name']
+    cam2 = obj2['camera_name']
+    
+    # Score cameras based on how well they align with object position
+    def get_camera_score(x, y, camera_name):
+        score = 0
+        # Front cameras are best for forward objects (y > 0)
+        if 'FRONT' in camera_name and y > 0:
+            score += 3
+        # Back cameras are best for rear objects (y < 0)
+        elif 'BACK' in camera_name and y < 0:
+            score += 3
+        # Left cameras are best for left objects (x < 0)
+        if 'LEFT' in camera_name and x < 0:
+            score += 2
+        # Right cameras are best for right objects (x > 0)
+        elif 'RIGHT' in camera_name and x > 0:
+            score += 2
+        # Center cameras (FRONT, BACK) are good for center objects
+        if camera_name in ['CAM_FRONT', 'CAM_BACK'] and abs(x) < 5:
+            score += 1
+        return score
+    
+    score1 = get_camera_score(x1, y1, cam1)
+    score2 = get_camera_score(x2, y2, cam2)
+    
+    return score1 > score2
+
+
+def generate_bevs(data):
+    for scene_id, scene_obj in tqdm(data.items(), desc="Generating BEVs"):
+        for key_frame_id, key_frame in scene_obj["key_frames"].items():
+            image_paths = key_frame["image_paths"]
+            image_name = f"{scene_id}_{key_frame_id}__BEV.jpg"
+            bev_path = bev_dir / image_name
+            image_paths["BEV"] = "../nuscenes/samples/BEV/" + image_name
+
+            #if not bev_path.exists():
+            image_paths = {
+                key: os.path.join(drivelm_dir, path)
+                for key, path in image_paths.items()
+            }
+            kois = key_frame["key_object_infos"]
+            calibration = key_frame["camera_calibration"]
+            bev_img = generate_bev_from_detections(
+                kois=kois,
+                calibration=calibration,
+            )
+            cv2.imwrite(bev_path, bev_img)
+            logger.debug(f"Saved bev image: {bev_img}")
+    return data
diff --git a/src/data/generate_yolo_kois.py b/src/data/generate_yolo_kois.py
index 4a2562a..b70669c 100644
--- a/src/data/generate_yolo_kois.py
+++ b/src/data/generate_yolo_kois.py
@@ -5,7 +5,7 @@
 from src.constants import drivelm_dir
 
 
-def generate_yolo_kois(data):
+def generate_yolo_kois(data, max_results_per_camera:int = 5):
     model = YOLO("yolo11n.pt")
     for _, scene_obj in data.items():
         for _, key_frame in scene_obj["key_frames"].items():
@@ -13,9 +13,10 @@ def generate_yolo_kois(data):
             i = 0
             kois = []
             for camera, image_path in image_paths_raw.items():
-                results = model(os.path.join(drivelm_dir, image_path))
+                results = model(os.path.join(drivelm_dir, image_path))[:max_results_per_camera]
+                bbox = [xyxy for res in results for xyxy in res.boxes.xyxy.cpu().tolist()]
                 center_points = [
-                    (xywh[0], xywh[1]) for res in results for xywh in res.boxes.xywh
+                    (xywh[0], xywh[1]) for res in results for xywh in res.boxes.xywh.cpu()
                 ]
                 categories = [
                     res.names[cls.item()]
@@ -28,13 +29,15 @@ def generate_yolo_kois(data):
                         (
                             f"<c{i},{camera},{center_points[j][0]},{center_points[j][1]}>",
                             categories[j],
+                            bbox[j]
                         )
                     )
             key_frame["key_object_infos"] = {
                 descriptor: {
                     "Category": category,
+                    "2d_bbox": bbox,
                 }
-                for descriptor, category in kois
+                for descriptor, category, bbox in kois
             }
 
     return data
diff --git a/src/data/get_sensor_calibration.py b/src/data/get_sensor_calibration.py
index 07916b3..18fffbb 100644
--- a/src/data/get_sensor_calibration.py
+++ b/src/data/get_sensor_calibration.py
@@ -1,4 +1,5 @@
 import polars as pl
+from tqdm import tqdm
 
 from src.constants import nuscenes_dir
 
@@ -40,7 +41,6 @@ def get_sample_data_and_calibrated_camera_lf() -> pl.LazyFrame:
     "CAM_BACK_RIGHT",
 ]
 
-# TODO: Think about using np arrays here instead
 class CameraCalibration:
     camera_intrinsic: list[list[float]]
     translation: list[float]
@@ -70,3 +70,10 @@ def get_camera_calibration(lf: pl.LazyFrame, key_frame_id) -> dict[str, CameraCa
             rotation=calibration["rotation"][0].to_list(),
         )
     return calibration_per_camera
+
+def get_calibration(data: dict):
+    lf = get_sample_data_and_calibrated_camera_lf()
+    for _, scene in tqdm(data.items(), desc="Fetching camera calibration data"):
+        for key_frame_id, key_frame in scene["key_frames"].items():
+            key_frame["camera_calibration"] = get_camera_calibration(lf, key_frame_id)
+    return data
diff --git a/src/data/load_dataset.py b/src/data/load_dataset.py
index 9f44e0c..6249833 100644
--- a/src/data/load_dataset.py
+++ b/src/data/load_dataset.py
@@ -1,4 +1,5 @@
 import os
+import json
 from json import load
 from typing import List
 
@@ -13,10 +14,12 @@
 )
 from src.data.create_image_grid_dataset import create_image_grid_dataset
 from src.data.extract_test_dataset import extract_data
+from src.data.generate_bev import generate_bevs
 from src.data.generate_descriptor_qas import (
     generate_descriptor_qas,
 )
 from src.data.generate_yolo_kois import generate_yolo_kois
+from src.data.get_sensor_calibration import get_calibration
 from src.utils.logger import get_logger
 from src.utils.utils import extract_children
 
@@ -95,4 +98,8 @@ def load_dataset(
     if use_grid:
         data = create_image_grid_dataset(data)
 
+    # TODO: We should add a switch for this.
+    data = get_calibration(data)
+    data = generate_bevs(data)
+
     return data

From 6c157c3ec4f299b4c9953255a3ce0efbe923edb1 Mon Sep 17 00:00:00 2001
From: Veit Laule <veit.laule@gmail.com>
Date: Wed, 30 Jul 2025 16:59:32 +0200
Subject: [PATCH 06/25] Ensure correct axes, increase threshold for similarity
 detection, add TODOs

---
 src/data/basic_dataset.py      |  5 +++++
 src/data/generate_bev.py       | 25 +++++++++++++++----------
 src/data/generate_yolo_kois.py |  2 +-
 src/data/load_dataset.py       |  3 +--
 4 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/src/data/basic_dataset.py b/src/data/basic_dataset.py
index e44c07b..78b5c7a 100644
--- a/src/data/basic_dataset.py
+++ b/src/data/basic_dataset.py
@@ -5,10 +5,12 @@
 
 from src.constants import drivelm_dir
 from src.data.create_image_grid_dataset import create_image_grid_dataset
+from src.data.generate_bev import generate_bevs
 from src.data.generate_descriptor_qas import (
     generate_descriptor_qas,
 )
 from src.data.generate_reasoning_context import generate_reasoning_context
+from src.data.get_sensor_calibration import get_calibration
 from src.data.load_dataset import load_dataset
 from src.data.message_formats import MessageFormat
 from src.data.query_item import QueryItem
@@ -77,6 +79,9 @@ def __init__(
 
         if split == "val" and add_kois:
             data = generate_yolo_kois(data)
+            data = get_calibration(data)
+            data = generate_bevs(data)
+            # NOTE: We need to make sure this is executed AFTER we need actual image locations
             data = normalise_key_object_infos(data, resize_factor, use_grid)
 
         if use_grid:
diff --git a/src/data/generate_bev.py b/src/data/generate_bev.py
index 6ae92c2..59469fc 100644
--- a/src/data/generate_bev.py
+++ b/src/data/generate_bev.py
@@ -6,7 +6,7 @@
 
 from src.constants import bev_dir, drivelm_dir
 from src.data.get_sensor_calibration import CameraCalibration
-from src.utils.utils import get_logger
+from src.utils.utils import get_logger, key_object_str_to_dict
 
 
 logger = get_logger(__name__)
@@ -47,7 +47,7 @@ def generate_bev_from_detections(
 
     total_items = 0
     for camera_name, cam_calib in calibration.items():
-        current_camera_kois = [koi_val for koi_key, koi_val in kois.items() if camera_name in koi_key]
+        current_camera_kois = [koi_val for koi_key, koi_val in kois.items() if camera_name == key_object_str_to_dict(koi_key)["camera"]]
         current_camera_boxes = [koi["2d_bbox"] for koi in current_camera_kois]
         total_items += len(current_camera_boxes)
         current_camera_names = [koi["Category"] for koi in current_camera_kois]
@@ -95,8 +95,8 @@ def generate_bev_from_detections(
             # Store the projected object's information
             projected_object_info = {
                 'class': obj_name,
-                'x_ego': point_3d_ego[0],
-                'y_ego': point_3d_ego[1],
+                'x_ego': point_3d_ego[1],
+                'y_ego': point_3d_ego[0],
                 'z_ego': point_3d_ego[2], # Should be close to 0
                 'camera_name': camera_name,
                 'original_bbox': bbox
@@ -109,7 +109,7 @@ def generate_bev_from_detections(
     # --- Remove Duplicate Objects ---
     # Group objects by spatial proximity and class, keep the one with best visibility
     unique_objects = []
-    proximity_threshold = 2.0  # meters - objects within this distance are considered duplicates
+    proximity_threshold = 10.0  # meters - objects within this distance are considered duplicates
     duplicates_removed = 0
     
     for obj in all_projected_objects:
@@ -117,6 +117,7 @@ def generate_bev_from_detections(
         for unique_obj in unique_objects:
             # Check if objects are of same class and spatially close
             if (obj['class'] == unique_obj['class'] and
+                # TODO: Tune the prox threshold
                 np.sqrt((obj['x_ego'] - unique_obj['x_ego'])**2 + 
                        (obj['y_ego'] - unique_obj['y_ego'])**2) < proximity_threshold):
                 
@@ -142,7 +143,7 @@ def generate_bev_from_detections(
 
     # --- Render Projected Objects onto the BEV Map ---
     for obj_info in unique_objects:
-        x_ego = obj_info['x_ego']
+        x_ego = -obj_info['x_ego']
         y_ego = obj_info['y_ego']
         obj_class = obj_info['class']
 
@@ -156,13 +157,16 @@ def generate_bev_from_detections(
         # Y+ (forward) should appear at top of image (lower row indices)
         row_bev = int(bev_map_height_pixels - 1 - ((y_ego - y_min_m) / bev_map_res_m_per_pixel))
 
+        # TODO: Chekc the classes generated by YOLO against this so we cover everything we need
         # Ensure projected point is within the defined BEV map boundaries
         if 0 <= col_bev < bev_map_width_pixels and 0 <= row_bev < bev_map_height_pixels:
             if 'car' in obj_class.lower() or 'vehicle' in obj_class.lower() or \
                'truck' in obj_class.lower() or 'bus' in obj_class.lower() or \
                'trailer' in obj_class.lower() or 'construction_vehicle' in obj_class.lower():
-                car_width_bev = int(2.0 / bev_map_res_m_per_pixel)
-                car_length_bev = int(4.5 / bev_map_res_m_per_pixel)
+                # TODO: Adapt the size of the objects relative to the BEV size
+                #       -> The distance currently looks to small in relation to the size of the cars
+                car_width_bev = int(1.5 / bev_map_res_m_per_pixel)
+                car_length_bev = int(3.0 / bev_map_res_m_per_pixel)
                 color = (0, 255, 255) # Yellow (BGR)
 
                 cv2.rectangle(bev_map,
@@ -173,8 +177,8 @@ def generate_bev_from_detections(
                             cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
 
             elif 'pedestrian' in obj_class.lower() or 'person' in obj_class.lower():
-                ped_width_bev = int(0.6 / bev_map_res_m_per_pixel) # Approx 0.6m wide
-                ped_length_bev = int(0.6 / bev_map_res_m_per_pixel) # Approx 0.6m long
+                ped_width_bev = int(1.0 / bev_map_res_m_per_pixel)
+                ped_length_bev = int(1.0 / bev_map_res_m_per_pixel)
                 color = (255, 0, 0) # Blue (BGR)
 
                 cv2.rectangle(bev_map,
@@ -345,6 +349,7 @@ def generate_bevs(data):
             bev_path = bev_dir / image_name
             image_paths["BEV"] = "../nuscenes/samples/BEV/" + image_name
 
+            # TODO: Uncomment once done
             #if not bev_path.exists():
             image_paths = {
                 key: os.path.join(drivelm_dir, path)
diff --git a/src/data/generate_yolo_kois.py b/src/data/generate_yolo_kois.py
index 0dcc77c..3fcd571 100644
--- a/src/data/generate_yolo_kois.py
+++ b/src/data/generate_yolo_kois.py
@@ -13,7 +13,7 @@ def generate_yolo_kois(data, max_results_per_cam: int = 5):
             i = 0
             kois = []
             for camera, image_path in image_paths_raw.items():
-                results = model(os.path.join(drivelm_dir, image_path))[:max_results_per_camera]
+                results = model(os.path.join(drivelm_dir, image_path))[:max_results_per_cam]
                 bbox = [xyxy for res in results for xyxy in res.boxes.xyxy.cpu().tolist()]
                 center_points = [
                     (xywh[0], xywh[1]) for res in results for xywh in res.boxes.xywh.cpu()
diff --git a/src/data/load_dataset.py b/src/data/load_dataset.py
index 08843f2..b1e8509 100644
--- a/src/data/load_dataset.py
+++ b/src/data/load_dataset.py
@@ -1,5 +1,4 @@
 import os
-import json
 from json import load
 
 import gdown
@@ -51,7 +50,7 @@ def get_ds(split: str) -> None:
     extract_children(out_name, nuscenes_dir)
 
 
-def load_dataset(split: str):
+def load_dataset(split: str) -> dict:
     dataset_paths = {
         "train": drivelm_train_json,
         "val": drivelm_val_json,

From 18d8b9aefb5e94f5029659d0ace18986084d16ab Mon Sep 17 00:00:00 2001
From: Veit Laule <veit.laule@gmail.com>
Date: Sat, 2 Aug 2025 14:14:47 +0200
Subject: [PATCH 07/25] restrict yolo classes, mute yolo logging

---
 src/data/generate_yolo_kois.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/data/generate_yolo_kois.py b/src/data/generate_yolo_kois.py
index 3fcd571..cc89cbf 100644
--- a/src/data/generate_yolo_kois.py
+++ b/src/data/generate_yolo_kois.py
@@ -1,5 +1,6 @@
 import os
 
+from tqdm import tqdm
 from ultralytics import YOLO
 
 from src.constants import drivelm_dir
@@ -7,13 +8,18 @@
 
 def generate_yolo_kois(data, max_results_per_cam: int = 5):
     model = YOLO("yolo11n.pt")
-    for _, scene_obj in data.items():
+    for _, scene_obj in tqdm(data.items(), desc="Generating KOIs with YOLO"):
         for _, key_frame in scene_obj["key_frames"].items():
             image_paths_raw = key_frame["image_paths"]
             i = 0
             kois = []
             for camera, image_path in image_paths_raw.items():
-                results = model(os.path.join(drivelm_dir, image_path))[:max_results_per_cam]
+                results = model(
+                    os.path.join(drivelm_dir, image_path), 
+                    max_det=max_results_per_cam,
+                    classes=[0, 1, 2, 3, 5, 6, 7, 9, 11], # [person, bicycle, car, motorcycle, bus, train, truck, traffic light, stop sign]
+                    verbose=False
+                )
                 bbox = [xyxy for res in results for xyxy in res.boxes.xyxy.cpu().tolist()]
                 center_points = [
                     (xywh[0], xywh[1]) for res in results for xywh in res.boxes.xywh.cpu()

From 63ed727fb458ca58aa02c4ca564ed74b0b93f800 Mon Sep 17 00:00:00 2001
From: Veit Laule <veit.laule@gmail.com>
Date: Sat, 2 Aug 2025 14:15:25 +0200
Subject: [PATCH 08/25] Use lazyframes only for initial filtering and collect
 after join

---
 src/data/get_sensor_calibration.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/data/get_sensor_calibration.py b/src/data/get_sensor_calibration.py
index 18fffbb..5d93703 100644
--- a/src/data/get_sensor_calibration.py
+++ b/src/data/get_sensor_calibration.py
@@ -4,7 +4,7 @@
 from src.constants import nuscenes_dir
 
 
-def get_sample_data_and_calibrated_camera_lf() -> pl.LazyFrame:
+def get_sample_data_and_calibrated_camera_df() -> pl.DataFrame:
     sample_data_lf = pl.read_json(nuscenes_dir / "sample_data.json").lazy()
     sample_data_lf = sample_data_lf.filter(
         pl.col("is_key_frame") == True  # noqa: E712
@@ -29,7 +29,7 @@ def get_sample_data_and_calibrated_camera_lf() -> pl.LazyFrame:
         left_on="calibrated_sensor_token", 
         right_on="token", 
         suffix="_calibrated"
-    )
+    ).collect()
 
 
 cameras = [
@@ -52,7 +52,7 @@ def __init__(self, camera_intrinsic, translation, rotation):
         self.rotation = rotation
 
 
-def get_camera_calibration(lf: pl.LazyFrame, key_frame_id) -> dict[str, CameraCalibration]:
+def get_camera_calibration(lf: pl.DataFrame, key_frame_id) -> dict[str, CameraCalibration]:
     calibration_per_camera = {}
     for cam in cameras:
         calibration = lf.filter(
@@ -62,7 +62,7 @@ def get_camera_calibration(lf: pl.LazyFrame, key_frame_id) -> dict[str, CameraCa
             "translation",
             "rotation",
             "camera_intrinsic"
-        ).collect().to_dict()
+        ).to_dict()
         assert len(calibration["translation"]) == 1
         calibration_per_camera[cam] = CameraCalibration(
             camera_intrinsic=calibration["camera_intrinsic"][0].to_list(),
@@ -72,7 +72,7 @@ def get_camera_calibration(lf: pl.LazyFrame, key_frame_id) -> dict[str, CameraCa
     return calibration_per_camera
 
 def get_calibration(data: dict):
-    lf = get_sample_data_and_calibrated_camera_lf()
+    lf = get_sample_data_and_calibrated_camera_df()
     for _, scene in tqdm(data.items(), desc="Fetching camera calibration data"):
         for key_frame_id, key_frame in scene["key_frames"].items():
             key_frame["camera_calibration"] = get_camera_calibration(lf, key_frame_id)

From d00cf65f58da445123d60a2d334d445e9c6877e6 Mon Sep 17 00:00:00 2001
From: Veit Laule <veit.laule@gmail.com>
Date: Sat, 2 Aug 2025 14:16:25 +0200
Subject: [PATCH 09/25] Clean up, use ids from KOI in image

---
 src/data/generate_bev.py | 167 +++++++++------------------------------
 1 file changed, 36 insertions(+), 131 deletions(-)

diff --git a/src/data/generate_bev.py b/src/data/generate_bev.py
index 59469fc..cce01df 100644
--- a/src/data/generate_bev.py
+++ b/src/data/generate_bev.py
@@ -41,13 +41,15 @@ def generate_bev_from_detections(
     bev_map_height_pixels = int(bev_map_y_range / bev_map_res_m_per_pixel)
 
     bev_map = np.zeros((bev_map_height_pixels, bev_map_width_pixels, 3), dtype=np.uint8)
-    bev_map.fill(20)  # Dark background for the BEV map
+    bev_map.fill(20)
 
     all_projected_objects = []
 
     total_items = 0
     for camera_name, cam_calib in calibration.items():
-        current_camera_kois = [koi_val for koi_key, koi_val in kois.items() if camera_name == key_object_str_to_dict(koi_key)["camera"]]
+        current_keys = [koi_key for koi_key in kois.keys() if camera_name == key_object_str_to_dict(koi_key)["camera"]]
+        current_identifiers = [key_object_str_to_dict(k)["id"] for k in current_keys]
+        current_camera_kois = [kois[k] for k in current_keys]
         current_camera_boxes = [koi["2d_bbox"] for koi in current_camera_kois]
         total_items += len(current_camera_boxes)
         current_camera_names = [koi["Category"] for koi in current_camera_kois]
@@ -95,21 +97,19 @@ def generate_bev_from_detections(
             # Store the projected object's information
             projected_object_info = {
                 'class': obj_name,
-                'x_ego': point_3d_ego[1],
+                'x_ego': -point_3d_ego[1],
                 'y_ego': point_3d_ego[0],
                 'z_ego': point_3d_ego[2], # Should be close to 0
                 'camera_name': camera_name,
-                'original_bbox': bbox
+                'original_bbox': bbox,
+                'identifier': current_identifiers[i],
             }
             all_projected_objects.append(projected_object_info)
 
-    logger.debug(f"Total objects detected across all cameras: {total_items}")
-    logger.debug(f"Total objects after initial projection: {len(all_projected_objects)}")
-    
     # --- Remove Duplicate Objects ---
     # Group objects by spatial proximity and class, keep the one with best visibility
     unique_objects = []
-    proximity_threshold = 10.0  # meters - objects within this distance are considered duplicates
+    proximity_threshold = 10.0
     duplicates_removed = 0
     
     for obj in all_projected_objects:
@@ -139,72 +139,45 @@ def generate_bev_from_detections(
         if not is_duplicate:
             unique_objects.append(obj)
 
-    logger.debug(f"Total objects after duplicate removal: {len(unique_objects)} (removed {duplicates_removed} duplicates)")
-
     # --- Render Projected Objects onto the BEV Map ---
     for obj_info in unique_objects:
-        x_ego = -obj_info['x_ego']
+        x_ego = obj_info['x_ego']
         y_ego = obj_info['y_ego']
         obj_class = obj_info['class']
+        identifier = obj_info['identifier']
 
-        # Convert ego coordinates (meters) to BEV map pixel coordinates.
-        # In nuScenes coordinate system: X+ is right, Y+ is forward, Z+ is up
-        # BEV map: columns represent X (left-right), rows represent Y (forward-back)
-        
-        # Ego X range: [x_min_m, x_max_m] -> BEV columns: [0, bev_map_width_pixels-1]
         col_bev = int((x_ego - x_min_m) / bev_map_res_m_per_pixel)
-        # Ego Y range: [y_min_m, y_max_m] -> BEV rows: [bev_map_height_pixels-1, 0] (inverted)
-        # Y+ (forward) should appear at top of image (lower row indices)
         row_bev = int(bev_map_height_pixels - 1 - ((y_ego - y_min_m) / bev_map_res_m_per_pixel))
 
-        # TODO: Chekc the classes generated by YOLO against this so we cover everything we need
-        # Ensure projected point is within the defined BEV map boundaries
         if 0 <= col_bev < bev_map_width_pixels and 0 <= row_bev < bev_map_height_pixels:
-            if 'car' in obj_class.lower() or 'vehicle' in obj_class.lower() or \
-               'truck' in obj_class.lower() or 'bus' in obj_class.lower() or \
-               'trailer' in obj_class.lower() or 'construction_vehicle' in obj_class.lower():
-                # TODO: Adapt the size of the objects relative to the BEV size
-                #       -> The distance currently looks to small in relation to the size of the cars
+            if 'car' in obj_class.lower() or 'truck' in obj_class.lower() or 'bus' in obj_class.lower() or \
+               'bicycle' in obj_class.lower() or 'motorcycle' in obj_class.lower():
                 car_width_bev = int(1.5 / bev_map_res_m_per_pixel)
                 car_length_bev = int(3.0 / bev_map_res_m_per_pixel)
                 color = (0, 255, 255) # Yellow (BGR)
 
-                cv2.rectangle(bev_map,
-                              (col_bev - car_width_bev // 2, row_bev - car_length_bev // 2),
-                              (col_bev + car_width_bev // 2, row_bev + car_length_bev // 2),
-                              color, -1)
-                cv2.putText(bev_map, 'Car', (col_bev - car_width_bev // 2, row_bev - car_length_bev // 2 - 5),
-                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
+                if (0 <= (col_bev - car_width_bev) and (col_bev + car_width_bev) < bev_map_width_pixels
+                    and 0 <= (row_bev - car_length_bev) and (row_bev + car_length_bev) < bev_map_height_pixels):
+                    cv2.rectangle(bev_map,
+                                (col_bev - car_width_bev // 2, row_bev - car_length_bev // 2),
+                                (col_bev + car_width_bev // 2, row_bev + car_length_bev // 2),
+                                color, -1)
+                    cv2.putText(bev_map, identifier, (col_bev - car_width_bev // 2, row_bev - car_length_bev // 2 - 5),
+                                cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
 
-            elif 'pedestrian' in obj_class.lower() or 'person' in obj_class.lower():
+            elif 'person' in obj_class.lower():
                 ped_width_bev = int(1.0 / bev_map_res_m_per_pixel)
                 ped_length_bev = int(1.0 / bev_map_res_m_per_pixel)
                 color = (255, 0, 0) # Blue (BGR)
 
-                cv2.rectangle(bev_map,
-                              (col_bev - ped_width_bev // 2, row_bev - ped_length_bev // 2),
-                              (col_bev + ped_width_bev // 2, row_bev + ped_length_bev // 2),
-                              color, -1)
-                cv2.putText(bev_map, 'Ped', (col_bev - ped_width_bev // 2, row_bev - ped_length_bev // 2 - 5),
-                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
-
-            elif 'traffic_cone' in obj_class.lower():
-                cone_radius_bev = int(0.3 / bev_map_res_m_per_pixel / 2)
-                color = (0, 0, 255) # Red (BGR)
-                cv2.circle(bev_map, (col_bev, row_bev), cone_radius_bev, color, -1)
-                cv2.putText(bev_map, 'Cone', (col_bev - cone_radius_bev, row_bev - cone_radius_bev - 5),
-                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
-
-            elif 'barrier' in obj_class.lower():
-                barrier_width_bev = int(0.2 / bev_map_res_m_per_pixel)
-                barrier_length_bev = int(1.5 / bev_map_res_m_per_pixel)
-                color = (128, 128, 128) # Grey (BGR)
-                cv2.rectangle(bev_map,
-                              (col_bev - barrier_width_bev // 2, row_bev - barrier_length_bev // 2),
-                              (col_bev + barrier_width_bev // 2, row_bev + barrier_length_bev // 2),
-                              color, -1)
-                cv2.putText(bev_map, 'Barrier', (col_bev - barrier_width_bev // 2, row_bev - barrier_length_bev // 2 - 5),
-                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
+                if (0 <= (col_bev - ped_width_bev) and (col_bev + ped_width_bev) < bev_map_width_pixels
+                    and 0 <= (row_bev - ped_length_bev) and (row_bev + ped_length_bev) < bev_map_height_pixels):
+                    cv2.rectangle(bev_map,
+                                    (col_bev - ped_width_bev // 2, row_bev - ped_length_bev // 2),
+                                    (col_bev + ped_width_bev // 2, row_bev + ped_length_bev // 2),
+                                    color, -1)
+                    cv2.putText(bev_map, identifier, (col_bev - ped_width_bev // 2, row_bev - ped_length_bev // 2 - 5),
+                                cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
 
     # --- Draw Ego Vehicle ---
     ego_x_m = 0.0 # Ego vehicle is at (0,0) in its own frame
@@ -215,10 +188,8 @@ def generate_bev_from_detections(
     ego_row_bev = int(bev_map_height_pixels - 1 - ((ego_y_m - y_min_m) / bev_map_res_m_per_pixel))
     
     # Ego vehicle dimensions (approximate typical car size)
-    ego_width_m = 2.0
-    ego_length_m = 5.0
-    ego_width_pixels = int(ego_width_m / bev_map_res_m_per_pixel)
-    ego_length_pixels = int(ego_length_m / bev_map_res_m_per_pixel)
+    ego_width_pixels = int(1.5 / bev_map_res_m_per_pixel)
+    ego_length_pixels = int(3.0 / bev_map_res_m_per_pixel)
     
     ego_color = (0, 0, 255) # Red (BGR)
     cv2.rectangle(bev_map,
@@ -228,82 +199,17 @@ def generate_bev_from_detections(
     cv2.putText(bev_map, 'Ego', (ego_col_bev - ego_width_pixels // 2, ego_row_bev - ego_length_pixels // 2 - 5),
                 cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
     
-    # Draw a forward arrow for ego vehicle (Y+ is forward, should point towards top of image)
-    # Arrow points from center towards smaller row index (upward in image = forward in world)
-    arrow_end_y = ego_row_bev - ego_length_pixels // 2 - 10
-    cv2.arrowedLine(bev_map, (ego_col_bev, ego_row_bev), (ego_col_bev, arrow_end_y), (0, 255, 0), 2)
-
-    # Add orientation verification markers
-    _add_orientation_markers(bev_map, bev_map_width_pixels, bev_map_height_pixels)
-
-    # Validate BEV orientation with front camera objects
-    _validate_bev_orientation(unique_objects, bev_map_height_pixels, bev_map_res_m_per_pixel, y_min_m)
-
-    return bev_map
-
-
-def _add_orientation_markers(bev_map, width, height):
-    """
-    Add orientation markers to verify BEV coordinate system.
-    Front should be at top, back at bottom, left on left side, right on right side.
-    """
     marker_color = (255, 255, 255)  # White
     font = cv2.FONT_HERSHEY_SIMPLEX
     font_scale = 0.6
     thickness = 2
     
-    # Add directional labels
-    cv2.putText(bev_map, 'FRONT', (width//2 - 30, 25), font, font_scale, marker_color, thickness)
-    cv2.putText(bev_map, 'BACK', (width//2 - 25, height - 10), font, font_scale, marker_color, thickness)
-    cv2.putText(bev_map, 'LEFT', (10, height//2), font, font_scale, marker_color, thickness)
-    cv2.putText(bev_map, 'RIGHT', (width - 60, height//2), font, font_scale, marker_color, thickness)
-    
-    # Add coordinate axes
-    center_x, center_y = width//2, height//2
-    axis_length = 30
-    
-    # X-axis (horizontal, positive to the right)
-    cv2.arrowedLine(bev_map, (center_x, center_y), (center_x + axis_length, center_y), (0, 255, 255), 2)
-    cv2.putText(bev_map, 'X+', (center_x + axis_length + 5, center_y + 5), font, 0.4, (0, 255, 255), 1)
-    
-    # Y-axis (vertical, positive upward/forward)
-    cv2.arrowedLine(bev_map, (center_x, center_y), (center_x, center_y - axis_length), (255, 0, 255), 2)
-    cv2.putText(bev_map, 'Y+', (center_x + 5, center_y - axis_length - 5), font, 0.4, (255, 0, 255), 1)
-
+    cv2.putText(bev_map, 'FRONT', (bev_map_width_pixels//2 - 30, 25), font, font_scale, marker_color, thickness)
+    cv2.putText(bev_map, 'BACK', (bev_map_width_pixels//2 - 25, bev_map_height_pixels - 10), font, font_scale, marker_color, thickness)
+    cv2.putText(bev_map, 'LEFT', (10, bev_map_height_pixels//2), font, font_scale, marker_color, thickness)
+    cv2.putText(bev_map, 'RIGHT', (bev_map_width_pixels - 60, bev_map_height_pixels//2), font, font_scale, marker_color, thickness)
 
-def _validate_bev_orientation(objects, bev_height_pixels, resolution, y_min_m):
-    """
-    Validate that the BEV orientation is correct by checking if CAM_FRONT objects 
-    appear in the upper part of the image (smaller row indices).
-    """
-    front_objects = [obj for obj in objects if 'FRONT' in obj['camera_name'] and obj['y_ego'] > 0]
-    back_objects = [obj for obj in objects if 'BACK' in obj['camera_name'] and obj['y_ego'] < 0]
-    
-    if front_objects:
-        front_rows = []
-        for obj in front_objects:
-            y_ego = obj['y_ego']
-            row_bev = int(bev_height_pixels - 1 - ((y_ego - y_min_m) / resolution))
-            front_rows.append(row_bev)
-        
-        avg_front_row = np.mean(front_rows)
-        logger.debug(f"CAM_FRONT objects average row: {avg_front_row:.1f} (should be < {bev_height_pixels/2} for upper half)")
-        
-        if avg_front_row > bev_height_pixels / 2:
-            logger.warning("CAM_FRONT objects appear in lower half of BEV - check coordinate system!")
-    
-    if back_objects:
-        back_rows = []
-        for obj in back_objects:
-            y_ego = obj['y_ego']
-            row_bev = int(bev_height_pixels - 1 - ((y_ego - y_min_m) / resolution))
-            back_rows.append(row_bev)
-        
-        avg_back_row = np.mean(back_rows)
-        logger.debug(f"CAM_BACK objects average row: {avg_back_row:.1f} (should be > {bev_height_pixels/2} for lower half)")
-        
-        if avg_back_row < bev_height_pixels / 2:
-            logger.warning("CAM_BACK objects appear in upper half of BEV - check coordinate system!")
+    return bev_map
 
 
 def _is_better_camera_view(obj1, obj2):
@@ -315,7 +221,6 @@ def _is_better_camera_view(obj1, obj2):
     cam1 = obj1['camera_name']
     cam2 = obj2['camera_name']
     
-    # Score cameras based on how well they align with object position
     def get_camera_score(x, y, camera_name):
         score = 0
         # Front cameras are best for forward objects (y > 0)

From 4464c45fa0a4581258fc34b173907a28b8563a28 Mon Sep 17 00:00:00 2001
From: Veit Laule <veit.laule@gmail.com>
Date: Sat, 2 Aug 2025 14:46:53 +0200
Subject: [PATCH 10/25] add switch for bev only inference + basic system prompt

---
 main.py                    |  1 +
 src/constants.py           |  1 +
 src/data/basic_dataset.py  | 15 +++++++++++----
 src/data/generate_bev.py   | 27 +++++++++++++--------------
 src/data/system_prompts.py |  9 ++++++---
 src/eval/eval_models.py    |  2 ++
 src/utils/approach.py      |  1 +
 src/utils/utils.py         |  6 ++++--
 8 files changed, 39 insertions(+), 23 deletions(-)

diff --git a/main.py b/main.py
index 39f978f..fef943d 100644
--- a/main.py
+++ b/main.py
@@ -29,6 +29,7 @@
             "image_grid",
             "descriptor_qas",
             "add_kois",
+            "add_bev",
             "reasoning",
             "system_prompt",
         ],
diff --git a/src/constants.py b/src/constants.py
index ffde349..542ed0f 100644
--- a/src/constants.py
+++ b/src/constants.py
@@ -21,6 +21,7 @@
     IMAGE_SIZE[0] * GRID[0],
     IMAGE_SIZE[1] * GRID[1],
 )  # (height, width)
+BEV_IMG_SIZE = (500, 500)
 
 GRID_POSITIONS = {
     "CAM_FRONT_LEFT": (0, 0),
diff --git a/src/data/basic_dataset.py b/src/data/basic_dataset.py
index 78b5c7a..f436148 100644
--- a/src/data/basic_dataset.py
+++ b/src/data/basic_dataset.py
@@ -50,6 +50,7 @@ def __init__(
         split="train",
         add_augmented=False,
         add_kois=False,
+        add_bev=False,
         use_grid=False,
         use_reasoning=False,
         use_system_prompt=False,
@@ -62,6 +63,7 @@ def __init__(
         self.split = split
         self.use_reasoning = use_reasoning
         self.use_grid = use_grid
+        self.add_bev = add_bev
         self.resize_factor = resize_factor
         self.system_prompt_provider = (
             SystemPromptProvider(config_path=system_prompt_config_path)
@@ -79,9 +81,9 @@ def __init__(
 
         if split == "val" and add_kois:
             data = generate_yolo_kois(data)
-            data = get_calibration(data)
-            data = generate_bevs(data)
-            # NOTE: We need to make sure this is executed AFTER we need actual image locations
+            if add_bev:
+                data = get_calibration(data)
+                data = generate_bevs(data)
             data = normalise_key_object_infos(data, resize_factor, use_grid)
 
         if use_grid:
@@ -92,13 +94,17 @@ def __init__(
         for scene_id in data.keys():
             scene_obj = data[scene_id]["key_frames"]
             for key_frame_id in scene_obj.keys():
-                # NOTE: Only consider FRONT camera images or GRID images for now
                 image_paths = scene_obj[key_frame_id]["image_paths"]
                 if use_grid:
                     image_path = os.path.join(
                         drivelm_dir,
                         image_paths["GRID"],
                     )
+                elif add_bev:
+                    image_path = os.path.join(
+                        drivelm_dir,
+                        image_paths["BEV"],
+                    )
                 else:
                     image_path = os.path.join(
                         drivelm_dir,
@@ -193,6 +199,7 @@ def __getitem__(self, idx):
                 question=question,
                 resize_factor=self.resize_factor,
                 use_grid=self.use_grid,
+                add_bev=self.add_bev,
                 use_reasoning=self.use_reasoning,
             )
             if self.system_prompt_provider
diff --git a/src/data/generate_bev.py b/src/data/generate_bev.py
index cce01df..1d586f9 100644
--- a/src/data/generate_bev.py
+++ b/src/data/generate_bev.py
@@ -254,18 +254,17 @@ def generate_bevs(data):
             bev_path = bev_dir / image_name
             image_paths["BEV"] = "../nuscenes/samples/BEV/" + image_name
 
-            # TODO: Uncomment once done
-            #if not bev_path.exists():
-            image_paths = {
-                key: os.path.join(drivelm_dir, path)
-                for key, path in image_paths.items()
-            }
-            kois = key_frame["key_object_infos"]
-            calibration = key_frame["camera_calibration"]
-            bev_img = generate_bev_from_detections(
-                kois=kois,
-                calibration=calibration,
-            )
-            cv2.imwrite(bev_path, bev_img)
-            logger.debug(f"Saved bev image: {bev_img}")
+            if not bev_path.exists():
+                image_paths = {
+                    key: os.path.join(drivelm_dir, path)
+                    for key, path in image_paths.items()
+                }
+                kois = key_frame["key_object_infos"]
+                calibration = key_frame["camera_calibration"]
+                bev_img = generate_bev_from_detections(
+                    kois=kois,
+                    calibration=calibration,
+                )
+                cv2.imwrite(bev_path, bev_img)
+                logger.debug(f"Saved bev image: {bev_img}")
     return data
diff --git a/src/data/system_prompts.py b/src/data/system_prompts.py
index 41fc68c..c0de28b 100644
--- a/src/data/system_prompts.py
+++ b/src/data/system_prompts.py
@@ -13,20 +13,23 @@ def __init__(self, config_path=None):
                 self.prompts = yaml.safe_load(f)
 
     def get_approach_prompt(
-        self, resize_factor: float, use_grid: bool = False, use_reasoning: bool = False, 
+        self, resize_factor: float, use_grid: bool = False, use_reasoning: bool = False, add_bev: bool = False
     ) -> str:
         approach = self.prompts.get("approach_prompt", {})
         prompt = approach.get("base", "You are an autonomous driving assistant. ")
 
         grid_prompts = approach.get("use_grid", {})
         if use_grid:
-            im_size = get_resize_image_size(resize_factor, True)
+            im_size = get_resize_image_size(resize_factor, grid=True)
             prompt += grid_prompts.get(
                 "enabled",
                 f"You are provided with a grid of images with size {im_size[1], im_size[0]} of the current situation. Starting from the upper left, the upper row shows images from the 'FRONT_LEFT', 'FRONT' and 'FRONT_RIGHT' cameras respectively. Starting from the bottom left, the lower row shows images from the 'BACK_LEFT', 'BACK' and 'BACK_RIGHT' cameras respectively. ",
             )
+        elif add_bev:
+            im_size = get_resize_image_size(resize_factor, bev=True)
+            prompt += f"You are provided with a birds eye view image with size {im_size[1], im_size[0]} of the vehicle and the sorrounding objects. The ego vehicle is marked in red, vehicles are marked in yellow and predestrians are marked in blue. Each of the objects is associated with an id, that corresponds to the id given in the list of key object infos. E.g. a vehicle with the id 'c1' would correspond to a key object '<c1,CAM_FRONT,200,400>'"
         else:
-            im_size = get_resize_image_size(resize_factor, False)
+            im_size = get_resize_image_size(resize_factor)
             prompt += grid_prompts.get(
                 "disabled",
                 f"You receive a single image with size {im_size[1], im_size[0]} from the front camera. ",
diff --git a/src/eval/eval_models.py b/src/eval/eval_models.py
index 86b40b4..d2eb361 100644
--- a/src/eval/eval_models.py
+++ b/src/eval/eval_models.py
@@ -30,6 +30,7 @@ def evaluate_model(
     system_prompt_config_path: Optional[str] = None,
     use_reasoning: bool = False,
     add_kois: bool = False,
+    add_bev: bool = False,
     approach_name: Optional[str] = None,
     exclude_question_tags: List[int] = [],
     exclude_question_types: List[str] = [],
@@ -39,6 +40,7 @@ def evaluate_model(
         message_format=engine.message_formatter,
         split=dataset_split,
         add_kois=add_kois,
+        add_bev=add_bev,
         use_grid=use_grid,
         use_reasoning=use_reasoning,
         use_system_prompt=use_system_prompt,
diff --git a/src/utils/approach.py b/src/utils/approach.py
index 651ebf2..c2075d0 100644
--- a/src/utils/approach.py
+++ b/src/utils/approach.py
@@ -6,6 +6,7 @@ def get_approach_kwargs(approaches: List[str]) -> Dict[str, Any]:
         "image_grid": {"use_grid": True},
         "descriptor_qas": {"use_augmented": True},
         "add_kois": {"add_kois": True},
+        "add_bev": {"add_bev": True},
         "reasoning": {"use_reasoning": True},
         "system_prompt": {"use_system_prompt": True},
         # Add more approaches here as needed
diff --git a/src/utils/utils.py b/src/utils/utils.py
index df46d2b..2a58633 100644
--- a/src/utils/utils.py
+++ b/src/utils/utils.py
@@ -6,7 +6,7 @@
 import torch
 from torch.utils.data import Dataset, Subset
 
-from src.constants import GRID_IMG_SIZE, IMAGE_SIZE, GRID_POSITIONS
+from src.constants import BEV_IMG_SIZE, GRID_IMG_SIZE, IMAGE_SIZE, GRID_POSITIONS
 from src.data.query_item import QueryItem
 from src.utils.logger import get_logger
 
@@ -136,9 +136,11 @@ def tuple_mul(t: Tuple[float, float], scalar: float) -> Tuple[float, float]:
     return (t[0] * scalar, t[1] * scalar)
 
 
-def get_resize_image_size(resize_factor: float, grid: bool = False) -> Tuple[int, int]:
+def get_resize_image_size(resize_factor: float, grid: bool = False, bev: bool = False) -> Tuple[int, int]:
     if grid:
         size = tuple_mul(GRID_IMG_SIZE, resize_factor)
+    elif bev:
+        size = tuple_mul(BEV_IMG_SIZE, resize_factor)
     else:
         size = tuple_mul(IMAGE_SIZE, resize_factor)
     return tuple_cast(size, int)

From ab80b9ef55b6cdffc0a4a5af9064301bc41ff97c Mon Sep 17 00:00:00 2001
From: Veit Laule <veit.laule@gmail.com>
Date: Sat, 2 Aug 2025 15:07:10 +0200
Subject: [PATCH 11/25] fix extraction of nuscenes data

---
 src/data/load_dataset.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/data/load_dataset.py b/src/data/load_dataset.py
index b1e8509..46f37ef 100644
--- a/src/data/load_dataset.py
+++ b/src/data/load_dataset.py
@@ -1,4 +1,5 @@
 import os
+import shutil
 from json import load
 
 import gdown
@@ -47,7 +48,7 @@ def get_ds(split: str) -> None:
         id="1sqW1y2k346mtLCQnO0NAab3sEzxUyQ_d",
         output=out_name,
     )
-    extract_children(out_name, nuscenes_dir)
+    shutil.unpack_archive(out_name, nuscenes_dir)
 
 
 def load_dataset(split: str) -> dict:

From 8a9beebccdc0d43289b8ef186b4a80370acec8df Mon Sep 17 00:00:00 2001
From: Veit Laule <veit.laule@gmail.com>
Date: Sun, 3 Aug 2025 01:49:08 +0200
Subject: [PATCH 12/25] Ensure BEV dir creation

---
 src/data/generate_bev.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/data/generate_bev.py b/src/data/generate_bev.py
index 1d586f9..fd2fba1 100644
--- a/src/data/generate_bev.py
+++ b/src/data/generate_bev.py
@@ -247,6 +247,8 @@ def get_camera_score(x, y, camera_name):
 
 
 def generate_bevs(data):
+    bev_dir.mkdir(parents=True, exist_ok=True)
+
     for scene_id, scene_obj in tqdm(data.items(), desc="Generating BEVs"):
         for key_frame_id, key_frame in scene_obj["key_frames"].items():
             image_paths = key_frame["image_paths"]

From efeb12ac12b5e4aacc2c80557c547f03a5e8c5ec Mon Sep 17 00:00:00 2001
From: Veit Laule <veit.laule@gmail.com>
Date: Mon, 4 Aug 2025 19:22:26 +0200
Subject: [PATCH 13/25] add option to combine front cam and bev + corresponding
 prompt

---
 main.py                    |  2 +-
 src/constants.py           |  1 +
 src/data/basic_dataset.py  |  5 ++++-
 src/data/generate_bev.py   | 19 ++++++++++++++++---
 src/data/system_prompts.py |  9 ++++++---
 src/eval/eval_models.py    |  2 ++
 src/utils/approach.py      |  1 +
 src/utils/utils.py         |  8 +++++---
 8 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/main.py b/main.py
index fef943d..c19750e 100644
--- a/main.py
+++ b/main.py
@@ -90,7 +90,7 @@
         )
     elif args.eval:
         resize_image_size = get_resize_image_size(
-            resize_factor=resize_factor, grid="image_grid" in args.approach
+            resize_factor=resize_factor, grid="image_grid" in args.approach, bev="add_bev" in args.approach, front_cam="front_cam" in args.approach,
         )
         logger.debug(f"Using resize image size: {resize_image_size}")
         if is_cuda():
diff --git a/src/constants.py b/src/constants.py
index 542ed0f..c3705ef 100644
--- a/src/constants.py
+++ b/src/constants.py
@@ -22,6 +22,7 @@
     IMAGE_SIZE[1] * GRID[1],
 )  # (height, width)
 BEV_IMG_SIZE = (500, 500)
+BEV_AND_FRONT_CAM_IMG_SIZE = (500, 1388)
 
 GRID_POSITIONS = {
     "CAM_FRONT_LEFT": (0, 0),
diff --git a/src/data/basic_dataset.py b/src/data/basic_dataset.py
index f436148..a7a6cc4 100644
--- a/src/data/basic_dataset.py
+++ b/src/data/basic_dataset.py
@@ -49,6 +49,7 @@ def __init__(
         message_format: MessageFormat,
         split="train",
         add_augmented=False,
+        front_cam=False,
         add_kois=False,
         add_bev=False,
         use_grid=False,
@@ -61,6 +62,7 @@ def __init__(
     ):
         self.message_format = message_format
         self.split = split
+        self.front_cam = front_cam
         self.use_reasoning = use_reasoning
         self.use_grid = use_grid
         self.add_bev = add_bev
@@ -83,7 +85,7 @@ def __init__(
             data = generate_yolo_kois(data)
             if add_bev:
                 data = get_calibration(data)
-                data = generate_bevs(data)
+                data = generate_bevs(data, front_cam=front_cam)
             data = normalise_key_object_infos(data, resize_factor, use_grid)
 
         if use_grid:
@@ -200,6 +202,7 @@ def __getitem__(self, idx):
                 resize_factor=self.resize_factor,
                 use_grid=self.use_grid,
                 add_bev=self.add_bev,
+                front_cam=self.front_cam,
                 use_reasoning=self.use_reasoning,
             )
             if self.system_prompt_provider
diff --git a/src/data/generate_bev.py b/src/data/generate_bev.py
index fd2fba1..98506d8 100644
--- a/src/data/generate_bev.py
+++ b/src/data/generate_bev.py
@@ -246,13 +246,16 @@ def get_camera_score(x, y, camera_name):
     return score1 > score2
 
 
-def generate_bevs(data):
+def generate_bevs(data, front_cam: bool = False):
     bev_dir.mkdir(parents=True, exist_ok=True)
 
     for scene_id, scene_obj in tqdm(data.items(), desc="Generating BEVs"):
         for key_frame_id, key_frame in scene_obj["key_frames"].items():
             image_paths = key_frame["image_paths"]
-            image_name = f"{scene_id}_{key_frame_id}__BEV.jpg"
+            if front_cam:
+                image_name = f"{scene_id}_{key_frame_id}__BEV_FRONT_CAM.jpg"
+            else:
+                image_name = f"{scene_id}_{key_frame_id}__BEV.jpg"
             bev_path = bev_dir / image_name
             image_paths["BEV"] = "../nuscenes/samples/BEV/" + image_name
 
@@ -267,6 +270,16 @@ def generate_bevs(data):
                     kois=kois,
                     calibration=calibration,
                 )
-                cv2.imwrite(bev_path, bev_img)
+                if front_cam:
+                    front_image = cv2.imread(image_paths["CAM_FRONT"])
+                    target_height = min(front_image.shape[0], bev_img.shape[0])
+                    front_aspect = front_image.shape[1] / front_image.shape[0]
+                    front_width = int(target_height * front_aspect)
+                    front_resized = cv2.resize(front_image, (front_width, target_height))
+                    bev_resized = cv2.resize(bev_img, (bev_img.shape[1], target_height))
+                    combined_img = np.hstack([front_resized, bev_resized])
+                    cv2.imwrite(bev_path, combined_img)
+                else:
+                    cv2.imwrite(bev_path, bev_img)
                 logger.debug(f"Saved bev image: {bev_img}")
     return data
diff --git a/src/data/system_prompts.py b/src/data/system_prompts.py
index c0de28b..85b070d 100644
--- a/src/data/system_prompts.py
+++ b/src/data/system_prompts.py
@@ -13,7 +13,7 @@ def __init__(self, config_path=None):
                 self.prompts = yaml.safe_load(f)
 
     def get_approach_prompt(
-        self, resize_factor: float, use_grid: bool = False, use_reasoning: bool = False, add_bev: bool = False
+        self, resize_factor: float, use_grid: bool = False, use_reasoning: bool = False, add_bev: bool = False, front_cam: bool = False
     ) -> str:
         approach = self.prompts.get("approach_prompt", {})
         prompt = approach.get("base", "You are an autonomous driving assistant. ")
@@ -25,9 +25,12 @@ def get_approach_prompt(
                 "enabled",
                 f"You are provided with a grid of images with size {im_size[1], im_size[0]} of the current situation. Starting from the upper left, the upper row shows images from the 'FRONT_LEFT', 'FRONT' and 'FRONT_RIGHT' cameras respectively. Starting from the bottom left, the lower row shows images from the 'BACK_LEFT', 'BACK' and 'BACK_RIGHT' cameras respectively. ",
             )
-        elif add_bev:
+        elif add_bev and not front_cam:
             im_size = get_resize_image_size(resize_factor, bev=True)
-            prompt += f"You are provided with a birds eye view image with size {im_size[1], im_size[0]} of the vehicle and the sorrounding objects. The ego vehicle is marked in red, vehicles are marked in yellow and predestrians are marked in blue. Each of the objects is associated with an id, that corresponds to the id given in the list of key object infos. E.g. a vehicle with the id 'c1' would correspond to a key object '<c1,CAM_FRONT,200,400>'"
+            prompt += f"You are provided with a birds eye view image with size {im_size[1], im_size[0]} of the vehicle and the sorrounding objects. The ego vehicle is marked in red, vehicles are marked in yellow and predestrians are marked in blue. Each of the objects is associated with an id, that corresponds to the id given in the list of key object infos. E.g. a vehicle with the id 'c1' would correspond to a key object '<c1,CAM_FRONT,200,400>'. This view should provide you with a good overview of the objects surrounding the vehicle and their relative distance. "
+        elif add_bev and front_cam:
+            im_size = get_resize_image_size(resize_factor, bev=True, front_cam=True)
+            prompt += f"You are provided with the front view of the car together with a birds eye view image with size {im_size[1], im_size[0]} of the vehicle and the sorrounding objects. The ego vehicle is marked in red, vehicles are marked in yellow and predestrians are marked in blue. Each of the objects is associated with an id, that corresponds to the id given in the list of key object infos. E.g. a vehicle with the id 'c1' would correspond to a key object '<c1,CAM_FRONT,200,400>'. This view should provide you with a good overview of the objects surrounding the vehicle and their relative distance. "
         else:
             im_size = get_resize_image_size(resize_factor)
             prompt += grid_prompts.get(
diff --git a/src/eval/eval_models.py b/src/eval/eval_models.py
index d2eb361..9669b00 100644
--- a/src/eval/eval_models.py
+++ b/src/eval/eval_models.py
@@ -29,6 +29,7 @@ def evaluate_model(
     use_system_prompt: bool = False,
     system_prompt_config_path: Optional[str] = None,
     use_reasoning: bool = False,
+    front_cam: bool = False,
     add_kois: bool = False,
     add_bev: bool = False,
     approach_name: Optional[str] = None,
@@ -39,6 +40,7 @@ def evaluate_model(
     dataset = DriveLMImageDataset(
         message_format=engine.message_formatter,
         split=dataset_split,
+        front_cam=front_cam,
         add_kois=add_kois,
         add_bev=add_bev,
         use_grid=use_grid,
diff --git a/src/utils/approach.py b/src/utils/approach.py
index c2075d0..7cba317 100644
--- a/src/utils/approach.py
+++ b/src/utils/approach.py
@@ -3,6 +3,7 @@
 
 def get_approach_kwargs(approaches: List[str]) -> Dict[str, Any]:
     approach_kwargs_map = {
+        "front_cam": {"front_cam": True},
         "image_grid": {"use_grid": True},
         "descriptor_qas": {"use_augmented": True},
         "add_kois": {"add_kois": True},
diff --git a/src/utils/utils.py b/src/utils/utils.py
index 2a58633..09f4aae 100644
--- a/src/utils/utils.py
+++ b/src/utils/utils.py
@@ -6,7 +6,7 @@
 import torch
 from torch.utils.data import Dataset, Subset
 
-from src.constants import BEV_IMG_SIZE, GRID_IMG_SIZE, IMAGE_SIZE, GRID_POSITIONS
+from src.constants import BEV_IMG_SIZE, BEV_AND_FRONT_CAM_IMG_SIZE, GRID_IMG_SIZE, IMAGE_SIZE, GRID_POSITIONS
 from src.data.query_item import QueryItem
 from src.utils.logger import get_logger
 
@@ -136,11 +136,13 @@ def tuple_mul(t: Tuple[float, float], scalar: float) -> Tuple[float, float]:
     return (t[0] * scalar, t[1] * scalar)
 
 
-def get_resize_image_size(resize_factor: float, grid: bool = False, bev: bool = False) -> Tuple[int, int]:
+def get_resize_image_size(resize_factor: float, grid: bool = False, bev: bool = False, front_cam: bool = False) -> Tuple[int, int]:
     if grid:
         size = tuple_mul(GRID_IMG_SIZE, resize_factor)
-    elif bev:
+    elif bev and not front_cam:
         size = tuple_mul(BEV_IMG_SIZE, resize_factor)
+    elif bev and front_cam:
+        size = tuple_mul(BEV_AND_FRONT_CAM_IMG_SIZE, resize_factor)
     else:
         size = tuple_mul(IMAGE_SIZE, resize_factor)
     return tuple_cast(size, int)

From 926f961d957a384bb00c0a9d8d1a9ecd52738980 Mon Sep 17 00:00:00 2001
From: Veit Laule <83905032+vDawgg@users.noreply.github.com>
Date: Wed, 6 Aug 2025 12:34:24 +0200
Subject: [PATCH 14/25] Larger model (#107)

---
 src/data/basic_dataset.py       | 4 +++-
 src/models/qwen_vl_inference.py | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/data/basic_dataset.py b/src/data/basic_dataset.py
index a7a6cc4..0d8fef4 100644
--- a/src/data/basic_dataset.py
+++ b/src/data/basic_dataset.py
@@ -124,7 +124,9 @@ def __init__(
                     else None
                 )
 
-                camera_calibration = scene_obj[key_frame_id]["camera_calibration"]
+                camera_calibration = None
+                if split=="val" and add_kois and add_bev:
+                    camera_calibration = scene_obj[key_frame_id]["camera_calibration"]
 
                 qas = scene_obj[key_frame_id]["QA"]
 
diff --git a/src/models/qwen_vl_inference.py b/src/models/qwen_vl_inference.py
index 6487604..342c497 100644
--- a/src/models/qwen_vl_inference.py
+++ b/src/models/qwen_vl_inference.py
@@ -17,7 +17,7 @@
 class QwenVLInferenceEngine(BaseInferenceEngine):
     def __init__(
         self,
-        processor_path: str = "Qwen/Qwen2.5-VL-3B-Instruct",
+        processor_path: str = "Qwen/Qwen2.5-VL-7B-Instruct",
         model_path: Optional[str] = None,
         use_4bit: bool = False,
         torch_dtype: Optional[torch.dtype] = None,

From a9774bfd9dcf331abdece5e59e899d1d2d4095b6 Mon Sep 17 00:00:00 2001
From: Veit Laule <83905032+vDawgg@users.noreply.github.com>
Date: Wed, 6 Aug 2025 18:40:40 +0200
Subject: [PATCH 15/25] Apply suggestions from code review

Co-authored-by: csiemssen <100309871+csiemssen@users.noreply.github.com>
---
 src/data/basic_dataset.py | 4 ++--
 src/data/load_dataset.py  | 9 +++++++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/data/basic_dataset.py b/src/data/basic_dataset.py
index 0d8fef4..24d0559 100644
--- a/src/data/basic_dataset.py
+++ b/src/data/basic_dataset.py
@@ -81,7 +81,7 @@ def __init__(
         if split == "train" and add_augmented:
             data = generate_descriptor_qas(data)
 
-        if split == "val" and add_kois:
+        if (split == "val" or split == "test") and add_kois:
             data = generate_yolo_kois(data)
             if add_bev:
                 data = get_calibration(data)
@@ -102,7 +102,7 @@ def __init__(
                         drivelm_dir,
                         image_paths["GRID"],
                     )
-                elif add_bev:
+                elif add_kois and add_bev:
                     image_path = os.path.join(
                         drivelm_dir,
                         image_paths["BEV"],
diff --git a/src/data/load_dataset.py b/src/data/load_dataset.py
index 46f37ef..8aa030e 100644
--- a/src/data/load_dataset.py
+++ b/src/data/load_dataset.py
@@ -43,12 +43,21 @@ def get_ds(split: str) -> None:
             id="1fsVP7jOpvChcpoXVdypaZ4HREX1gA7As",
             output=os.path.join(drivelm_dir, "v1_1_val_nus_q_only.json"),
         )
+    get_nuscenes_ds()
+
+
+def get_nuscenes_ds():
     out_name = os.path.join(nuscenes_dir, "nuscenes_json.zip")
     gdown.download(
         id="1sqW1y2k346mtLCQnO0NAab3sEzxUyQ_d",
         output=out_name,
     )
     shutil.unpack_archive(out_name, nuscenes_dir)
+    gdown.download(
+        id="1sqW1y2k346mtLCQnO0NAab3sEzxUyQ_d",
+        output=out_name,
+    )
+    shutil.unpack_archive(out_name, nuscenes_dir)
 
 
 def load_dataset(split: str) -> dict:

From 195d21c2d1443d57306fb87a5bb34f60f5e79d80 Mon Sep 17 00:00:00 2001
From: Caspar Siemssen <c.siemssen@campus.tu-berlin.de>
Date: Thu, 7 Aug 2025 18:24:00 +0200
Subject: [PATCH 16/25] add first gradio version

---
 gradio_app.py    | 698 +++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |   1 +
 2 files changed, 699 insertions(+)
 create mode 100644 gradio_app.py

diff --git a/gradio_app.py b/gradio_app.py
new file mode 100644
index 0000000..7a28bb7
--- /dev/null
+++ b/gradio_app.py
@@ -0,0 +1,698 @@
+import copy
+import os
+
+import gradio as gr
+
+from src.constants import bev_dir, drivelm_dir, grid_dir
+from src.data.basic_dataset import DriveLMImageDataset
+from src.data.load_dataset import load_dataset
+from src.models.gemma_inference import GemmaInferenceEngine
+from src.models.intern_vl_inference import InternVLInferenceEngine
+from src.models.qwen_vl_inference import QwenVLInferenceEngine
+
+PROVIDERS = {
+    "Google": {
+        "Gemma-3-4B-IT": {
+            "engine_class": GemmaInferenceEngine,
+            "kwargs": {"model_path": "google/gemma-3-4b-it"},
+        },
+    },
+    "Qwen": {
+        "Qwen2.5-VL-7B-Instruct": {
+            "engine_class": QwenVLInferenceEngine,
+            "kwargs": {"processor_path": "Qwen/Qwen2.5-VL-7B-Instruct"},
+        },
+        "Qwen2.5-VL-3B-Instruct": {
+            "engine_class": QwenVLInferenceEngine,
+            "kwargs": {"processor_path": "Qwen/Qwen2.5-VL-3B-Instruct"},
+        },
+    },
+    "InternVL": {
+        "OpenGVLab/InternVL3-2B": {
+            "engine_class": InternVLInferenceEngine,
+            "kwargs": {"model_path": "OpenGVLab/InternVL3-2B"},
+        }
+    },
+    # Add more providers/models here as needed
+}
+
+raw_dataset = None
+inference_engine = None
+dataset = None
+image_paths_list = []
+selected_question_item = None
+kois_active = True
+
+
+def get_engine(provider, model):
+    global inference_engine
+    engine_info = PROVIDERS[provider][model]
+    engine_class = engine_info["engine_class"]
+    inference_engine = engine_class(**engine_info.get("kwargs", {}))
+    inference_engine.load_model()
+    return inference_engine.model_path
+
+
+def get_dataset(split, add_kois, add_bev, use_grid, use_system_prompt):
+    global dataset, selected_question_item, raw_dataset
+
+    if inference_engine is None:
+        raise gr.Error("Please load model first", duration=2)
+
+    dataset = DriveLMImageDataset(
+        message_format=inference_engine.message_formatter,
+        split=split,
+        front_cam=True,
+        add_kois=add_kois,
+        add_bev=add_bev,
+        use_grid=use_grid,
+        use_system_prompt=use_system_prompt,
+    )
+    raw_dataset = load_dataset(split)
+
+    gr.Info(
+        f"Dataset {split} loaded successfully with length of {str(len(dataset))}",
+        duration=2,
+    )
+
+    return split
+
+
+def get_models(provider):
+    models = list(PROVIDERS[provider].keys())
+    return gr.update(choices=models, value=models[0] if models else None)
+
+
+def parse_keyframe_id(id: str):
+    return id.split("_")[1]
+
+
+def parse_scene_id(id: str):
+    return id.split("_")[0]
+
+
+def parse_question_id(id: str):
+    return id.split("_")[2]
+
+
+def filter_question_items(
+    items, scene_id=None, keyframe_id=None, question_type=None, question_id=None
+):
+    return [
+        item
+        for item in items
+        if (scene_id is None or parse_scene_id(item.qa_id) == scene_id)
+        and (keyframe_id is None or parse_keyframe_id(item.qa_id) == keyframe_id)
+        and (question_type is None or item.qa_type == question_type)
+        and (question_id is None or parse_question_id(item.qa_id) == question_id)
+    ]
+
+
+def get_scene_id():
+    global selected_question_item
+    if selected_question_item is not None:
+        return parse_scene_id(selected_question_item.qa_id)
+    return None
+
+
+def get_scenes(items):
+    return sorted({parse_scene_id(item.qa_id) for item in items})
+
+
+def render_scenes(items):
+    scene_ids = get_scenes(items)
+    return gr.update(
+        choices=scene_ids,
+        value=get_scene_id(),
+    )
+
+
+def get_keyframe_id():
+    global selected_question_item
+    if selected_question_item is not None:
+        return parse_keyframe_id(selected_question_item.qa_id)
+    return None
+
+
+def get_keyframes(items):
+    return sorted({parse_keyframe_id(item.qa_id) for item in items})
+
+
+def render_keyframes(items):
+    keyframe_ids = get_keyframes(items)
+    return gr.update(
+        choices=keyframe_ids,
+        value=get_keyframe_id(),
+    )
+
+
+def get_question_types(items):
+    return sorted({item.qa_type for item in items})
+
+
+def render_question_types(items):
+    question_types = get_question_types(items)
+    return gr.update(
+        choices=question_types,
+        value=get_question_type(),
+    )
+
+
+def get_question_type():
+    global selected_question_item
+    if selected_question_item is not None:
+        return selected_question_item.qa_type
+    return None
+
+
+def get_question_id():
+    global selected_question_item
+    if selected_question_item is not None:
+        return parse_question_id(selected_question_item.qa_id)
+    return None
+
+
+def get_question_ids(items):
+    return sorted({parse_question_id(item.qa_id) for item in items})
+
+
+def render_question_ids(items):
+    question_ids = get_question_ids(items)
+    return gr.update(
+        choices=question_ids,
+        value=get_question_id(),
+    )
+
+
+def get_images(items):
+    global image_paths_list
+    scene_id = get_scene_id()
+    keyframe_id = get_keyframe_id()
+
+    if raw_dataset is None:
+        raise_dataset_error()
+        return []
+
+    scene_data = raw_dataset[scene_id]
+    keyframe_data = scene_data["key_frames"][keyframe_id]
+
+    all_native_image_paths = set(keyframe_data["image_paths"].values())
+
+    other_image_files = [
+        os.path.join(bev_dir, f"{scene_id}_{keyframe_id}__BEV_FRONT_CAM.jpg"),
+        os.path.join(bev_dir, f"{scene_id}_{keyframe_id}__BEV.jpg"),
+        os.path.join(grid_dir, f"{scene_id}_{keyframe_id}__GRID.jpg"),
+    ]
+
+    for file in other_image_files:
+        if os.path.isfile(file):
+            all_native_image_paths.add(file)
+
+    image_paths_list = [
+        os.path.join(drivelm_dir, path) for path in all_native_image_paths
+    ]
+    return image_paths_list
+
+
+def get_image():
+    global selected_question_item
+    return (
+        selected_question_item.image_path
+        if selected_question_item is not None
+        else None
+    )
+
+
+def update_image(evt: gr.SelectData):
+    global image_paths_list, selected_question_item
+    if 0 <= evt.index < len(image_paths_list):
+        selected_question_item.image_path = image_paths_list[evt.index]
+        return [selected_question_item.image_path, get_formatted_question()]
+    return [None, get_formatted_question()]
+
+
+def get_question():
+    global selected_question_item
+    return (
+        selected_question_item.question if selected_question_item is not None else None
+    )
+
+
+def update_question(text):
+    global selected_question_item
+    if selected_question_item is None:
+        raise_dataset_error()
+        return None
+    selected_question_item.question = text
+    return [
+        selected_question_item.question,
+        get_formatted_question(),
+        get_ground_truth(invalid=True),
+    ]
+
+
+def get_system_prompt():
+    global selected_question_item
+    return (
+        selected_question_item.system_prompt
+        if selected_question_item is not None
+        else None
+    )
+
+
+def update_system_prompt(text):
+    global selected_question_item
+    if selected_question_item is None:
+        raise_dataset_error()
+        return None
+    selected_question_item.system_prompt = text
+    return [selected_question_item.system_prompt, get_formatted_question()]
+
+
+def get_ground_truth(invalid=False):
+    global selected_question_item
+
+    if invalid:
+        return gr.update(value=None, visible=False)
+
+    if selected_question_item is not None:
+        if selected_question_item.ground_truth_answer is None:
+            return gr.update(value=None, visible=False)
+
+        return gr.update(value=selected_question_item.ground_truth_answer, visible=True)
+    return gr.update(value=None, visible=False)
+
+
+def get_kois():
+    global selected_question_item
+    if selected_question_item is not None:
+        return [
+            selected_question_item.key_object_info,
+            get_formatted_question(),
+        ]
+    return [None, get_formatted_question()]
+
+
+def update_kois_active(active):
+    global kois_active
+    kois_active = active
+    return [active, get_formatted_question()]
+
+
+def update_question_item():
+    global dataset, selected_question_item
+    if dataset is None or len(dataset) == 0:
+        raise_dataset_error()
+        return None
+
+    selected_question_item = copy.deepcopy(dataset[0])
+    scene_items = filter_question_items(dataset, scene_id=get_scene_id())
+    return [
+        render_scenes(dataset),
+        *render_question_item_change_on_scene_id(scene_items),
+    ]
+
+
+def update_question_item_on_scene_id(scene_id):
+    global dataset, selected_question_item
+    if dataset is None or len(dataset) == 0:
+        raise_dataset_error()
+        return None
+    for item in dataset:
+        if parse_scene_id(item.qa_id) == scene_id:
+            selected_question_item = copy.deepcopy(item)
+            break
+
+    scene_items = filter_question_items(items=dataset, scene_id=scene_id)
+    return [*render_question_item_change_on_scene_id(scene_items)]
+
+
+def update_question_item_on_keyframe_id(scene_id, keyframe_id):
+    global dataset, selected_question_item
+    keyframe_items = filter_question_items(
+        items=dataset, scene_id=scene_id, keyframe_id=keyframe_id
+    )
+    for item in keyframe_items:
+        selected_question_item = copy.deepcopy(item)
+        break
+
+    return [*render_question_item_change_on_keyframe(keyframe_items)]
+
+
+def update_question_item_on_question_type(scene_id, keyframe_id, question_type):
+    global dataset, selected_question_item
+
+    question_type_items = filter_question_items(
+        items=dataset,
+        scene_id=scene_id,
+        keyframe_id=keyframe_id,
+        question_type=question_type,
+    )
+
+    for item in question_type_items:
+        selected_question_item = copy.deepcopy(item)
+        break
+
+    return [
+        *render_question_item_change_on_question_type(question_type_items),
+    ]
+
+
+def update_question_item_on_question_id(
+    scene_id, keyframe_id, question_type, question_id
+):
+    global dataset, selected_question_item
+
+    question_id_items = filter_question_items(
+        items=dataset,
+        scene_id=scene_id,
+        keyframe_id=keyframe_id,
+        question_type=question_type,
+        question_id=question_id,
+    )
+
+    for item in question_id_items:
+        if parse_question_id(item.qa_id) == question_id:
+            selected_question_item = copy.deepcopy(item)
+            break
+
+    return [get_question(), get_system_prompt(), get_formatted_question()]
+
+
+def render_question_item_change_on_scene_id(items):
+    keyframe_items = filter_question_items(items, keyframe_id=get_keyframe_id())
+    return [
+        render_keyframes(items),
+        *render_question_item_change_on_keyframe(keyframe_items),
+    ]
+
+
+def render_question_item_change_on_keyframe(items):
+    question_type_items = filter_question_items(
+        items, question_type=get_question_type()
+    )
+    return [
+        render_question_types(items),
+        *render_question_item_change_on_question_type(question_type_items),
+    ]
+
+
+def render_question_item_change_on_question_type(items):
+    return [
+        render_question_ids(items),
+        *render_question_item_change_on_question_id(),
+        get_kois()[0],
+        get_images(items),
+        get_image(),
+    ]
+
+
+def render_question_item_change_on_question_id():
+    return [
+        get_question(),
+        get_system_prompt(),
+        get_formatted_question(),
+        get_ground_truth(),
+    ]
+
+
+def get_formatted_question():
+    global inference_engine, selected_question_item, kois_active
+
+    if selected_question_item is None:
+        raise_dataset_error()
+
+    formatted_question = selected_question_item
+    if not kois_active:
+        formatted_question = copy.deepcopy(selected_question_item)
+        formatted_question.key_object_info = None
+
+    formatted_question.formatted_message = formatted_question.format_message(
+        inference_engine.message_formatter
+    )
+
+    return formatted_question.formatted_message
+
+
+def predict_question(formatted_question):
+    global inference_engine
+
+    responses = inference_engine.predict_batch([[formatted_question]])
+    return responses[0]
+
+
+def raise_dataset_error():
+    raise gr.Error("Please load a dataset first.", duration=2)
+
+
+with gr.Blocks() as demo:
+    # UI COMPONENTS
+    with gr.Row():
+        with gr.Column(scale=0):
+            loaded_model_textbox = gr.Textbox(
+                label="Model",
+                interactive=False,
+                value="None",
+            )
+        with gr.Column(scale=0):
+            loaded_dataset_textbox = gr.Textbox(
+                label="Dataset",
+                interactive=False,
+                value="None",
+            )
+    with gr.Tab("Settings"):
+        with gr.Row():
+            with gr.Column():
+                with gr.Row():
+                    default_provider = "Qwen"
+                    default_model = "Qwen2.5-VL-7B-Instruct"
+                    provider_dropdown = gr.Dropdown(
+                        choices=list(PROVIDERS.keys()),
+                        label="Provider",
+                        value=default_provider,
+                    )
+                    model_dropdown = gr.Dropdown(
+                        choices=list(PROVIDERS[default_provider].keys()),
+                        label="Model",
+                        value=default_model,
+                    )
+
+                with gr.Row():
+                    load_model_button = gr.Button("Load Model", scale=0)
+
+            with gr.Column():
+                split = gr.Radio(
+                    choices=["val", "test", "train"],
+                    label="Split",
+                    value="val",
+                )
+                add_kois = gr.Checkbox(label="Yolo KOIs", value=False)
+                add_bev = gr.Checkbox(label="BEVs", value=False)
+                use_grid = gr.Checkbox(label="Grid", value=True)
+                use_system_prompt = gr.Checkbox(label="System Prompts", value=True)
+
+                get_dataset_button = gr.Button("Load Dataset", scale=0)
+
+    with gr.Tab("Chat"):
+        with gr.Accordion("Question Picker", open=False):
+            with gr.Row():
+                scene_id_dropdown = gr.Dropdown(
+                    label="Scene",
+                    choices=[],
+                    value=None,
+                )
+                keyframe_id_dropdown = gr.Dropdown(
+                    label="Keyframe",
+                    choices=[],
+                    value=None,
+                )
+                question_type_radio = gr.Radio(
+                    label="Question Type",
+                    choices=[],
+                    value=None,
+                )
+                question_ids_dropdown = gr.Dropdown(
+                    label="Question",
+                    choices=[],
+                    value=None,
+                )
+        with gr.Row():
+            with gr.Column():
+                with gr.Accordion("Image Gallery", open=False):
+                    image_gallery = gr.Gallery(
+                        value=[],
+                        object_fit="contain",
+                        columns=3,
+                        height="auto",
+                    )
+            with gr.Column():
+                image = gr.Image(label="Selected Image", interactive=False)
+
+        with gr.Accordion("Key Object Infos", open=False):
+            kois_checkbox = gr.Checkbox(
+                label="Pass Key Object Infos",
+                value=kois_active,
+            )
+
+            kois_json = gr.JSON(label="Key Object Infos", value=None)
+
+        system_prompt_textbox = gr.Textbox(
+            label="System Prompt",
+            value=None,
+            interactive=True,
+        )
+
+        question_textbox = gr.Textbox(
+            label="Question",
+            value=None,
+            interactive=True,
+        )
+        with gr.Accordion(label="Formatted Message", open=False):
+            format_message_json = gr.JSON(value=None)
+
+        response_textbox = gr.Textbox(label="Answer", value=None, interactive=False)
+
+        ground_truth_textbox = gr.Textbox(
+            label="Ground Truth Answer",
+            value=None,
+            interactive=False,
+            visible=False,
+        )
+
+        send_button = gr.Button("Send")
+
+    # EVENT HANDLERS
+    get_dataset_button.click(
+        fn=get_dataset,
+        inputs=[
+            split,
+            add_kois,
+            add_bev,
+            use_grid,
+            use_system_prompt,
+        ],
+        outputs=loaded_dataset_textbox,
+    ).then(
+        fn=update_question_item,
+        inputs=None,
+        outputs=[
+            scene_id_dropdown,
+            keyframe_id_dropdown,
+            question_type_radio,
+            question_ids_dropdown,
+            question_textbox,
+            system_prompt_textbox,
+            format_message_json,
+            ground_truth_textbox,
+            kois_json,
+            image_gallery,
+            image,
+        ],
+    )
+
+    load_model_button.click(
+        fn=get_engine,
+        inputs=[provider_dropdown, model_dropdown],
+        outputs=loaded_model_textbox,
+    )
+
+    provider_dropdown.change(
+        fn=get_models, inputs=provider_dropdown, outputs=model_dropdown
+    )
+
+    scene_id_dropdown.input(
+        fn=update_question_item_on_scene_id,
+        inputs=scene_id_dropdown,
+        outputs=[
+            keyframe_id_dropdown,
+            question_type_radio,
+            question_ids_dropdown,
+            question_textbox,
+            system_prompt_textbox,
+            format_message_json,
+            ground_truth_textbox,
+            kois_json,
+            image_gallery,
+            image,
+        ],
+    )
+    keyframe_id_dropdown.input(
+        fn=update_question_item_on_keyframe_id,
+        inputs=[scene_id_dropdown, keyframe_id_dropdown],
+        outputs=[
+            question_type_radio,
+            question_ids_dropdown,
+            question_textbox,
+            system_prompt_textbox,
+            format_message_json,
+            ground_truth_textbox,
+            kois_json,
+            image_gallery,
+            image,
+        ],
+    )
+
+    question_type_radio.input(
+        fn=update_question_item_on_question_type,
+        inputs=[scene_id_dropdown, keyframe_id_dropdown, question_type_radio],
+        outputs=[
+            question_ids_dropdown,
+            question_textbox,
+            system_prompt_textbox,
+            format_message_json,
+            ground_truth_textbox,
+            kois_json,
+            image_gallery,
+            image,
+        ],
+    )
+
+    question_ids_dropdown.input(
+        fn=update_question_item_on_question_id,
+        inputs=[
+            scene_id_dropdown,
+            keyframe_id_dropdown,
+            question_type_radio,
+            question_ids_dropdown,
+        ],
+        outputs=[
+            question_textbox,
+            system_prompt_textbox,
+            format_message_json,
+            ground_truth_textbox,
+        ],
+    )
+
+    image_gallery.select(
+        fn=update_image,
+        inputs=None,
+        outputs=[image, format_message_json],
+    )
+
+    question_textbox.submit(
+        fn=update_question,
+        inputs=question_textbox,
+        outputs=[question_textbox, format_message_json, ground_truth_textbox],
+    )
+
+    system_prompt_textbox.submit(
+        fn=update_system_prompt,
+        inputs=system_prompt_textbox,
+        outputs=[system_prompt_textbox, format_message_json],
+    )
+
+    kois_checkbox.change(
+        fn=update_kois_active,
+        inputs=kois_checkbox,
+        outputs=[kois_checkbox, format_message_json],
+    )
+
+    send_button.click(
+        fn=predict_question,
+        inputs=format_message_json,
+        outputs=response_textbox,
+    )
+
+
+if __name__ == "__main__":
+    demo.launch()
diff --git a/requirements.txt b/requirements.txt
index 200f3dc..1dac300 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,3 +15,4 @@ peft~=0.15.2
 trl~=0.18.1
 polars==1.31.0
 ultralytics==8.3.168
+gradio~=5.41.0

From 3583e3ef5663691c3639427536cf27aa6cc94411 Mon Sep 17 00:00:00 2001
From: Caspar Siemssen <c.siemssen@campus.tu-berlin.de>
Date: Thu, 7 Aug 2025 22:18:23 +0200
Subject: [PATCH 17/25] add gradio notebook

---
 gradio_app.py          | 18 +++++++++--------
 notebooks/gradio.ipynb | 44 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 8 deletions(-)
 create mode 100644 notebooks/gradio.ipynb

diff --git a/gradio_app.py b/gradio_app.py
index 7a28bb7..033ecb8 100644
--- a/gradio_app.py
+++ b/gradio_app.py
@@ -69,12 +69,6 @@ def get_dataset(split, add_kois, add_bev, use_grid, use_system_prompt):
         use_system_prompt=use_system_prompt,
     )
     raw_dataset = load_dataset(split)
-
-    gr.Info(
-        f"Dataset {split} loaded successfully with length of {str(len(dataset))}",
-        duration=2,
-    )
-
     return split
 
 
@@ -276,7 +270,10 @@ def get_ground_truth(invalid=False):
         return gr.update(value=None, visible=False)
 
     if selected_question_item is not None:
-        if selected_question_item.ground_truth_answer is None:
+        if (
+            selected_question_item.ground_truth_answer is None
+            or selected_question_item.ground_truth_answer == ""
+        ):
             return gr.update(value=None, visible=False)
 
         return gr.update(value=selected_question_item.ground_truth_answer, visible=True)
@@ -376,7 +373,12 @@ def update_question_item_on_question_id(
             selected_question_item = copy.deepcopy(item)
             break
 
-    return [get_question(), get_system_prompt(), get_formatted_question()]
+    return [
+        get_question(),
+        get_system_prompt(),
+        get_formatted_question(),
+        get_ground_truth(),
+    ]
 
 
 def render_question_item_change_on_scene_id(items):
diff --git a/notebooks/gradio.ipynb b/notebooks/gradio.ipynb
new file mode 100644
index 0000000..e99ffd5
--- /dev/null
+++ b/notebooks/gradio.ipynb
@@ -0,0 +1,44 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "36ae6fa4",
+   "metadata": {},
+   "source": [
+    "## Notebook to run gradio ui"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6141baff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# NOTE: Set the branch the eval should run on here\n",
+    "!git clone -b <branch> https://github.com/csiemssen/APP-RAS-Driving-with-Language\n",
+    "!rsync -a APP-RAS-Driving-with-Language/* .\n",
+    "!rm -rf APP-RAS-Driving-with-Language"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "511f2b42",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from gradio_app import demo\n",
+    "\n",
+    "demo.launch()"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From a8f8d68552d94de78f26d553ef3cd5df41c029f2 Mon Sep 17 00:00:00 2001
From: Caspar Siemssen <c.siemssen@campus.tu-berlin.de>
Date: Thu, 7 Aug 2025 22:25:56 +0200
Subject: [PATCH 18/25] fix missing requirements install in notebook

---
 notebooks/gradio.ipynb | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/notebooks/gradio.ipynb b/notebooks/gradio.ipynb
index e99ffd5..7b874ed 100644
--- a/notebooks/gradio.ipynb
+++ b/notebooks/gradio.ipynb
@@ -21,6 +21,25 @@
     "!rm -rf APP-RAS-Driving-with-Language"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0a204899",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install -r requirements.txt\n",
+    "%pip install flash-attn==2.8.0.post2 --no-build-isolation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "55797d47",
+   "metadata": {},
+   "source": [
+    "## Start gradio"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,

From a1dc4c0d9bd56ca7f0d4bf439bf94de08c9a36b8 Mon Sep 17 00:00:00 2001
From: Caspar Siemssen <c.siemssen@campus.tu-berlin.de>
Date: Fri, 8 Aug 2025 14:02:30 +0200
Subject: [PATCH 19/25] fix wrong denormalisation of grid

---
 src/data/basic_dataset.py   |  23 +--
 src/data/generate_bev.py    | 314 +++++++++++++++++++++++++-----------
 src/data/load_dataset.py    |   5 +-
 src/eval/eval_models.py     |  10 +-
 src/utils/utils.py          | 286 +++++++++++++++++++++-----------
 tests/test_basic_dataset.py |   6 +-
 tests/test_utils.py         | 121 +++++++++++++-
 7 files changed, 553 insertions(+), 212 deletions(-)

diff --git a/src/data/basic_dataset.py b/src/data/basic_dataset.py
index 24d0559..81087fd 100644
--- a/src/data/basic_dataset.py
+++ b/src/data/basic_dataset.py
@@ -10,17 +10,17 @@
     generate_descriptor_qas,
 )
 from src.data.generate_reasoning_context import generate_reasoning_context
+from src.data.generate_yolo_kois import generate_yolo_kois
 from src.data.get_sensor_calibration import get_calibration
 from src.data.load_dataset import load_dataset
 from src.data.message_formats import MessageFormat
 from src.data.query_item import QueryItem
 from src.data.system_prompts import SystemPromptProvider
-from src.data.generate_yolo_kois import generate_yolo_kois
 from src.utils.logger import get_logger
 from src.utils.utils import (
+    normalize_key_object_infos,
+    normalize_key_objects_in_text,
     remove_nones,
-    normalise_key_object_infos,
-    normalise_key_objects_in_text,
 )
 
 logger = get_logger(__name__)
@@ -75,9 +75,6 @@ def __init__(
 
         data = load_dataset(split)
 
-        if split == "train":
-            data = normalise_key_object_infos(data, resize_factor, use_grid)
-
         if split == "train" and add_augmented:
             data = generate_descriptor_qas(data)
 
@@ -86,11 +83,19 @@ def __init__(
             if add_bev:
                 data = get_calibration(data)
                 data = generate_bevs(data, front_cam=front_cam)
-            data = normalise_key_object_infos(data, resize_factor, use_grid)
 
         if use_grid:
             data = create_image_grid_dataset(data)
 
+        for scene_data in data.values():
+            for key_frame_data in scene_data["key_frames"].values():
+                infos = key_frame_data.get("key_object_infos")
+
+                if infos:
+                    key_frame_data["key_object_infos"] = normalize_key_object_infos(
+                        infos, resize_factor, use_grid
+                    )
+
         removed = 0
         qa_list = []
         for scene_id in data.keys():
@@ -125,7 +130,7 @@ def __init__(
                 )
 
                 camera_calibration = None
-                if split=="val" and add_kois and add_bev:
+                if split == "val" and add_kois and add_bev:
                     camera_calibration = scene_obj[key_frame_id]["camera_calibration"]
 
                 qas = scene_obj[key_frame_id]["QA"]
@@ -152,7 +157,7 @@ def __init__(
                     + qas_behavior
                     + qas_augmented
                 ):
-                    qa["Q"] = normalise_key_objects_in_text(
+                    qa["Q"] = normalize_key_objects_in_text(
                         qa["Q"],
                         resize_factor=resize_factor,
                         use_grid=use_grid,
diff --git a/src/data/generate_bev.py b/src/data/generate_bev.py
index 98506d8..7f625e7 100644
--- a/src/data/generate_bev.py
+++ b/src/data/generate_bev.py
@@ -1,21 +1,21 @@
-import cv2
 import os
+
+import cv2
 import numpy as np
 from scipy.spatial.transform import Rotation as R_scipy
 from tqdm import tqdm
 
 from src.constants import bev_dir, drivelm_dir
 from src.data.get_sensor_calibration import CameraCalibration
-from src.utils.utils import get_logger, key_object_str_to_dict
-
+from src.utils.utils import get_logger, key_object_key_to_dict
 
 logger = get_logger(__name__)
 
 
 def generate_bev_from_detections(
-        calibration: dict[str, CameraCalibration], 
-        kois: dict,
-    ) -> np.ndarray:
+    calibration: dict[str, CameraCalibration],
+    kois: dict,
+) -> np.ndarray:
     """
     Generates a Bird's-Eye View (BEV) map from detected objects for a keyframe,
     using nuScenes camera calibration information.
@@ -47,21 +47,34 @@ def generate_bev_from_detections(
 
     total_items = 0
     for camera_name, cam_calib in calibration.items():
-        current_keys = [koi_key for koi_key in kois.keys() if camera_name == key_object_str_to_dict(koi_key)["camera"]]
-        current_identifiers = [key_object_str_to_dict(k)["id"] for k in current_keys]
+        current_keys = [
+            koi_key
+            for koi_key in kois.keys()
+            if camera_name == key_object_key_to_dict(koi_key)["camera"]
+        ]
+        current_identifiers = [key_object_key_to_dict(k)["id"] for k in current_keys]
         current_camera_kois = [kois[k] for k in current_keys]
         current_camera_boxes = [koi["2d_bbox"] for koi in current_camera_kois]
         total_items += len(current_camera_boxes)
         current_camera_names = [koi["Category"] for koi in current_camera_kois]
 
         K = np.array(cam_calib.camera_intrinsic, dtype=np.float64)
-        t_camera_to_ego = np.array(cam_calib.translation, dtype=np.float64) # (x, y, z)
-        q_camera_to_ego = np.array(cam_calib.rotation, dtype=np.float64) # (w, x, y, z)
+        t_camera_to_ego = np.array(cam_calib.translation, dtype=np.float64)  # (x, y, z)
+        q_camera_to_ego = np.array(cam_calib.rotation, dtype=np.float64)  # (w, x, y, z)
 
         # 1. Convert quaternion to rotation matrix: R_ego_from_camera
         # nuScenes quaternion is (w, x, y, z) -> scipy Rotation.from_quat expects (x, y, z, w)
-        r_ego_from_camera_scipy = R_scipy.from_quat([q_camera_to_ego[1], q_camera_to_ego[2], q_camera_to_ego[3], q_camera_to_ego[0]])
-        R_ego_from_camera = r_ego_from_camera_scipy.as_matrix() # 3x3 rotation matrix from camera to ego
+        r_ego_from_camera_scipy = R_scipy.from_quat(
+            [
+                q_camera_to_ego[1],
+                q_camera_to_ego[2],
+                q_camera_to_ego[3],
+                q_camera_to_ego[0],
+            ]
+        )
+        R_ego_from_camera = (
+            r_ego_from_camera_scipy.as_matrix()
+        )  # 3x3 rotation matrix from camera to ego
 
         for i in range(len(current_camera_boxes)):
             bbox = current_camera_boxes[i]
@@ -74,7 +87,10 @@ def generate_bev_from_detections(
             # --- Project 2D image point back to 3D on the ground plane (Z=0 in ego frame) ---
 
             # Convert 2D image point to a 3D ray direction in the camera frame (normalized coordinates).
-            uv_hom = np.array([bottom_center_2d[0], bottom_center_2d[1], 1.0], dtype=np.float64).reshape(3, 1)
+            uv_hom = np.array(
+                [bottom_center_2d[0], bottom_center_2d[1], 1.0],
+                dtype=np.float64,
+            ).reshape(3, 1)
             K_inv = np.linalg.inv(K)
             ray_direction_camera_frame = np.dot(K_inv, uv_hom).flatten()
 
@@ -84,7 +100,7 @@ def generate_bev_from_detections(
 
             # Intersect the ray with the ground plane (Z_ego = 0).
             if np.isclose(ray_direction_ego[2], 0.0):
-                continue # Ray is parallel or near-parallel to ground plane
+                continue  # Ray is parallel or near-parallel to ground plane
 
             lam = -ray_origin_ego[2] / ray_direction_ego[2]
 
@@ -96,13 +112,13 @@ def generate_bev_from_detections(
 
             # Store the projected object's information
             projected_object_info = {
-                'class': obj_name,
-                'x_ego': -point_3d_ego[1],
-                'y_ego': point_3d_ego[0],
-                'z_ego': point_3d_ego[2], # Should be close to 0
-                'camera_name': camera_name,
-                'original_bbox': bbox,
-                'identifier': current_identifiers[i],
+                "class": obj_name,
+                "x_ego": -point_3d_ego[1],
+                "y_ego": point_3d_ego[0],
+                "z_ego": point_3d_ego[2],  # Should be close to 0
+                "camera_name": camera_name,
+                "original_bbox": bbox,
+                "identifier": current_identifiers[i],
             }
             all_projected_objects.append(projected_object_info)
 
@@ -111,103 +127,217 @@ def generate_bev_from_detections(
     unique_objects = []
     proximity_threshold = 10.0
     duplicates_removed = 0
-    
+
     for obj in all_projected_objects:
         is_duplicate = False
         for unique_obj in unique_objects:
             # Check if objects are of same class and spatially close
-            if (obj['class'] == unique_obj['class'] and
+            if (
+                obj["class"] == unique_obj["class"]
+                and
                 # TODO: Tune the prox threshold
-                np.sqrt((obj['x_ego'] - unique_obj['x_ego'])**2 + 
-                       (obj['y_ego'] - unique_obj['y_ego'])**2) < proximity_threshold):
-                
+                np.sqrt(
+                    (obj["x_ego"] - unique_obj["x_ego"]) ** 2
+                    + (obj["y_ego"] - unique_obj["y_ego"]) ** 2
+                )
+                < proximity_threshold
+            ):
                 # Keep the object from the camera that provides better view
                 # Prefer front cameras for forward objects, side cameras for side objects, etc.
-                current_distance = np.sqrt(obj['x_ego']**2 + obj['y_ego']**2)
-                unique_distance = np.sqrt(unique_obj['x_ego']**2 + unique_obj['y_ego']**2)
-                
+                current_distance = np.sqrt(obj["x_ego"] ** 2 + obj["y_ego"] ** 2)
+                unique_distance = np.sqrt(
+                    unique_obj["x_ego"] ** 2 + unique_obj["y_ego"] ** 2
+                )
+
                 # Replace if current object is closer or from a more appropriate camera
-                if (current_distance < unique_distance or 
-                    _is_better_camera_view(obj, unique_obj)):
+                if current_distance < unique_distance or _is_better_camera_view(
+                    obj, unique_obj
+                ):
                     unique_objects.remove(unique_obj)
                     unique_objects.append(obj)
                 else:
                     duplicates_removed += 1
                 is_duplicate = True
                 break
-        
+
         if not is_duplicate:
             unique_objects.append(obj)
 
     # --- Render Projected Objects onto the BEV Map ---
     for obj_info in unique_objects:
-        x_ego = obj_info['x_ego']
-        y_ego = obj_info['y_ego']
-        obj_class = obj_info['class']
-        identifier = obj_info['identifier']
+        x_ego = obj_info["x_ego"]
+        y_ego = obj_info["y_ego"]
+        obj_class = obj_info["class"]
+        identifier = obj_info["identifier"]
 
         col_bev = int((x_ego - x_min_m) / bev_map_res_m_per_pixel)
-        row_bev = int(bev_map_height_pixels - 1 - ((y_ego - y_min_m) / bev_map_res_m_per_pixel))
+        row_bev = int(
+            bev_map_height_pixels - 1 - ((y_ego - y_min_m) / bev_map_res_m_per_pixel)
+        )
 
         if 0 <= col_bev < bev_map_width_pixels and 0 <= row_bev < bev_map_height_pixels:
-            if 'car' in obj_class.lower() or 'truck' in obj_class.lower() or 'bus' in obj_class.lower() or \
-               'bicycle' in obj_class.lower() or 'motorcycle' in obj_class.lower():
+            if (
+                "car" in obj_class.lower()
+                or "truck" in obj_class.lower()
+                or "bus" in obj_class.lower()
+                or "bicycle" in obj_class.lower()
+                or "motorcycle" in obj_class.lower()
+            ):
                 car_width_bev = int(1.5 / bev_map_res_m_per_pixel)
                 car_length_bev = int(3.0 / bev_map_res_m_per_pixel)
-                color = (0, 255, 255) # Yellow (BGR)
-
-                if (0 <= (col_bev - car_width_bev) and (col_bev + car_width_bev) < bev_map_width_pixels
-                    and 0 <= (row_bev - car_length_bev) and (row_bev + car_length_bev) < bev_map_height_pixels):
-                    cv2.rectangle(bev_map,
-                                (col_bev - car_width_bev // 2, row_bev - car_length_bev // 2),
-                                (col_bev + car_width_bev // 2, row_bev + car_length_bev // 2),
-                                color, -1)
-                    cv2.putText(bev_map, identifier, (col_bev - car_width_bev // 2, row_bev - car_length_bev // 2 - 5),
-                                cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
-
-            elif 'person' in obj_class.lower():
+                color = (0, 255, 255)  # Yellow (BGR)
+
+                if (
+                    0 <= (col_bev - car_width_bev)
+                    and (col_bev + car_width_bev) < bev_map_width_pixels
+                    and 0 <= (row_bev - car_length_bev)
+                    and (row_bev + car_length_bev) < bev_map_height_pixels
+                ):
+                    cv2.rectangle(
+                        bev_map,
+                        (
+                            col_bev - car_width_bev // 2,
+                            row_bev - car_length_bev // 2,
+                        ),
+                        (
+                            col_bev + car_width_bev // 2,
+                            row_bev + car_length_bev // 2,
+                        ),
+                        color,
+                        -1,
+                    )
+                    cv2.putText(
+                        bev_map,
+                        identifier,
+                        (
+                            col_bev - car_width_bev // 2,
+                            row_bev - car_length_bev // 2 - 5,
+                        ),
+                        cv2.FONT_HERSHEY_SIMPLEX,
+                        0.4,
+                        (255, 255, 255),
+                        1,
+                    )
+
+            elif "person" in obj_class.lower():
                 ped_width_bev = int(1.0 / bev_map_res_m_per_pixel)
                 ped_length_bev = int(1.0 / bev_map_res_m_per_pixel)
-                color = (255, 0, 0) # Blue (BGR)
-
-                if (0 <= (col_bev - ped_width_bev) and (col_bev + ped_width_bev) < bev_map_width_pixels
-                    and 0 <= (row_bev - ped_length_bev) and (row_bev + ped_length_bev) < bev_map_height_pixels):
-                    cv2.rectangle(bev_map,
-                                    (col_bev - ped_width_bev // 2, row_bev - ped_length_bev // 2),
-                                    (col_bev + ped_width_bev // 2, row_bev + ped_length_bev // 2),
-                                    color, -1)
-                    cv2.putText(bev_map, identifier, (col_bev - ped_width_bev // 2, row_bev - ped_length_bev // 2 - 5),
-                                cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
+                color = (255, 0, 0)  # Blue (BGR)
+
+                if (
+                    0 <= (col_bev - ped_width_bev)
+                    and (col_bev + ped_width_bev) < bev_map_width_pixels
+                    and 0 <= (row_bev - ped_length_bev)
+                    and (row_bev + ped_length_bev) < bev_map_height_pixels
+                ):
+                    cv2.rectangle(
+                        bev_map,
+                        (
+                            col_bev - ped_width_bev // 2,
+                            row_bev - ped_length_bev // 2,
+                        ),
+                        (
+                            col_bev + ped_width_bev // 2,
+                            row_bev + ped_length_bev // 2,
+                        ),
+                        color,
+                        -1,
+                    )
+                    cv2.putText(
+                        bev_map,
+                        identifier,
+                        (
+                            col_bev - ped_width_bev // 2,
+                            row_bev - ped_length_bev // 2 - 5,
+                        ),
+                        cv2.FONT_HERSHEY_SIMPLEX,
+                        0.4,
+                        (255, 255, 255),
+                        1,
+                    )
 
     # --- Draw Ego Vehicle ---
-    ego_x_m = 0.0 # Ego vehicle is at (0,0) in its own frame
+    ego_x_m = 0.0  # Ego vehicle is at (0,0) in its own frame
     ego_y_m = 0.0
 
     # Convert ego (0,0) to BEV map pixel coordinates
     ego_col_bev = int((ego_x_m - x_min_m) / bev_map_res_m_per_pixel)
-    ego_row_bev = int(bev_map_height_pixels - 1 - ((ego_y_m - y_min_m) / bev_map_res_m_per_pixel))
-    
+    ego_row_bev = int(
+        bev_map_height_pixels - 1 - ((ego_y_m - y_min_m) / bev_map_res_m_per_pixel)
+    )
+
     # Ego vehicle dimensions (approximate typical car size)
     ego_width_pixels = int(1.5 / bev_map_res_m_per_pixel)
     ego_length_pixels = int(3.0 / bev_map_res_m_per_pixel)
-    
-    ego_color = (0, 0, 255) # Red (BGR)
-    cv2.rectangle(bev_map,
-                  (ego_col_bev - ego_width_pixels // 2, ego_row_bev - ego_length_pixels // 2),
-                  (ego_col_bev + ego_width_pixels // 2, ego_row_bev + ego_length_pixels // 2),
-                  ego_color, -1)
-    cv2.putText(bev_map, 'Ego', (ego_col_bev - ego_width_pixels // 2, ego_row_bev - ego_length_pixels // 2 - 5),
-                cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
-    
+
+    ego_color = (0, 0, 255)  # Red (BGR)
+    cv2.rectangle(
+        bev_map,
+        (
+            ego_col_bev - ego_width_pixels // 2,
+            ego_row_bev - ego_length_pixels // 2,
+        ),
+        (
+            ego_col_bev + ego_width_pixels // 2,
+            ego_row_bev + ego_length_pixels // 2,
+        ),
+        ego_color,
+        -1,
+    )
+    cv2.putText(
+        bev_map,
+        "Ego",
+        (
+            ego_col_bev - ego_width_pixels // 2,
+            ego_row_bev - ego_length_pixels // 2 - 5,
+        ),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        0.4,
+        (255, 255, 255),
+        1,
+    )
+
     marker_color = (255, 255, 255)  # White
     font = cv2.FONT_HERSHEY_SIMPLEX
     font_scale = 0.6
     thickness = 2
-    
-    cv2.putText(bev_map, 'FRONT', (bev_map_width_pixels//2 - 30, 25), font, font_scale, marker_color, thickness)
-    cv2.putText(bev_map, 'BACK', (bev_map_width_pixels//2 - 25, bev_map_height_pixels - 10), font, font_scale, marker_color, thickness)
-    cv2.putText(bev_map, 'LEFT', (10, bev_map_height_pixels//2), font, font_scale, marker_color, thickness)
-    cv2.putText(bev_map, 'RIGHT', (bev_map_width_pixels - 60, bev_map_height_pixels//2), font, font_scale, marker_color, thickness)
+
+    cv2.putText(
+        bev_map,
+        "FRONT",
+        (bev_map_width_pixels // 2 - 30, 25),
+        font,
+        font_scale,
+        marker_color,
+        thickness,
+    )
+    cv2.putText(
+        bev_map,
+        "BACK",
+        (bev_map_width_pixels // 2 - 25, bev_map_height_pixels - 10),
+        font,
+        font_scale,
+        marker_color,
+        thickness,
+    )
+    cv2.putText(
+        bev_map,
+        "LEFT",
+        (10, bev_map_height_pixels // 2),
+        font,
+        font_scale,
+        marker_color,
+        thickness,
+    )
+    cv2.putText(
+        bev_map,
+        "RIGHT",
+        (bev_map_width_pixels - 60, bev_map_height_pixels // 2),
+        font,
+        font_scale,
+        marker_color,
+        thickness,
+    )
 
     return bev_map
 
@@ -216,33 +346,33 @@ def _is_better_camera_view(obj1, obj2):
     """
     Determine if obj1 has a better camera view than obj2 based on object position and camera type.
     """
-    x1, y1 = obj1['x_ego'], obj1['y_ego']
-    x2, y2 = obj2['x_ego'], obj2['y_ego']
-    cam1 = obj1['camera_name']
-    cam2 = obj2['camera_name']
-    
+    x1, y1 = obj1["x_ego"], obj1["y_ego"]
+    x2, y2 = obj2["x_ego"], obj2["y_ego"]
+    cam1 = obj1["camera_name"]
+    cam2 = obj2["camera_name"]
+
     def get_camera_score(x, y, camera_name):
         score = 0
         # Front cameras are best for forward objects (y > 0)
-        if 'FRONT' in camera_name and y > 0:
+        if "FRONT" in camera_name and y > 0:
             score += 3
         # Back cameras are best for rear objects (y < 0)
-        elif 'BACK' in camera_name and y < 0:
+        elif "BACK" in camera_name and y < 0:
             score += 3
         # Left cameras are best for left objects (x < 0)
-        if 'LEFT' in camera_name and x < 0:
+        if "LEFT" in camera_name and x < 0:
             score += 2
         # Right cameras are best for right objects (x > 0)
-        elif 'RIGHT' in camera_name and x > 0:
+        elif "RIGHT" in camera_name and x > 0:
             score += 2
         # Center cameras (FRONT, BACK) are good for center objects
-        if camera_name in ['CAM_FRONT', 'CAM_BACK'] and abs(x) < 5:
+        if camera_name in ["CAM_FRONT", "CAM_BACK"] and abs(x) < 5:
             score += 1
         return score
-    
+
     score1 = get_camera_score(x1, y1, cam1)
     score2 = get_camera_score(x2, y2, cam2)
-    
+
     return score1 > score2
 
 
@@ -275,7 +405,9 @@ def generate_bevs(data, front_cam: bool = False):
                     target_height = min(front_image.shape[0], bev_img.shape[0])
                     front_aspect = front_image.shape[1] / front_image.shape[0]
                     front_width = int(target_height * front_aspect)
-                    front_resized = cv2.resize(front_image, (front_width, target_height))
+                    front_resized = cv2.resize(
+                        front_image, (front_width, target_height)
+                    )
                     bev_resized = cv2.resize(bev_img, (bev_img.shape[1], target_height))
                     combined_img = np.hstack([front_resized, bev_resized])
                     cv2.imwrite(bev_path, combined_img)
diff --git a/src/data/load_dataset.py b/src/data/load_dataset.py
index 8aa030e..857cc51 100644
--- a/src/data/load_dataset.py
+++ b/src/data/load_dataset.py
@@ -71,13 +71,16 @@ def load_dataset(split: str) -> dict:
         raise ValueError(f"Invalid split: {split}. Must be 'train', 'val' or 'test'.")
 
     base_path = dataset_paths[split]
+
     if not base_path.is_file():
         get_ds(split)
 
+    if not os.path.isfile(os.path.join(nuscenes_dir, "nuscenes_json.zip")):
+        get_nuscenes_ds()
+
     if split == "test":
         logger.debug("Extracting test dataset from train dataset")
         extract_data(drivelm_train_json, drivelm_test_json)
-
         base_path = drivelm_test_json
 
     with open(base_path) as f:
diff --git a/src/eval/eval_models.py b/src/eval/eval_models.py
index 9669b00..6e73d82 100644
--- a/src/eval/eval_models.py
+++ b/src/eval/eval_models.py
@@ -13,7 +13,7 @@
 from src.utils.logger import get_logger
 from src.utils.utils import (
     create_subset,
-    normalise_key_objects_in_text,
+    denormalize_key_objects_in_text,
     sanitize_model_name,
 )
 
@@ -83,15 +83,15 @@ def evaluate_model(
             results.append(
                 {
                     "id": batch[i].qa_id,
-                    "question": normalise_key_objects_in_text(
+                    "question": denormalize_key_objects_in_text(
                         batch[i].question,
-                        resize_factor=1 / resize_factor,
+                        resize_factor=resize_factor,
                         use_grid=use_grid,
                     ),
                     "model_input": batch[i].formatted_message,
-                    "answer": normalise_key_objects_in_text(
+                    "answer": denormalize_key_objects_in_text(
                         text=result,
-                        resize_factor=1 / resize_factor,
+                        resize_factor=resize_factor,
                         use_grid=use_grid,
                     ),
                 }
diff --git a/src/utils/utils.py b/src/utils/utils.py
index 09f4aae..5b14215 100644
--- a/src/utils/utils.py
+++ b/src/utils/utils.py
@@ -1,12 +1,18 @@
 import re
 import shutil
 from pathlib import Path
-from typing import Any, Dict, List, Tuple, Type, TypeVar
+from typing import Any, Callable, Dict, List, Tuple, Type, TypeVar
 
 import torch
 from torch.utils.data import Dataset, Subset
 
-from src.constants import BEV_IMG_SIZE, BEV_AND_FRONT_CAM_IMG_SIZE, GRID_IMG_SIZE, IMAGE_SIZE, GRID_POSITIONS
+from src.constants import (
+    BEV_AND_FRONT_CAM_IMG_SIZE,
+    BEV_IMG_SIZE,
+    GRID_IMG_SIZE,
+    GRID_POSITIONS,
+    IMAGE_SIZE,
+)
 from src.data.query_item import QueryItem
 from src.utils.logger import get_logger
 
@@ -136,7 +142,12 @@ def tuple_mul(t: Tuple[float, float], scalar: float) -> Tuple[float, float]:
     return (t[0] * scalar, t[1] * scalar)
 
 
-def get_resize_image_size(resize_factor: float, grid: bool = False, bev: bool = False, front_cam: bool = False) -> Tuple[int, int]:
+def get_resize_image_size(
+    resize_factor: float,
+    grid: bool = False,
+    bev: bool = False,
+    front_cam: bool = False,
+) -> Tuple[int, int]:
     if grid:
         size = tuple_mul(GRID_IMG_SIZE, resize_factor)
     elif bev and not front_cam:
@@ -149,121 +160,193 @@ def get_resize_image_size(resize_factor: float, grid: bool = False, bev: bool =
 
 
 def find_key_objects(text: str) -> List[str]:
-    pattern = r"<c\d+,CAM_[A-Z_]+,\d+\.?\d*,\d+\.?\d*>"
-    matches = re.findall(pattern, text)
-    return matches
+    return re.findall(r"<c\d+,CAM_[A-Z_]+,\d+\.?\d*,\d+\.?\d*>", text)
 
 
-def key_object_str_to_dict(text: str) -> Dict[str, Any]:
-    pattern = r"<c(\d+),CAM_([A-Z_]+),(\d+\.?\d*),(\d+\.?\d*)>"
-    matches = re.findall(pattern, text)
+def key_object_key_to_dict(key: str) -> Dict[str, Any]:
+    match = re.match(r"<c(\d+),CAM_([A-Z_]+),(\d+\.?\d*),(\d+\.?\d*)>", key)
     return (
         {
-            "id": f"c{matches[0][0]}",
-            "camera": f"CAM_{matches[0][1]}",
-            "x": float(matches[0][2]),
-            "y": float(matches[0][3]),
+            "id": f"c{match.group(1)}",
+            "camera": f"CAM_{match.group(2)}",
+            "x": float(match.group(3)),
+            "y": float(match.group(4)),
         }
-        if matches
+        if match
         else {}
     )
 
 
-def key_object_dict_to_str(key_object: Dict[str, Any]) -> str:
-    return f"<{key_object['id']},{key_object['camera']},{key_object['x']},{key_object['y']}>"
+def key_object_dict_to_key(key_obj: Dict[str, Any]) -> str:
+    return f"<{key_obj['id']},{key_obj['camera']},{key_obj['x']},{key_obj['y']}>"
 
 
-def scale_key_object_point(
-    point: tuple[float, float], resize_factor: float
-) -> tuple[float, float]:
-    return float.__round__(point[0] * resize_factor, 2), float.__round__(point[1] * resize_factor, 2)
+def scale_point(point: Tuple[float, float], factor: float) -> Tuple[float, float]:
+    return round(point[0] * factor, 2), round(point[1] * factor, 2)
 
 
-def normalise_key_object_infos(
-    data,
-    resize_factor: float,
-    use_grid: bool,
-) -> tuple[str, dict[str, Any]]:
-    for _, scene_data in data.items():
-        for _, key_frame_data in scene_data["key_frames"].items():
-            key_object_infos = key_frame_data["key_object_infos"]
-            if not key_object_infos:
-                continue
-            normalised_key_object_infos = {}
-            for key, value in key_object_infos.items():
-                normalised_key = normalise_key_object_descriptor(
-                    key,
-                    resize_factor,
-                    use_grid,
-                )
-                new_value = value.copy()
-                if "2d_bbox" in value:
-                    koi_dict = key_object_str_to_dict(key)
-                    x1, y1, x2, y2 = value["2d_bbox"]
-                    if use_grid:
-                        x1, y1 = map_camera_point_to_grid_point((x1, y1), koi_dict["camera"])
-                        x2, y2 = map_camera_point_to_grid_point((x2, y2), koi_dict["camera"])
-
-                    x1, y1 = scale_key_object_point(
-                        (x1, y1),
-                        resize_factor,
-                    )
-                    x2, y2 = scale_key_object_point(
-                        (x2, y2),
-                        resize_factor,
-                    )
-
-                    new_value["2d_bbox"] = (x1, y1, x2, y2)
-                normalised_key_object_infos[normalised_key] = new_value
-            key_frame_data["key_object_infos"] = normalised_key_object_infos
-
-    return data
-
-
-def normalise_key_object_descriptor(
-    key_object_descriptor: str, resize_factor: float, use_grid: bool
-):
-    koi_dict = key_object_str_to_dict(key_object_descriptor)
-
-    if not koi_dict:
-        logger.warning(
-            f"Key object string '{key_object_descriptor}' could not be parsed."
-        )
-        return key_object_descriptor
-
-    # Map to grid coordinates first as it uses the orginal image size
+def _transform_key_object_infos(
+    key_object_infos: Dict[str, Any],
+    key_transform: Callable[[str], str],
+    bbox_transform: Callable[
+        [Tuple[float, float, float, float]], Tuple[float, float, float, float]
+    ] = None,
+) -> Dict[str, Any]:
+    """Return a new dict with transformed keys and, if given, transformed 2D boxes."""
+    transformed = {}
+    for old_key, info in key_object_infos.items():
+        new_key, info_copy = key_transform(old_key), info.copy()
+        if "2d_bbox" in info and bbox_transform:
+            info_copy["2d_bbox"] = bbox_transform(info["2d_bbox"])
+        transformed[new_key] = info_copy
+    return transformed
+
+
+def _transform_key_objects_in_text(
+    text: str, transform_fn: Callable[[str], str]
+) -> str:
+    """Apply transform_fn to every key object found in text, in a single pass."""
+    # A single re.sub pass never rescans its own output; chained str.replace calls
+    # would transform a key again when its result equals another key in the text.
+    pattern = r"<c\d+,CAM_[A-Z_]+,\d+\.?\d*,\d+\.?\d*>"  # same as find_key_objects
+    return re.sub(pattern, lambda match: transform_fn(match.group(0)), text)
+
+
+def scale_key_object_key(key: str, factor: float) -> str:
+    """Scale the x/y of a key object key by factor; unparsable keys pass through."""
+    if parsed := key_object_key_to_dict(key):
+        parsed["x"], parsed["y"] = scale_point((parsed["x"], parsed["y"]), factor)
+        return key_object_dict_to_key(parsed)
+    logger.warning(f"Key object string '{key}' could not be parsed.")
+    return key
+
+
+def scale_bbox(
+    bbox: Tuple[float, float, float, float], factor: float
+) -> Tuple[float, float, float, float]:
+    """Scale both (x, y) corners of an (x1, y1, x2, y2) box with scale_point."""
+    first_corner = scale_point((bbox[0], bbox[1]), factor)
+    return (*first_corner, *scale_point((bbox[2], bbox[3]), factor))
+
+
+def scale_key_object_infos(
+    key_object_infos: Dict[str, Any], resize_factor: float
+) -> Dict[str, Any]:
+    return _transform_key_object_infos(  # keys and boxes use the same factor
+        key_object_infos,
+        lambda k: scale_key_object_key(k, resize_factor),  # rescales x/y in the key
+        lambda bbox: scale_bbox(bbox, resize_factor),  # rescales both box corners
+    )
+
+
+def camera_key_object_infos_to_grid(
+    key_object_infos: Dict[str, Any],
+) -> Dict[str, Any]:
+    """Map every key and its 2D box from camera to grid coordinates."""
+    result = {}
+    for key, value in key_object_infos.items():
+        new_value = value.copy()
+        # Each key names its own camera, so look it up per entry, not from the first.
+        camera = key_object_key_to_dict(key).get("camera")
+        if "2d_bbox" in value and camera is not None:
+            box = value["2d_bbox"]
+            new_value["2d_bbox"] = (
+                *camera_point_to_grid_point((box[0], box[1]), camera),
+                *camera_point_to_grid_point((box[2], box[3]), camera),
+            )
+        result[camera_key_object_key_to_grid(key)] = new_value
+    return result
+
+
+def grid_key_object_infos_to_camera(
+    key_object_infos: Dict[str, Any],
+) -> Dict[str, Any]:
+    """Map every key and its 2D box from grid back to camera coordinates."""
+    result = {}
+    for key, value in key_object_infos.items():
+        new_value = value.copy()
+        # Each key names its own camera, so look it up per entry, not from the first.
+        camera = key_object_key_to_dict(key).get("camera")
+        if "2d_bbox" in value and camera is not None:
+            box = value["2d_bbox"]
+            new_value["2d_bbox"] = (
+                *grid_point_to_camera_point((box[0], box[1]), camera),
+                *grid_point_to_camera_point((box[2], box[3]), camera),
+            )
+        result[grid_key_objects_key_to_camera(key)] = new_value
+    return result
+
+
+def camera_key_object_key_to_grid(key_object_key: str) -> str:
+    """Map a key's x/y from camera to grid space; unparsable keys pass through."""
+    if not (koi := key_object_key_to_dict(key_object_key)):
+        logger.warning(f"Key object string '{key_object_key}' could not be parsed.")
+        return key_object_key
+    koi["x"], koi["y"] = camera_point_to_grid_point((koi["x"], koi["y"]), koi["camera"])
+    return key_object_dict_to_key(koi)
+
+
+def grid_key_objects_key_to_camera(key_object_key: str) -> str:
+    """Map a key's x/y from grid to camera space; unparsable keys pass through."""
+    if not (koi := key_object_key_to_dict(key_object_key)):
+        logger.warning(f"Key object string '{key_object_key}' could not be parsed.")
+        return key_object_key
+    koi["x"], koi["y"] = grid_point_to_camera_point((koi["x"], koi["y"]), koi["camera"])
+    return key_object_dict_to_key(koi)
+
+
+def normalize_key_object_infos(
+    key_object_infos: Dict[str, Any],
+    resize_factor: float = 1.0,
+    use_grid: bool = False,
+) -> Dict[str, Any]:
     if use_grid:
-        new_x, new_y = map_camera_point_to_grid_point(
-            (koi_dict["x"], koi_dict["y"]), koi_dict["camera"]
-        )
-    else:
-        new_x, new_y = koi_dict["x"], koi_dict["y"]
+        key_object_infos = camera_key_object_infos_to_grid(key_object_infos)
+    return scale_key_object_infos(key_object_infos, resize_factor)
+
+
+def denormalize_key_object_infos(
+    key_object_infos: Dict[str, Any],
+    resize_factor: float = 1.0,
+    use_grid: bool = False,
+) -> Dict[str, Any]:
+    """Reverse normalize_key_object_infos: unscale, then map grid to camera."""
+    unscaled = scale_key_object_infos(key_object_infos, 1.0 / resize_factor)
+    # Steps run in the opposite order of normalization (grid mapping, then scaling).
+    return grid_key_object_infos_to_camera(unscaled) if use_grid else unscaled
 
-    new_x, new_y = scale_key_object_point((new_x, new_y), resize_factor)
 
-    koi_dict["x"] = new_x
-    koi_dict["y"] = new_y
+def normalize_key_objects_in_text(
+    text: str,
+    resize_factor: float = 1.0,
+    use_grid: bool = False,
+) -> str:
+    def transform(key: str) -> str:
+        norm_key = key
+        if use_grid:
+            norm_key = camera_key_object_key_to_grid(norm_key)
+        norm_key = scale_key_object_key(norm_key, resize_factor)
+        return norm_key
 
-    return key_object_dict_to_str(koi_dict)
+    return _transform_key_objects_in_text(text, transform)
 
 
-def normalise_key_objects_in_text(
+def denormalize_key_objects_in_text(
     text: str,
-    resize_factor: float,
-    use_grid: bool,
+    resize_factor: float = 1.0,
+    use_grid: bool = False,
 ) -> str:
-    descriptors = find_key_objects(text)
-    for desc in descriptors:
-        norm_desc = normalise_key_object_descriptor(
-            desc,
-            resize_factor,
-            use_grid,
-        )
-        text = text.replace(desc, norm_desc)
-    return text
+    def transform(key: str) -> str:
+        denorm_key = key
+        denorm_key = scale_key_object_key(denorm_key, 1.0 / resize_factor)
+        if use_grid:
+            denorm_key = grid_key_objects_key_to_camera(denorm_key)
+        return denorm_key
+
+    return _transform_key_objects_in_text(text, transform)
 
 
-def map_camera_point_to_grid_point(
+def camera_point_to_grid_point(
     point: Tuple[float, float],
     cam_name: str,
 ) -> Tuple[float, float]:
@@ -275,3 +358,16 @@ def map_camera_point_to_grid_point(
     x_offset = col * img_width
     y_offset = row * img_height
     return (point[0] + x_offset, point[1] + y_offset)
+
+
+def grid_point_to_camera_point(
+    point: Tuple[float, float],
+    cam_name: str,
+) -> Tuple[float, float]:
+    """Subtract the camera's grid-cell offset; unknown cameras return point as-is."""
+    if cam_name in GRID_POSITIONS:
+        col, row = GRID_POSITIONS[cam_name]
+        height, width = IMAGE_SIZE
+        # Offsets are column * image width and row * image height.
+        return (point[0] - col * width, point[1] - row * height)
+    return point
diff --git a/tests/test_basic_dataset.py b/tests/test_basic_dataset.py
index 48df1b8..370aa29 100644
--- a/tests/test_basic_dataset.py
+++ b/tests/test_basic_dataset.py
@@ -6,7 +6,7 @@
 from src.data.basic_dataset import DriveLMImageDataset
 from src.data.message_formats import QwenMessageFormat
 from src.utils.logger import get_logger
-from src.utils.utils import create_subset, key_object_str_to_dict
+from src.utils.utils import create_subset, key_object_key_to_dict
 
 logger = get_logger(__name__)
 
@@ -312,8 +312,8 @@ def test_dataset_with_rescaling(self):
                     orig_item.key_object_info.items(),
                     rescaled_item.key_object_info.items(),
                 ):
-                    orig_koi = key_object_str_to_dict(orig_key)
-                    rescaled_koi = key_object_str_to_dict(rescaled_key)
+                    orig_koi = key_object_key_to_dict(orig_key)
+                    rescaled_koi = key_object_key_to_dict(rescaled_key)
                     assert (
                         abs(orig_koi["x"] * resize_factor - rescaled_koi["x"]) < 1e-2
                     ), (
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 27fd3fe..dc87431 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,8 +1,13 @@
 import unittest
+
+from src.constants import GRID_POSITIONS, IMAGE_SIZE
 from src.utils.utils import (
-    normalise_key_objects_in_text,
-    key_object_str_to_dict,
+    denormalize_key_object_infos,
+    denormalize_key_objects_in_text,
     find_key_objects,
+    key_object_key_to_dict,
+    normalize_key_object_infos,
+    normalize_key_objects_in_text,
 )
 
 
@@ -11,14 +16,14 @@ def test_key_object_descriptor_scaling_and_inverse(self):
         text = "What object would consider <c1,CAM_BACK,1088.3,497.5> to be most relevant to its decision?"
         resize_factor = 0.5
 
-        scaled_text = normalise_key_objects_in_text(
+        scaled_text = normalize_key_objects_in_text(
             text,
             resize_factor=resize_factor,
             use_grid=False,
         )
 
-        scaled_koi = key_object_str_to_dict(find_key_objects(scaled_text)[0])
-        orig_koi = key_object_str_to_dict(find_key_objects(text)[0])
+        scaled_koi = key_object_key_to_dict(find_key_objects(scaled_text)[0])
+        orig_koi = key_object_key_to_dict(find_key_objects(text)[0])
 
         assert abs(orig_koi["x"] * resize_factor - scaled_koi["x"]) < 1e-2, (
             f"x coordinate not scaled correctly: {orig_koi['x']} -> {scaled_koi['x']}"
@@ -27,15 +32,115 @@ def test_key_object_descriptor_scaling_and_inverse(self):
             f"y coordinate not scaled correctly: {orig_koi['y']} -> {scaled_koi['y']}"
         )
 
-        unscaled_text = normalise_key_objects_in_text(
+        unscaled_text = denormalize_key_objects_in_text(
             scaled_text,
-            resize_factor=1 / resize_factor,
+            resize_factor=resize_factor,
             use_grid=False,
         )
-        unscaled_koi = key_object_str_to_dict(find_key_objects(unscaled_text)[0])
+        unscaled_koi = key_object_key_to_dict(find_key_objects(unscaled_text)[0])
+        assert abs(unscaled_koi["x"] - orig_koi["x"]) < 1e-2, (
+            f"x coordinate not unscaled correctly: {unscaled_koi['x']} -> {orig_koi['x']}"
+        )
+        assert abs(unscaled_koi["y"] - orig_koi["y"]) < 1e-2, (
+            f"y coordinate not unscaled correctly: {unscaled_koi['y']} -> {orig_koi['y']}"
+        )
+
+    def test_key_object_descriptor_scaling_with_grid(self):
+        text = "What object would consider <c1,CAM_BACK,1088.3,497.5> to be most relevant to its decision?"
+        resize_factor = 0.5
+
+        scaled_text = normalize_key_objects_in_text(
+            text,
+            resize_factor=resize_factor,
+            use_grid=True,
+        )
+
+        scaled_koi = key_object_key_to_dict(find_key_objects(scaled_text)[0])
+        orig_koi = key_object_key_to_dict(find_key_objects(text)[0])
+
+        cam_name = orig_koi["camera"]
+        col, row = GRID_POSITIONS[cam_name]
+        img_height, img_width = IMAGE_SIZE
+        x_offset = col * img_width
+        y_offset = row * img_height
+
+        expected_x = (orig_koi["x"] + x_offset) * resize_factor
+        expected_y = (orig_koi["y"] + y_offset) * resize_factor
+
+        assert abs(scaled_koi["x"] - expected_x) < 1e-2, (
+            f"x coordinate not grid-mapped and scaled correctly: {scaled_koi['x']} != {expected_x}"
+        )
+        assert abs(scaled_koi["y"] - expected_y) < 1e-2, (
+            f"y coordinate not grid-mapped and scaled correctly: {scaled_koi['y']} != {expected_y}"
+        )
+
+        unscaled_text = denormalize_key_objects_in_text(
+            scaled_text,
+            resize_factor=resize_factor,
+            use_grid=True,
+        )
+
+        unscaled_koi = key_object_key_to_dict(find_key_objects(unscaled_text)[0])
         assert abs(unscaled_koi["x"] - orig_koi["x"]) < 1e-2, (
             f"x coordinate not unscaled correctly: {unscaled_koi['x']} -> {orig_koi['x']}"
         )
         assert abs(unscaled_koi["y"] - orig_koi["y"]) < 1e-2, (
             f"y coordinate not unscaled correctly: {unscaled_koi['y']} -> {orig_koi['y']}"
         )
+
+    def test_key_object_infos_scaling_and_inverse_with_grid(self):
+        key_object_infos = {
+            "<c1,CAM_BACK,1088.3,497.5>": {
+                "Category": "Vehicle",
+                "Status": "Moving",
+                "Visual_description": "Brown SUV.",
+                "2d_bbox": [966.6, 403.3, 1224.1, 591.7],
+            }
+        }
+        resize_factor = 1
+        cam_name = "CAM_BACK"
+        col, row = GRID_POSITIONS[cam_name]
+        img_height, img_width = IMAGE_SIZE
+        x_offset = col * img_width
+        y_offset = row * img_height
+
+        normalized_infos = normalize_key_object_infos(
+            key_object_infos,
+            resize_factor=resize_factor,
+            use_grid=True,
+        )
+
+        norm_key = list(normalized_infos.keys())[0]
+        norm_koi = key_object_key_to_dict(norm_key)
+        orig_koi = key_object_key_to_dict(list(key_object_infos.keys())[0])
+
+        expected_x = (orig_koi["x"] + x_offset) * resize_factor
+        expected_y = (orig_koi["y"] + y_offset) * resize_factor
+        assert abs(norm_koi["x"] - expected_x) < 1e-2
+        assert abs(norm_koi["y"] - expected_y) < 1e-2
+
+        orig_bbox = key_object_infos[list(key_object_infos.keys())[0]]["2d_bbox"]
+        norm_bbox = normalized_infos[norm_key]["2d_bbox"]
+        expected_bbox = [
+            (orig_bbox[0] + x_offset) * resize_factor,
+            (orig_bbox[1] + y_offset) * resize_factor,
+            (orig_bbox[2] + x_offset) * resize_factor,
+            (orig_bbox[3] + y_offset) * resize_factor,
+        ]
+        for a, b in zip(norm_bbox, expected_bbox):
+            assert abs(a - b) < 1e-2
+
+        denormalized_infos = denormalize_key_object_infos(
+            normalized_infos,
+            resize_factor=resize_factor,
+            use_grid=True,
+        )
+
+        denorm_key = list(denormalized_infos.keys())[0]
+        denorm_koi = key_object_key_to_dict(denorm_key)
+        assert abs(denorm_koi["x"] - orig_koi["x"]) < 1e-2
+        assert abs(denorm_koi["y"] - orig_koi["y"]) < 1e-2
+
+        denorm_bbox = denormalized_infos[denorm_key]["2d_bbox"]
+        for a, b in zip(denorm_bbox, orig_bbox):
+            assert abs(a - b) < 1e-2

From ad07a7d5a77dc2d6e7f480a8598b0bde9e731429 Mon Sep 17 00:00:00 2001
From: Caspar Siemssen <c.siemssen@campus.tu-berlin.de>
Date: Fri, 8 Aug 2025 14:06:25 +0200
Subject: [PATCH 20/25] add normalisation of kois in gradio ui with image
 switch

---
 gradio_app.py | 89 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 64 insertions(+), 25 deletions(-)

diff --git a/gradio_app.py b/gradio_app.py
index 033ecb8..cdc7487 100644
--- a/gradio_app.py
+++ b/gradio_app.py
@@ -9,6 +9,12 @@
 from src.models.gemma_inference import GemmaInferenceEngine
 from src.models.intern_vl_inference import InternVLInferenceEngine
 from src.models.qwen_vl_inference import QwenVLInferenceEngine
+from src.utils.utils import (
+    denormalize_key_object_infos,
+    denormalize_key_objects_in_text,
+    normalize_key_object_infos,
+    normalize_key_objects_in_text,
+)
 
 PROVIDERS = {
     "Google": {
@@ -219,10 +225,48 @@ def get_image():
 
 def update_image(evt: gr.SelectData):
     global image_paths_list, selected_question_item
-    if 0 <= evt.index < len(image_paths_list):
-        selected_question_item.image_path = image_paths_list[evt.index]
-        return [selected_question_item.image_path, get_formatted_question()]
-    return [None, get_formatted_question()]
+    if 0 > evt.index or evt.index >= len(image_paths_list):
+        raise gr.Error("Invalid image selection", duration=2)
+
+    previous_image_path = get_image()
+    selected_question_item.image_path = image_paths_list[evt.index]
+
+    was_grid = "GRID" in previous_image_path
+    is_grid = "GRID" in selected_question_item.image_path
+
+    if not was_grid and is_grid:
+        if selected_question_item.key_object_info is not None:
+            selected_question_item.key_object_info = normalize_key_object_infos(
+                selected_question_item.key_object_info,
+                resize_factor=1,
+                use_grid=is_grid,
+            )
+        selected_question_item.question = normalize_key_objects_in_text(
+            selected_question_item.question,
+            resize_factor=1,
+            use_grid=is_grid,
+        )
+
+    if was_grid and not is_grid:
+        if selected_question_item.key_object_info is not None:
+            selected_question_item.key_object_info = denormalize_key_object_infos(
+                selected_question_item.key_object_info,
+                resize_factor=1,
+                use_grid=was_grid,
+            )
+
+        selected_question_item.question = denormalize_key_objects_in_text(
+            selected_question_item.question,
+            resize_factor=1,
+            use_grid=was_grid,
+        )
+
+    return [
+        selected_question_item.image_path,
+        get_question(),
+        get_kois(),
+        get_formatted_question(),
+    ]
 
 
 def get_question():
@@ -283,11 +327,8 @@ def get_ground_truth(invalid=False):
 def get_kois():
     global selected_question_item
     if selected_question_item is not None:
-        return [
-            selected_question_item.key_object_info,
-            get_formatted_question(),
-        ]
-    return [None, get_formatted_question()]
+        return selected_question_item.key_object_info
+    return None
 
 
 def update_kois_active(active):
@@ -373,12 +414,7 @@ def update_question_item_on_question_id(
             selected_question_item = copy.deepcopy(item)
             break
 
-    return [
-        get_question(),
-        get_system_prompt(),
-        get_formatted_question(),
-        get_ground_truth(),
-    ]
+    return [*render_question_item_change_on_question_id(question_id_items)]
 
 
 def render_question_item_change_on_scene_id(items):
@@ -400,21 +436,22 @@ def render_question_item_change_on_keyframe(items):
 
 
 def render_question_item_change_on_question_type(items):
+    question_id_items = filter_question_items(items, question_id=get_question_id())
     return [
         render_question_ids(items),
-        *render_question_item_change_on_question_id(),
-        get_kois()[0],
-        get_images(items),
-        get_image(),
+        *render_question_item_change_on_question_id(question_id_items),
+        get_kois(),
     ]
 
 
-def render_question_item_change_on_question_id():
+def render_question_item_change_on_question_id(items):
     return [
         get_question(),
         get_system_prompt(),
         get_formatted_question(),
         get_ground_truth(),
+        get_images(items),
+        get_image(),
     ]
 
 
@@ -586,9 +623,9 @@ def raise_dataset_error():
             system_prompt_textbox,
             format_message_json,
             ground_truth_textbox,
-            kois_json,
             image_gallery,
             image,
+            kois_json,
         ],
     )
 
@@ -613,9 +650,9 @@ def raise_dataset_error():
             system_prompt_textbox,
             format_message_json,
             ground_truth_textbox,
-            kois_json,
             image_gallery,
             image,
+            kois_json,
         ],
     )
     keyframe_id_dropdown.input(
@@ -628,9 +665,9 @@ def raise_dataset_error():
             system_prompt_textbox,
             format_message_json,
             ground_truth_textbox,
-            kois_json,
             image_gallery,
             image,
+            kois_json,
         ],
     )
 
@@ -643,9 +680,9 @@ def raise_dataset_error():
             system_prompt_textbox,
             format_message_json,
             ground_truth_textbox,
-            kois_json,
             image_gallery,
             image,
+            kois_json,
         ],
     )
 
@@ -662,13 +699,15 @@ def raise_dataset_error():
             system_prompt_textbox,
             format_message_json,
             ground_truth_textbox,
+            image_gallery,
+            image,
         ],
     )
 
     image_gallery.select(
         fn=update_image,
         inputs=None,
-        outputs=[image, format_message_json],
+        outputs=[image, question_textbox, kois_json, format_message_json],
     )
 
     question_textbox.submit(

From 4690a6c96fd1513d03d0ea6508d3f9676c6f5e38 Mon Sep 17 00:00:00 2001
From: Caspar Siemssen <c.siemssen@campus.tu-berlin.de>
Date: Fri, 8 Aug 2025 16:09:11 +0200
Subject: [PATCH 21/25] performance optimisation

---
 gradio_app.py | 166 +++++++++++++++++++++++++-------------------------
 1 file changed, 83 insertions(+), 83 deletions(-)

diff --git a/gradio_app.py b/gradio_app.py
index cdc7487..40d906d 100644
--- a/gradio_app.py
+++ b/gradio_app.py
@@ -65,7 +65,7 @@ def get_dataset(split, add_kois, add_bev, use_grid, use_system_prompt):
     if inference_engine is None:
         raise gr.Error("Please load model first", duration=2)
 
-    dataset = DriveLMImageDataset(
+    flat_dataset = DriveLMImageDataset(
         message_format=inference_engine.message_formatter,
         split=split,
         front_cam=True,
@@ -74,6 +74,19 @@ def get_dataset(split, add_kois, add_bev, use_grid, use_system_prompt):
         use_grid=use_grid,
         use_system_prompt=use_system_prompt,
     )
+
+    dataset = {}
+    for item in flat_dataset:
+        scene_id = parse_scene_id(item.qa_id)
+        keyframe_id = parse_keyframe_id(item.qa_id)
+        question_id = parse_question_id(item.qa_id)
+        question_type = item.qa_type
+
+        dataset.setdefault(scene_id, {})
+        dataset[scene_id].setdefault(keyframe_id, {})
+        dataset[scene_id][keyframe_id].setdefault(question_type, {})
+        dataset[scene_id][keyframe_id][question_type][question_id] = item
+
     raw_dataset = load_dataset(split)
     return split
 
@@ -108,6 +121,30 @@ def filter_question_items(
     ]
 
 
+def pick_question_item(
+    scene_id=None, keyframe_id=None, question_type=None, question_id=None
+):
+    global dataset, selected_question_item
+    if dataset is None or len(dataset) == 0:
+        raise_dataset_error()
+        return None
+
+    for s_id, keyframes in dataset.items():
+        if scene_id is not None and s_id != scene_id:
+            continue
+        for k_id, questions in keyframes.items():
+            if keyframe_id is not None and k_id != keyframe_id:
+                continue
+            for q_type, questions_dict in questions.items():
+                if question_type is not None and q_type != question_type:
+                    continue
+                for q_id, item in questions_dict.items():
+                    if question_id is not None and q_id != question_id:
+                        continue
+
+                    return copy.deepcopy(item)
+
+
 def get_scene_id():
     global selected_question_item
     if selected_question_item is not None:
@@ -115,14 +152,9 @@ def get_scene_id():
     return None
 
 
-def get_scenes(items):
-    return sorted({parse_scene_id(item.qa_id) for item in items})
-
-
-def render_scenes(items):
-    scene_ids = get_scenes(items)
+def render_scene_ids(scene_ids):
     return gr.update(
-        choices=scene_ids,
+        choices=sorted(scene_ids),
         value=get_scene_id(),
     )
 
@@ -134,30 +166,13 @@ def get_keyframe_id():
     return None
 
 
-def get_keyframes(items):
-    return sorted({parse_keyframe_id(item.qa_id) for item in items})
-
-
-def render_keyframes(items):
-    keyframe_ids = get_keyframes(items)
+def render_keyframe_ids(keyframe_ids):
     return gr.update(
-        choices=keyframe_ids,
+        choices=sorted(keyframe_ids),
         value=get_keyframe_id(),
     )
 
 
-def get_question_types(items):
-    return sorted({item.qa_type for item in items})
-
-
-def render_question_types(items):
-    question_types = get_question_types(items)
-    return gr.update(
-        choices=question_types,
-        value=get_question_type(),
-    )
-
-
 def get_question_type():
     global selected_question_item
     if selected_question_item is not None:
@@ -165,6 +180,13 @@ def get_question_type():
     return None
 
 
+def render_question_types(question_types):
+    return gr.update(
+        choices=sorted(question_types),
+        value=get_question_type(),
+    )
+
+
 def get_question_id():
     global selected_question_item
     if selected_question_item is not None:
@@ -172,19 +194,14 @@ def get_question_id():
     return None
 
 
-def get_question_ids(items):
-    return sorted({parse_question_id(item.qa_id) for item in items})
-
-
-def render_question_ids(items):
-    question_ids = get_question_ids(items)
+def render_question_ids(question_ids):
     return gr.update(
-        choices=question_ids,
+        choices=sorted(question_ids),
         value=get_question_id(),
     )
 
 
-def get_images(items):
+def get_images():
     global image_paths_list
     scene_id = get_scene_id()
     keyframe_id = get_keyframe_id()
@@ -343,56 +360,45 @@ def update_question_item():
         raise_dataset_error()
         return None
 
-    selected_question_item = copy.deepcopy(dataset[0])
-    scene_items = filter_question_items(dataset, scene_id=get_scene_id())
+    selected_question_item = pick_question_item()
+    scene_id = parse_scene_id(selected_question_item.qa_id)
+    scene = dataset[scene_id]
     return [
-        render_scenes(dataset),
-        *render_question_item_change_on_scene_id(scene_items),
+        render_scene_ids(dataset.keys()),
+        *render_question_item_change_on_scene_id(scene),
     ]
 
 
 def update_question_item_on_scene_id(scene_id):
     global dataset, selected_question_item
-    if dataset is None or len(dataset) == 0:
-        raise_dataset_error()
-        return None
-    for item in dataset:
-        if parse_scene_id(item.qa_id) == scene_id:
-            selected_question_item = copy.deepcopy(item)
-            break
+    selected_question_item = pick_question_item(scene_id=scene_id)
 
-    scene_items = filter_question_items(items=dataset, scene_id=scene_id)
-    return [*render_question_item_change_on_scene_id(scene_items)]
+    return [*render_question_item_change_on_scene_id(dataset[scene_id])]
 
 
 def update_question_item_on_keyframe_id(scene_id, keyframe_id):
     global dataset, selected_question_item
-    keyframe_items = filter_question_items(
-        items=dataset, scene_id=scene_id, keyframe_id=keyframe_id
+    selected_question_item = pick_question_item(
+        scene_id=scene_id, keyframe_id=keyframe_id
     )
-    for item in keyframe_items:
-        selected_question_item = copy.deepcopy(item)
-        break
 
-    return [*render_question_item_change_on_keyframe(keyframe_items)]
+    keyframe = dataset[scene_id][keyframe_id]
+    return [*render_question_item_change_on_keyframe(keyframe)]
 
 
 def update_question_item_on_question_type(scene_id, keyframe_id, question_type):
     global dataset, selected_question_item
 
-    question_type_items = filter_question_items(
-        items=dataset,
+    selected_question_item = pick_question_item(
         scene_id=scene_id,
         keyframe_id=keyframe_id,
         question_type=question_type,
     )
 
-    for item in question_type_items:
-        selected_question_item = copy.deepcopy(item)
-        break
+    question_type = dataset[scene_id][keyframe_id][question_type]
 
     return [
-        *render_question_item_change_on_question_type(question_type_items),
+        *render_question_item_change_on_question_type(question_type),
     ]
 
 
@@ -401,56 +407,50 @@ def update_question_item_on_question_id(
 ):
     global dataset, selected_question_item
 
-    question_id_items = filter_question_items(
-        items=dataset,
+    selected_question_item = pick_question_item(
         scene_id=scene_id,
         keyframe_id=keyframe_id,
         question_type=question_type,
         question_id=question_id,
     )
 
-    for item in question_id_items:
-        if parse_question_id(item.qa_id) == question_id:
-            selected_question_item = copy.deepcopy(item)
-            break
+    question_id = dataset[scene_id][keyframe_id][question_type][question_id]
 
-    return [*render_question_item_change_on_question_id(question_id_items)]
+    return [*render_question_item_change_on_question_id(question_id)]
 
 
-def render_question_item_change_on_scene_id(items):
-    keyframe_items = filter_question_items(items, keyframe_id=get_keyframe_id())
+def render_question_item_change_on_scene_id(scene):
+    keyframe = scene[get_keyframe_id()]
     return [
-        render_keyframes(items),
-        *render_question_item_change_on_keyframe(keyframe_items),
+        render_keyframe_ids(scene.keys()),
+        *render_question_item_change_on_keyframe(keyframe),
     ]
 
 
-def render_question_item_change_on_keyframe(items):
-    question_type_items = filter_question_items(
-        items, question_type=get_question_type()
-    )
+def render_question_item_change_on_keyframe(keyframe):
+    question_type = keyframe[get_question_type()]
     return [
-        render_question_types(items),
-        *render_question_item_change_on_question_type(question_type_items),
+        render_question_types(keyframe.keys()),
+        *render_question_item_change_on_question_type(question_type),
     ]
 
 
-def render_question_item_change_on_question_type(items):
-    question_id_items = filter_question_items(items, question_id=get_question_id())
+def render_question_item_change_on_question_type(question_type):
+    question_ids = question_type[get_question_id()]
     return [
-        render_question_ids(items),
-        *render_question_item_change_on_question_id(question_id_items),
+        render_question_ids(question_type.keys()),
+        *render_question_item_change_on_question_id(question_ids),
         get_kois(),
     ]
 
 
-def render_question_item_change_on_question_id(items):
+def render_question_item_change_on_question_id(question_id):
     return [
         get_question(),
         get_system_prompt(),
         get_formatted_question(),
         get_ground_truth(),
-        get_images(items),
+        get_images(),
         get_image(),
     ]
 

From 6cf67e70c38d8bfdf5e55edaf049aac0a4e6bc0a Mon Sep 17 00:00:00 2001
From: Caspar Siemssen <c.siemssen@campus.tu-berlin.de>
Date: Fri, 8 Aug 2025 16:21:54 +0200
Subject: [PATCH 22/25] add prediction history

---
 gradio_app.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/gradio_app.py b/gradio_app.py
index 40d906d..f06320e 100644
--- a/gradio_app.py
+++ b/gradio_app.py
@@ -48,6 +48,7 @@
 image_paths_list = []
 selected_question_item = None
 kois_active = True
+prediction_history = {}
 
 
 def get_engine(provider, model):
@@ -354,6 +355,13 @@ def update_kois_active(active):
     return [active, get_formatted_question()]
 
 
+def get_answer(question_id):
+    global prediction_history
+    if question_id in prediction_history:
+        return prediction_history[question_id]
+    return None
+
+
 def update_question_item():
     global dataset, selected_question_item
     if dataset is None or len(dataset) == 0:
@@ -452,6 +460,7 @@ def render_question_item_change_on_question_id(question_id):
         get_ground_truth(),
         get_images(),
         get_image(),
+        get_answer(question_id.qa_id),
     ]
 
 
@@ -474,9 +483,11 @@ def get_formatted_question():
 
 
 def predict_question(formatted_question):
-    global inference_engine
+    global inference_engine, prediction_history, selected_question_item
 
     responses = inference_engine.predict_batch([[formatted_question]])
+    question_id = selected_question_item.qa_id
+    prediction_history[question_id] = responses[0]
     return responses[0]
 
 
@@ -625,6 +636,7 @@ def raise_dataset_error():
             ground_truth_textbox,
             image_gallery,
             image,
+            response_textbox,
             kois_json,
         ],
     )
@@ -652,6 +664,7 @@ def raise_dataset_error():
             ground_truth_textbox,
             image_gallery,
             image,
+            response_textbox,
             kois_json,
         ],
     )
@@ -667,6 +680,7 @@ def raise_dataset_error():
             ground_truth_textbox,
             image_gallery,
             image,
+            response_textbox,
             kois_json,
         ],
     )
@@ -682,6 +696,7 @@ def raise_dataset_error():
             ground_truth_textbox,
             image_gallery,
             image,
+            response_textbox,
             kois_json,
         ],
     )
@@ -701,6 +716,7 @@ def raise_dataset_error():
             ground_truth_textbox,
             image_gallery,
             image,
+            response_textbox,
         ],
     )
 

From b492f7e0787c2df56e7370b1b695bdc30b2d4f29 Mon Sep 17 00:00:00 2001
From: Caspar Siemssen <c.siemssen@campus.tu-berlin.de>
Date: Fri, 8 Aug 2025 21:51:57 +0200
Subject: [PATCH 23/25] add model to response history

---
 gradio_app.py | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/gradio_app.py b/gradio_app.py
index f06320e..8f34c1b 100644
--- a/gradio_app.py
+++ b/gradio_app.py
@@ -357,11 +357,16 @@ def update_kois_active(active):
 
 def get_answer(question_id):
     global prediction_history
-    if question_id in prediction_history:
-        return prediction_history[question_id]
+    if question_id in prediction_history and prediction_history[question_id]:
+        return prediction_history[question_id][-1]["answer"]
     return None
 
 
+def get_answer_history(question_id):
+    global prediction_history
+    return prediction_history.get(question_id, None)
+
+
 def update_question_item():
     global dataset, selected_question_item
     if dataset is None or len(dataset) == 0:
@@ -461,6 +466,7 @@ def render_question_item_change_on_question_id(question_id):
         get_images(),
         get_image(),
         get_answer(question_id.qa_id),
+        get_answer_history(question_id.qa_id),
     ]
 
 
@@ -487,8 +493,15 @@ def predict_question(formatted_question):
 
     responses = inference_engine.predict_batch([[formatted_question]])
     question_id = selected_question_item.qa_id
-    prediction_history[question_id] = responses[0]
-    return responses[0]
+    prediction_history.setdefault(question_id, []).append(
+        {
+            "model": inference_engine.model_path,
+            "message": selected_question_item.formatted_message,
+            "answer": responses[0],
+        }
+    )
+
+    return [responses[0], get_answer_history(question_id)]
 
 
 def raise_dataset_error():
@@ -602,6 +615,11 @@ def raise_dataset_error():
 
         response_textbox = gr.Textbox(label="Answer", value=None, interactive=False)
 
+        with gr.Accordion("Response History", open=False):
+            response_history_json = gr.JSON(
+                value=None,
+            )
+
         ground_truth_textbox = gr.Textbox(
             label="Ground Truth Answer",
             value=None,
@@ -637,6 +655,7 @@ def raise_dataset_error():
             image_gallery,
             image,
             response_textbox,
+            response_history_json,
             kois_json,
         ],
     )
@@ -665,6 +684,7 @@ def raise_dataset_error():
             image_gallery,
             image,
             response_textbox,
+            response_history_json,
             kois_json,
         ],
     )
@@ -681,6 +701,7 @@ def raise_dataset_error():
             image_gallery,
             image,
             response_textbox,
+            response_history_json,
             kois_json,
         ],
     )
@@ -697,6 +718,7 @@ def raise_dataset_error():
             image_gallery,
             image,
             response_textbox,
+            response_history_json,
             kois_json,
         ],
     )
@@ -717,6 +739,7 @@ def raise_dataset_error():
             image_gallery,
             image,
             response_textbox,
+            response_history_json,
         ],
     )
 
@@ -747,7 +770,7 @@ def raise_dataset_error():
     send_button.click(
         fn=predict_question,
         inputs=format_message_json,
-        outputs=response_textbox,
+        outputs=[response_textbox, response_history_json],
     )
 
 

From b66d4eb80dd34414eaf70d099d4ea8e4a909937a Mon Sep 17 00:00:00 2001
From: Caspar Siemssen <c.siemssen@campus.tu-berlin.de>
Date: Fri, 8 Aug 2025 22:05:27 +0200
Subject: [PATCH 24/25] remove constants merge error

---
 src/constants.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/constants.py b/src/constants.py
index c3705ef..ffde349 100644
--- a/src/constants.py
+++ b/src/constants.py
@@ -21,8 +21,6 @@
     IMAGE_SIZE[0] * GRID[0],
     IMAGE_SIZE[1] * GRID[1],
 )  # (height, width)
-BEV_IMG_SIZE = (500, 500)
-BEV_AND_FRONT_CAM_IMG_SIZE = (500, 1388)
 
 GRID_POSITIONS = {
     "CAM_FRONT_LEFT": (0, 0),

From a58ceeeebe6b6e077d7a7690c291c652b7f013cd Mon Sep 17 00:00:00 2001
From: Caspar Siemssen <c.siemssen@campus.tu-berlin.de>
Date: Fri, 8 Aug 2025 22:21:42 +0200
Subject: [PATCH 25/25] extend approach comments in notebooks

---
 notebooks/eval.ipynb  | 2 +-
 notebooks/test.ipynb  | 2 +-
 notebooks/train.ipynb | 6 +++++-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/notebooks/eval.ipynb b/notebooks/eval.ipynb
index 6c56455..201a689 100644
--- a/notebooks/eval.ipynb
+++ b/notebooks/eval.ipynb
@@ -51,7 +51,7 @@
     "from src.utils.approach import get_approach_kwargs, get_approach_name\n",
     "from src.utils.utils import get_resize_image_size\n",
     "\n",
-    "approaches = []  # [\"image_grid\", \"reasoning\", \"system_prompt\"]\n",
+    "approaches = []  # [\"image_grid\", \"reasoning\", \"system_prompt\", \"front_cam\", \"add_kois\", \"add_bev\"]\n",
     "resize_factor = 0.25  # [0.25, 0.5, 0.75, 1]\n",
     "batch_size = 30\n",
     "dataset_split = \"val\"\n",
diff --git a/notebooks/test.ipynb b/notebooks/test.ipynb
index f5d544d..db6ef41 100644
--- a/notebooks/test.ipynb
+++ b/notebooks/test.ipynb
@@ -51,7 +51,7 @@
     "from src.utils.approach import get_approach_kwargs, get_approach_name\n",
     "from src.utils.utils import get_resize_image_size\n",
     "\n",
-    "approaches = []  # [\"image_grid\", \"reasoning\", \"system_prompt\"]\n",
+    "approaches = []  # [\"image_grid\", \"reasoning\", \"system_prompt\", \"front_cam\", \"add_kois\", \"add_bev\"]\n",
     "resize_factor = 0.25  # [0.25, 0.5, 0.75, 1]\n",
     "batch_size = 30\n",
     "test_set_size = 29450\n",
diff --git a/notebooks/train.ipynb b/notebooks/train.ipynb
index 4b6a97a..e699f42 100644
--- a/notebooks/train.ipynb
+++ b/notebooks/train.ipynb
@@ -49,7 +49,11 @@
    "source": [
     "from src.utils.approach import get_approach_kwargs, get_approach_name\n",
     "\n",
-    "approaches = [\"image_grid\", \"descriptor_quas\", \"reasoning\"]\n",
+    "approaches = [\n",
+    "    \"image_grid\",\n",
+    "    \"descriptor_quas\",\n",
+    "    \"reasoning\",\n",
+    "]  # [\"image_grid\", \"descriptor_quas\", \"reasoning\", \"system_prompt\", \"front_cam\", \"add_kois\", \"add_bev\"]\n",
     "batch_size = 4\n",
     "test_set_size = 37759\n",
     "resize_factor = 0.2\n",