From 08815d920d1f1c49d49add0ced0d5bb98f6ec094 Mon Sep 17 00:00:00 2001 From: Veit Laule Date: Thu, 7 Aug 2025 11:10:41 +0200 Subject: [PATCH 1/2] add new approaches to training code --- src/data/basic_dataset.py | 3 +++ src/models/qwen_vl_inference.py | 2 +- src/reasoning/reasoning_engine.py | 2 +- src/train/train_qwen.py | 11 +++++++++-- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/data/basic_dataset.py b/src/data/basic_dataset.py index 24d0559..fb97071 100644 --- a/src/data/basic_dataset.py +++ b/src/data/basic_dataset.py @@ -76,6 +76,9 @@ def __init__( data = load_dataset(split) if split == "train": + if add_bev: + data = get_calibration(data) + data = generate_bevs(data, front_cam=front_cam) data = normalise_key_object_infos(data, resize_factor, use_grid) if split == "train" and add_augmented: diff --git a/src/models/qwen_vl_inference.py b/src/models/qwen_vl_inference.py index 342c497..f3e6081 100644 --- a/src/models/qwen_vl_inference.py +++ b/src/models/qwen_vl_inference.py @@ -63,7 +63,7 @@ def load_model(self, flash_attn: bool = True) -> None: num_img_tokens = (height // patch_size) * (width // patch_size) num_img_pixel = num_img_tokens * patch_size * patch_size - logger.debug( + logger.info( f"Resizing images to {self.resize_image_size} with {num_img_tokens} visual tokens and {num_img_pixel} pixels." ) diff --git a/src/reasoning/reasoning_engine.py b/src/reasoning/reasoning_engine.py index 02f11aa..eb79c85 100644 --- a/src/reasoning/reasoning_engine.py +++ b/src/reasoning/reasoning_engine.py @@ -37,7 +37,7 @@ def process_batch(self, batch_items: List[QueryItem]) -> List[QueryItem]: image_path=item.image_path, qa_id=f"{item.qa_id}_reasoning", qa_type=item.qa_type, - key_object_info=item.key_object_info, # note not available in eval mode + key_object_info=item.key_object_info, system_prompt=item.system_prompt, ) desc_item.formatted_message = desc_item.format_message( diff --git a/src/train/train_qwen.py b/src/train/train_qwen.py index 36c8e0c..6a99c7a 100644 --- a/src/train/train_qwen.py +++ b/src/train/train_qwen.py @@ -41,7 +41,7 @@ class TrainingArguments(transformers.TrainingArguments): cache_dir: Optional[str] = field(default=None) optim: str = field(default="adamw_torch") model_max_length: int = field( - default=512, + default=1028, metadata={ "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)." }, @@ -242,7 +242,6 @@ def create_optimizer(self): return self.optimizer -# TODO: Look into the deepspeed config def train( approach_name: str, resize_factor: float, @@ -252,6 +251,9 @@ def train( use_augmented: bool = False, use_reasoning: bool = False, use_system_prompt: bool = False, + add_kois: bool = False, + add_bev: bool = False, + front_cam: bool = False, **kwargs, ): name = approach_name + datetime.now().strftime("%H:%M:%S-%m-%d-%Y%") @@ -310,6 +312,9 @@ def collator(batch: Any): dataset = DriveLMImageDataset( engine.training_message_formatter, split="train", + front_cam=front_cam, + add_kois=add_kois, + add_bev=add_bev, use_grid=use_grid, add_augmented=use_augmented, use_reasoning=use_reasoning, @@ -320,6 +325,8 @@ def collator(batch: Any): dataset = create_subset(dataset, int(test_set_size)) dataset = [item.formatted_message for item in dataset] + logger.info(dataset[0]) + engine.load_model(flash_attn=False) model = prepare_model_for_kbit_training( engine.model, use_gradient_checkpointing=True From 3d150d0d5c93470a993d1957b1acc83a9d063875 Mon Sep 17 00:00:00 2001 From: Veit Laule Date: Thu, 7 Aug 2025 11:25:52 +0200 Subject: [PATCH 2/2] make sure we use equal distribution when training --- src/models/qwen_vl_inference.py | 2 +- src/train/train_qwen.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/models/qwen_vl_inference.py b/src/models/qwen_vl_inference.py index f3e6081..342c497 100644 --- a/src/models/qwen_vl_inference.py +++ b/src/models/qwen_vl_inference.py @@ -63,7 +63,7 @@ def load_model(self, flash_attn: bool = True) -> None: num_img_tokens = (height // patch_size) * (width // patch_size) num_img_pixel = num_img_tokens * patch_size * patch_size - logger.info( + logger.debug( f"Resizing images to {self.resize_image_size} with {num_img_tokens} visual tokens and {num_img_pixel} pixels." ) diff --git a/src/train/train_qwen.py b/src/train/train_qwen.py index 6a99c7a..6f9655a 100644 --- a/src/train/train_qwen.py +++ b/src/train/train_qwen.py @@ -322,11 +322,9 @@ def collator(batch: Any): resize_factor=resize_factor, ) if test_set_size is not None: - dataset = create_subset(dataset, int(test_set_size)) + dataset = create_subset(dataset, int(test_set_size), equal_distribution=True) dataset = [item.formatted_message for item in dataset] - logger.info(dataset[0]) - engine.load_model(flash_attn=False) model = prepare_model_for_kbit_training( engine.model, use_gradient_checkpointing=True