Commit e17de33
refactor active learning code, multi gpu fails
bw4sz committed Dec 10, 2024 · 1 parent a3019bf
Showing 17 changed files with 233 additions and 240 deletions.
27 changes: 12 additions & 15 deletions conf/config.yaml
@@ -34,41 +34,45 @@ propagate:
detection_model:
checkpoint: bird
checkpoint_dir: /blue/ewhite/b.weinstein/BOEM/detection/checkpoints
validation_csv_path:
train_csv_folder: /blue/ewhite/b.weinstein/BOEM/annotations/train/
train_image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
crop_image_dir: /blue/ewhite/b.weinstein/BOEM/detection/crops/
limit_empty_frac: 0.05
fast_dev_run: false
labels:
- "Bird"
trainer:
train:
fast_dev_run: False
epochs: 3
lr: 0.00001
workers: 0

classification_model:
checkpoint:
checkpoint_dir: /blue/ewhite/b.weinstein/BOEM/classification/checkpoints
validation_csv_path:
train_csv_folder: /blue/ewhite/b.weinstein/BOEM/annotations/train
train_image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
crop_image_dir: /blue/ewhite/b.weinstein/BOEM/classification/crops/
under_sample_ratio: 0
trainer:
fast_dev_run: false
max_epochs: 1
fast_dev_run: False
max_epochs: 4
lr: 0.001

pipeline_evaluation:
detect_ground_truth_dir: /blue/ewhite/b.weinstein/BOEM/annotations/validation
classify_confident_ground_truth_dir: /blue/ewhite/b.weinstein/BOEM/annotations/validation
classify_uncertain_ground_truth_dir: /blue/ewhite/b.weinstein/BOEM/annotations/validation
classify_ground_truth_dir: /blue/ewhite/b.weinstein/BOEM/annotations/validation
# This is an average mAP threshold for now, but we may want to add a per-iou threshold in the future
detection_true_positive_threshold: 0.8
detection_false_positive_threshold: 0.5
classification_avg_score: 0.5
image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
debug: False

reporting:
report_dir: /blue/ewhite/b.weinstein/BOEM/reporting/reports
metadata: /blue/ewhite/b.weinstein/BOEM/reporting/metadata.csv
thin_factor: 500

active_learning:
image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27
@@ -84,7 +88,7 @@ active_learning:
# Optional parameters:
evaluation:
dask_client:
pool_limit: 5000
pool_limit: 500
gpus: 1

active_testing:
@@ -95,10 +99,3 @@ active_testing:
patch_size: 2000
patch_overlap: 0
min_score: 0.2

deepforest:
train:
fast_dev_run: False
epochs: 10
lr: 0.00001
workers: 0
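
The net effect of this config change: the old top-level `deepforest:` block (deleted above) moves under `detection_model.trainer`, matching the `config_args=config.detection_model.trainer` change in src/detection.py below. A minimal sketch of reading the relocated keys, assuming Hydra's standard compose API and the key layout shown above:

```python
# Minimal sketch, assuming Hydra's compose API; version_base handling
# may vary by Hydra release.
from hydra import compose, initialize

with initialize(version_base=None, config_path="conf"):
    cfg = compose(config_name="config")

# Trainer settings now live under each model's own section rather than
# a shared top-level `deepforest:` block.
print(cfg.detection_model.trainer.train.epochs)      # 3
print(cfg.classification_model.trainer.max_epochs)   # 4
print(cfg.active_learning.pool_limit)                # 500
```
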
27 changes: 27 additions & 0 deletions environment.yml
@@ -0,0 +1,27 @@
name: BOEM
channels:
- conda-forge
dependencies:
- python=3.10
- mamba
- geopandas
- hydra-core
- numpy
- pandas
- scikit-learn
- rasterio
- pytest
- label-studio-sdk
- dask
- bokeh
- pytorch
- torchvision
- paramiko
- omegaconf
- matplotlib
- torchmetrics
- pip:
- dask_jobqueue
- label-studio-converter
- comet-ml
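
The new environment.yml replaces the deleted requirements.txt below. Assuming standard conda tooling, the environment can be recreated with `conda env create -f environment.yml` (or `mamba env create -f environment.yml`) followed by `conda activate BOEM`; pip-only dependencies such as comet-ml install from the `pip:` block.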

31 changes: 0 additions & 31 deletions requirements.txt

This file was deleted.

3 changes: 2 additions & 1 deletion src/active_learning.py
@@ -5,7 +5,7 @@
import dask.array as da
import pandas as pd

def choose_train_images(evaluation, image_dir, strategy, n=10, patch_size=512, patch_overlap=0.1, min_score=0.1, model=None, model_path=None, dask_client=None, target_labels=None, pool_limit=1000):
def choose_train_images(evaluation, image_dir, strategy, n=10, patch_size=512, patch_overlap=0.1, min_score=0.1, model=None, model_path=None, classification_model=None, dask_client=None, target_labels=None, pool_limit=1000):
"""Choose images to annotate.
Args:
evaluation (dict): A dictionary of evaluation metrics.
@@ -20,6 +20,7 @@ def choose_train_images(evaluation, image_dir, strategy, n=10, patch_size=512, p
patch_overlap (float, optional): The amount of overlap between image patches. Defaults to 0.1.
min_score (float, optional): The minimum score for a prediction to be included. Defaults to 0.1.
model (main.deepforest, optional): A trained deepforest model. Defaults to None.
classification_model (main.deepforest, optional): A trained deepforest model for classification. Defaults to None.
model_path (str, optional): The path to the model checkpoint file. Defaults to None. Only used in combination with dask.
target_labels (list, optional): A list of target labels to filter images by. Defaults to None.
pool_limit (int, optional): The maximum number of images to consider. Defaults to 1000.
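For context, a hedged usage sketch of the widened signature; the import path, checkpoint path, image directory, and strategy value are illustrative assumptions, not taken from this commit:

```python
# Sketch only: paths and the strategy value are hypothetical.
from deepforest import main
from src.active_learning import choose_train_images

detection_model = main.deepforest.load_from_checkpoint("detection.ckpt")

train_images = choose_train_images(
    evaluation=None,               # no prior evaluation on a first iteration
    image_dir="/path/to/images",
    strategy="random",             # hypothetical strategy name
    n=10,
    model=detection_model,
    classification_model=None,     # new optional argument in this commit
    target_labels=["Bird"],
    pool_limit=1000,
)
```
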
5 changes: 4 additions & 1 deletion src/classification.py
@@ -57,6 +57,7 @@ def train(model, train_dir, val_dir, comet_workspace=None, comet_project=None, f

if comet_project:
comet_logger = CometLogger(project_name=comet_project, workspace=comet_workspace)
comet_logger.experiment.add_tags(["classification"])
else:
comet_logger = None

@@ -66,6 +67,9 @@
model.load_from_disk(train_dir=train_dir, val_dir=val_dir)
model.trainer.fit(model)

model.trainer.logger.experiment.end()
comet_logger.experiment.end()

return model

def preprocess_images(model, annotations, root_dir, save_dir):
Expand All @@ -82,7 +86,6 @@ def preprocess_and_train_classification(config, validation_df=None):
Args:
config: Configuration object containing training parameters
validation_df (pd.DataFrame): A DataFrame containing validation annotations.
comet_logger (CometLogger): A CometLogger object.
Returns:
trained_model: Trained model object
"""
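The new lines above tag the Comet experiment at creation and end it explicitly after fitting. A minimal sketch of that lifecycle, assuming pytorch_lightning's CometLogger and a configured Comet API key; workspace and project names are placeholders:

```python
# Sketch of the logger lifecycle; names are placeholders.
from pytorch_lightning.loggers import CometLogger

comet_logger = CometLogger(project_name="my-project", workspace="my-workspace")
comet_logger.experiment.add_tags(["classification"])

# ... trainer.fit(model) runs here ...

comet_logger.experiment.end()  # flush and close the run explicitly
```
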
1 change: 0 additions & 1 deletion src/data_processing.py
@@ -140,7 +140,6 @@ def process_image(
crop_csv = "{}.csv".format(os.path.join(save_dir, image_name))

if os.path.exists(crop_csv):
warn("Crops for {} already exist in {}. Skipping.".format(crop_csv, save_dir))
return pd.read_csv(crop_csv)

full_path = os.path.join(root_dir, image_path)
22 changes: 9 additions & 13 deletions src/detection.py
@@ -191,25 +191,24 @@ def train(model, train_annotations, test_annotations, train_image_dir, comet_pro
visualize.plot_results(prediction, savedir=tmpdir)
comet_logger.experiment.log_image(os.path.join(tmpdir, image_path))

comet_logger.experiment.end()
model.trainer.logger.experiment.end()

return model

def preprocess_and_train(config, validation_df=None, model_type="detection"):
def preprocess_and_train(config, model_type="detection"):
"""Preprocess data and train model.
Args:
config: Configuration object containing training parameters
validation_df (pd.DataFrame): A DataFrame containing validation annotations.
model_type (str): The type of model to train. Defaults to "detection".
Returns:
trained_model: Trained model object
"""
# Get and split annotations
annotations = gather_data(config.detection_model.train_csv_folder)

if validation_df is None:
train_df, validation_df = create_train_test(annotations)
else:
train_df = annotations[~annotations["image_path"].isin(validation_df["image_path"])]
train_df = gather_data(config.detection_model.train_csv_folder)
validation_df = gather_data(config.label_studio.csv_dir_validation)
validation_df.loc[validation_df.label==0,"label"] = "Bird"

# Preprocess train and validation data
train_df = data_processing.preprocess_images(train_df,
@@ -247,7 +246,7 @@ def preprocess_and_train(config, validation_df=None, model_type="detection"):
model=loaded_model,
comet_project=config.comet.project,
comet_workspace=config.comet.workspace,
config_args=config.deepforest)
config_args=config.detection_model.trainer)

return trained_model

@@ -277,9 +276,7 @@ def _predict_list_(image_paths, patch_size, patch_overlap, model_path, m=None, c
if m is None:
raise ValueError("A model or model_path is required for prediction.")

# if no trainer, create one
if m.trainer is None:
m.create_trainer()
m.create_trainer(fast_dev_run=False)

predictions = []
for image_path in image_paths:
@@ -318,7 +315,6 @@ def update_sys_path():
patch_size=patch_size,
patch_overlap=patch_overlap,
model_path=model_path,
m=m,
crop_model=crop_model)
block_futures.append(block_future)
# Get results
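Two related changes here: `m.create_trainer(fast_dev_run=False)` is now called unconditionally inside the worker, and the live model (`m=m`) is no longer shipped to dask blocks, only `model_path`. A sketch of the per-worker pattern this implies, assuming deepforest's checkpoint loading; function and variable names are illustrative:

```python
# Sketch, not the commit's exact code: each dask worker reloads the model
# from a checkpoint path instead of receiving a pickled model object.
from dask.distributed import Client
from deepforest import main

def predict_block(image_paths, model_path, patch_size, patch_overlap):
    m = main.deepforest.load_from_checkpoint(model_path)  # load inside the worker
    m.create_trainer(fast_dev_run=False)
    return [
        m.predict_tile(path, patch_size=patch_size, patch_overlap=patch_overlap)
        for path in image_paths
    ]

client = Client()  # or the pipeline's multi-GPU dask client
future = client.submit(predict_block, ["flight_001.jpg"], "detection.ckpt", 2000, 0)
predictions = future.result()
```
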
60 changes: 35 additions & 25 deletions src/pipeline.py
@@ -36,10 +36,12 @@ def save_model(self, model, directory):
checkpoint_path = os.path.join(directory, f"model_{timestamp}.ckpt")
model.trainer.save_checkpoint(checkpoint_path)

return checkpoint_path

def run(self):
# Check for new annotations if the check_annotations flag is set
if self.config.check_annotations:
new_annotations = label_studio.check_for_new_annotations(
new_train_annotations = label_studio.check_for_new_annotations(
sftp_client=self.sftp_client,
url=self.config.label_studio.url,
csv_dir=self.config.label_studio.csv_dir_train,
@@ -50,7 +52,7 @@ def run(self):
)

# Validation
new_annotations = label_studio.check_for_new_annotations(
new_val_annotations = label_studio.check_for_new_annotations(
sftp_client=self.sftp_client,
url=self.config.label_studio.url,
csv_dir=self.config.label_studio.csv_dir_validation,
@@ -59,15 +61,20 @@ def run(self):
images_to_annotate_dir=self.config.label_studio.images_to_annotate_dir,
annotated_images_dir=self.config.label_studio.annotated_images_dir,
)
if new_annotations is None:
if new_val_annotations is None:
if self.config.force_upload:
print("No new annotations, but force_upload is set to True, continuing")
self.skip_training = True
else:
print("No new annotations, exiting")
return None
else:
print(f"New annotations found: {len(new_annotations)}")
try:
print(f"New train annotations found: {len(new_train_annotations)}")
except:
pass
print(f"New val annotations found: {len(new_val_annotations)}")

self.skip_training = False

# Given new annotations, propagate labels to nearby images
@@ -76,37 +83,28 @@ def run(self):
# label_propagator.through_time(new_annotations)
else:
self.skip_training = False

if self.config.detection_model.validation_csv_path is not None:
validation_df = pd.read_csv(self.config.detection_model.validation_csv_path)
else:
validation_df = None

reporter = Reporting(report_dir=self.config.reporting.report_dir, image_dir=self.config.active_learning.image_dir)

if not self.skip_training:
trained_detection_model = detection.preprocess_and_train(
self.config, validation_df=validation_df)
self.config)

trained_classification_model = classification.preprocess_and_train_classification(
self.config, validation_df=validation_df)
self.config)

self.save_model(trained_detection_model,
detection_checkpoint_path = self.save_model(trained_detection_model,
self.config.detection_model.checkpoint_dir)
self.save_model(trained_classification_model,
classification_checkpoint_path = self.save_model(trained_classification_model,
self.config.classification_model.checkpoint_dir)

pipeline_monitor = PipelineEvaluation(
model=trained_detection_model,
crop_model=trained_classification_model,
**self.config.pipeline_evaluation)
model=trained_detection_model,
crop_model=trained_classification_model,
**self.config.pipeline_evaluation)

performance = pipeline_monitor.evaluate()
reporter.pipeline_monitor = pipeline_monitor

if pipeline_monitor.check_success():
print("Pipeline performance is satisfactory, exiting")
reporter.generate_report()
return None
else:
trained_detection_model = detection.load(
@@ -119,6 +117,8 @@ def run(self):
trained_classification_model = None

performance = None
pipeline_monitor = None
detection_checkpoint_path = None

if self.config.active_learning.gpus > 1:
dask_client = start(gpus=self.config.active_learning.gpus, mem_size="70GB")
@@ -145,6 +145,7 @@ def run(self):
train_images_to_annotate = choose_train_images(
evaluation=performance,
image_dir=self.config.active_learning.image_dir,
model_path=detection_checkpoint_path,
model=trained_detection_model,
strategy=self.config.active_learning.strategy,
n=self.config.active_learning.n_images,
@@ -167,9 +168,6 @@ def run(self):
min_score=self.config.active_learning.min_score
)

reporter.confident_predictions = confident_predictions
reporter.uncertain_predictions = uncertain_predictions

print(f"Images requiring human review: {len(confident_predictions)}")
print(f"Images auto-annotated: {len(uncertain_predictions)}")

@@ -191,6 +189,18 @@ def run(self):
preannotations=preannotations)


if reporter.pipeline_monitor is not None:
reporter.generate_report()
if pipeline_monitor:
reporter = Reporting(
report_dir=self.config.reporting.report_dir,
image_dir=self.config.active_learning.image_dir,
model=trained_detection_model,
classification_model=trained_classification_model,
thin_factor=self.config.reporting.thin_factor,
patch_overlap=self.config.active_learning.patch_overlap,
patch_size=self.config.active_learning.patch_size,
confident_predictions=confident_predictions,
uncertain_predictions=uncertain_predictions,
pipeline_monitor=pipeline_monitor)

reporter.generate_report()
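
The small `save_model` change is load-bearing for the multi-GPU path: it now returns the checkpoint path, which `run()` forwards to `choose_train_images` as `model_path` so dask workers can reload the model from disk. A sketch of the resulting contract; the timestamp construction is an assumption, since it sits outside the hunk shown above:

```python
# Sketch of save_model()'s new contract; timestamp construction assumed.
import os
from datetime import datetime

def save_model(model, directory):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    checkpoint_path = os.path.join(directory, f"model_{timestamp}.ckpt")
    model.trainer.save_checkpoint(checkpoint_path)
    return checkpoint_path  # new: run() forwards this as model_path
```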
