classification trains, pipeline tests need to be updated
bw4sz committed Mar 4, 2025
1 parent f06f611 commit 3660d7b
Showing 16 changed files with 188 additions and 143 deletions.
50 changes: 40 additions & 10 deletions USGS_classification.py
@@ -1,29 +1,59 @@
 from deepforest import model
 import pandas as pd
 import os
+import glob
 import comet_ml
 from pytorch_lightning.loggers import CometLogger
 from src.classification import preprocess_and_train_classification
 import hydra
 from omegaconf import DictConfig

+# Create train test split, split each class into 90% train and 10% test with a minimum of 10 images per class for test and a max of 100
+def train_test_split(df, test_size=0.1, min_test_images=10, max_test_images=100):
+    train_df = pd.DataFrame()
+    test_df = pd.DataFrame()
+
+    for label in df['label'].unique():
+        class_df = df[df['label'] == label]
+        test_count = max(min_test_images, int(len(class_df) * test_size))
+        test_count = min(test_count, max_test_images)
+
+        test_class_df = class_df.sample(n=test_count)
+        train_class_df = class_df.drop(test_class_df.index)
+
+        train_df = pd.concat([train_df, train_class_df])
+        test_df = pd.concat([test_df, test_class_df])
+
+    return train_df, test_df
+
 @hydra.main(config_path="conf", config_name="config")
 def main(cfg: DictConfig):
     # Override the classification_model config with USGS.yaml
     cfg = hydra.compose(config_name="config", overrides=["classification_model=USGS"])

     classification_cfg = cfg.classification_model

     # From the detection script
     savedir = "/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/crops"
-    train = pd.read_csv(os.path.join(savedir, "train.csv"))
-    test = pd.read_csv(os.path.join(savedir, "test.csv"))
+    crop_annotations = glob.glob("/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/crops/*.csv")
+    crop_annotations = [pd.read_csv(x) for x in crop_annotations]
+    crop_annotations = pd.concat(crop_annotations)
+
+    # Keep labels with more than 100 images
+    crop_annotations = crop_annotations.groupby("label").filter(lambda x: len(x) > 100)
+
+    # Only keep two word labels
+    crop_annotations = crop_annotations[crop_annotations["label"].str.contains(" ")]
+
-    comet_logger = CometLogger(project_name=cfg.project, workspace=cfg.workspace)
+    # Expand bounding boxes by 30 pixels on all sides
+    crop_annotations["xmin"] -= 30
+    crop_annotations["ymin"] -= 30
+    crop_annotations["xmax"] += 30
+    crop_annotations["ymax"] += 30
+
+    train_df, validation_df = train_test_split(crop_annotations)
+
+    comet_logger = CometLogger(project_name=cfg.comet.project, workspace=cfg.comet.workspace)
     preprocess_and_train_classification(
         config=cfg,
-        train_df=train,
-        validation_df=test,
+        train_df=train_df,
+        validation_df=validation_df,
         comet_logger=comet_logger
     )
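A quick sanity check of the new per-class split on toy data (hypothetical labels and counts; note that the script's filter to labels with more than 100 crops also keeps class_df.sample(n=test_count) from requesting more rows than a small class holds):

    import pandas as pd

    # Toy annotations: 120 crops of one label, 300 of another.
    df = pd.DataFrame({
        "label": ["Common Tern"] * 120 + ["Brown Pelican"] * 300,
        "image_path": [f"crop_{i}.png" for i in range(420)],
    })

    # train_test_split as defined in USGS_classification.py above.
    train_df, test_df = train_test_split(df)

    # 10% of 120 -> 12, clipped to [10, 100] -> 12 test images
    # 10% of 300 -> 30, clipped to [10, 100] -> 30 test images
    print(test_df["label"].value_counts())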

24 changes: 12 additions & 12 deletions conf/classification_model/USGS.yaml
@@ -1,12 +1,12 @@
-classification_model:
-  checkpoint:
-  checkpoint_dir: /blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/classification/checkpoints/
-  train_csv_folder:
-  train_image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
-  crop_image_dir: /blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/classification/
-  under_sample_ratio: 0
-  trainer:
-    fast_dev_run: True
-    max_epochs: 1
-    lr: 0.00001
-    batch_size: 16
+checkpoint:
+checkpoint_dir: /blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/classification/checkpoints/
+train_csv_folder:
+train_image_dir: /blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/crops
+crop_image_dir: /blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/classification/crops/
+under_sample_ratio: 0
+trainer:
+  fast_dev_run: False
+  max_epochs: 100
+  lr: 0.00001
+  batch_size: 16
+  workers: 10
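The reindentation here matters for Hydra: with the default packaging in Hydra 1.1+, a file in conf/classification_model/ is already placed under the classification_model key, so the old top-level classification_model: header nested the values twice. A minimal sketch of composing this file the way USGS_classification.py does (assuming conf/ is the config root; the version_base argument assumes Hydra >= 1.2):

    from hydra import compose, initialize

    with initialize(config_path="conf", version_base=None):
        cfg = compose(config_name="config", overrides=["classification_model=USGS"])

    print(cfg.classification_model.trainer.max_epochs)  # 100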
24 changes: 12 additions & 12 deletions conf/classification_model/finetune.yaml
@@ -1,12 +1,12 @@
-classification_model:
-  checkpoint:
-  checkpoint_dir: /blue/ewhite/b.weinstein/BOEM/classification/checkpoints
-  train_csv_folder: /blue/ewhite/b.weinstein/BOEM/annotations/train
-  train_image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
-  crop_image_dir: /blue/ewhite/b.weinstein/BOEM/classification/crops/
-  under_sample_ratio: 0
-  trainer:
-    fast_dev_run: True
-    max_epochs: 1
-    lr: 0.00001
-    batch_size: 16
+checkpoint:
+checkpoint_dir: /blue/ewhite/b.weinstein/BOEM/classification/checkpoints
+train_csv_folder: /blue/ewhite/b.weinstein/BOEM/annotations/train
+train_image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
+crop_image_dir: /blue/ewhite/b.weinstein/BOEM/classification/crops/
+under_sample_ratio: 0
+trainer:
+  fast_dev_run: False
+  max_epochs: 1
+  lr: 0.00001
+  batch_size: 16
+  workers: 10
12 changes: 6 additions & 6 deletions conf/config.yaml
@@ -34,7 +34,7 @@ predict:
   patch_size: 1000
   patch_overlap: 0
   min_score: 0.4
-  batch_size: 48
+  batch_size: 32

 pipeline:
   confidence_threshold: 0.9
@@ -54,14 +54,14 @@ detection_model:
   labels:
     - "Object"
   trainer:
-    batch_size: 4
+    batch_size: 12
     train:
       fast_dev_run: False
-      epochs: 10
-      lr: 0.000001
-      workers: 0
+      epochs: 20
+      lr: 0.00001
+      workers: 10
     validation:
-      val_accuracy_interval: 3
+      val_accuracy_interval: 5

pipeline_evaluation:
   detect_ground_truth_dir: /blue/ewhite/b.weinstein/BOEM/annotations/validation
44 changes: 29 additions & 15 deletions src/classification.py
@@ -4,6 +4,7 @@
 from PIL import Image

 from deepforest.model import CropModel
+import torch

 # Local imports
 from src.label_studio import gather_data
@@ -58,7 +59,7 @@ def load(checkpoint=None, annotations=None, checkpoint_dir=None, lr=0.0001, num_

     return loaded_model

-def train(model, train_dir, val_dir, comet_logger=None, fast_dev_run=False, max_epochs=10, batch_size=4):
+def train(model, train_dir, val_dir, comet_logger=None, fast_dev_run=False, max_epochs=10, batch_size=4, workers=0):
     """Train a model on labeled images.
     Args:
         model (CropModel): A CropModel object.
@@ -73,19 +74,32 @@ def train(model, train_dir, val_dir, comet_logger=None, fast_dev_run=False, max_
         main.deepforest: A trained deepforest model.
     """
     model.batch_size = batch_size
-    model.create_trainer(logger=comet_logger, fast_dev_run=fast_dev_run, max_epochs=max_epochs)
+    model.num_workers = workers
+
+    devices = torch.cuda.device_count()
+    model.create_trainer(logger=comet_logger, fast_dev_run=fast_dev_run, max_epochs=max_epochs, num_nodes=1, devices=devices)

     # Get the data stored from the write_crops processing.
     model.load_from_disk(train_dir=train_dir, val_dir=val_dir)

-    # Log the validation dataset images
+    model.label_dict = model.train_ds.class_to_idx
+    model.numeric_to_label = {v: k for k, v in model.train_ds.class_to_idx.items()}
+
+    # Log the validation dataset images, max 10 per class
+    label_count = {}
+    numeric_to_label = {v: k for k, v in model.val_ds.class_to_idx.items()}
     for image_path, label in model.val_ds.imgs:
-        label_name = model.numeric_to_label_dict[label]
-        image_name = os.path.basename(image_path)
-        comet_logger.experiment.log_image(image_path, name=f"{label_name}_{image_name}")
+        label_name = numeric_to_label[label]
+        if label_name not in label_count:
+            label_count[label_name] = 0
+        if label_count[label_name] < 10:
+            image_name = os.path.basename(image_path)
+            comet_logger.experiment.log_image(image_path, name=f"{label_name}_{image_name}")
+            label_count[label_name] += 1

-    with comet_logger.experiment.context_manager("classification"):
-        model.trainer.fit(model)
+    #with comet_logger.experiment.context_manager("classification"):
+    model.trainer.fit(model)

     # Compute confusion matrix and upload to cometml
     image_dataset = []
@@ -112,6 +126,7 @@ def train(model, train_dir, val_dir, comet_logger=None, fast_dev_run=False, max_
 def preprocess_images(model, annotations, root_dir, save_dir):
     # Remove any annotations with empty boxes
     annotations = annotations[(annotations['xmin'] != 0) & (annotations['ymin'] != 0) & (annotations['xmax'] != 0) & (annotations['ymax'] != 0)]
+
     # Remove any negative values
     annotations = annotations[(annotations['xmin'] >= 0) & (annotations['ymin'] >= 0) & (annotations['xmax'] >= 0) & (annotations['ymax'] >= 0)]
     boxes = annotations[['xmin', 'ymin', 'xmax', 'ymax']].values.tolist()
@@ -131,9 +146,11 @@ def preprocess_and_train_classification(config, train_df=None, validation_df=Non
         trained_model: Trained model object
     """
     # Get and split annotations
-    if train_df is not None:
+    if train_df is None:
         annotations = gather_data(config.classification_model.train_csv_folder)
+    else:
+        annotations = train_df

     num_classes = len(annotations["label"].unique())

     # Remove the empty frames
@@ -155,10 +172,6 @@ def preprocess_and_train_classification(config, train_df=None, validation_df=Non
         num_classes=num_classes
     )

-    # Force the label dict, DeepForest will update this soon
-    loaded_model.label_dict = {v:k for k,v in enumerate(annotations["label"].unique())}
-    loaded_model.numeric_to_label_dict = {v:k for k,v in loaded_model.label_dict.items()}
-
     # Preprocess train and validation data
     preprocess_images(
         model=loaded_model,
@@ -179,7 +192,8 @@ def preprocess_and_train_classification(config, train_df=None, validation_df=Non
         model=loaded_model,
         fast_dev_run=config.classification_model.trainer.fast_dev_run,
         max_epochs=config.classification_model.trainer.max_epochs,
-        comet_logger=comet_logger
+        comet_logger=comet_logger,
+        workers=config.classification_model.trainer.workers
     )

     return trained_model
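The label_dict wiring in train() leans on torchvision's ImageFolder convention: class_to_idx and .imgs are ImageFolder attributes, and indices follow the sorted class-folder names. A small sketch of the mapping and its inverse (hypothetical folder layout):

    from torchvision.datasets import ImageFolder

    # One folder per label, e.g. crops/train/Brown Pelican/, crops/train/Common Tern/
    train_ds = ImageFolder("crops/train")

    label_dict = train_ds.class_to_idx                        # {'Brown Pelican': 0, 'Common Tern': 1}
    numeric_to_label = {v: k for k, v in label_dict.items()}  # {0: 'Brown Pelican', 1: 'Common Tern'}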
17 changes: 10 additions & 7 deletions src/detection.py
@@ -12,6 +12,7 @@
 import pandas as pd
 from deepforest import main, visualize
 from deepforest.utilities import read_file
+import torch

 # Local imports
 from src import data_processing
@@ -29,7 +30,8 @@ def evaluate(model, test_csv, image_root_dir):
         dict: A dictionary of evaluation metrics.
     """
     # create trainer
-    model.create_trainer()
+    devices = torch.cuda.device_count()
+    model.create_trainer(num_nodes=1, devices=devices)
     model.config["validation"]["csv_file"] = test_csv
     model.config["validation"]["root_dir"] = image_root_dir
     results = model.trainer.validate(model)
@@ -160,19 +162,20 @@ def train(model, train_annotations, test_annotations, train_image_dir, comet_log
     else:
         model.config[key] = value

+    devices = torch.cuda.device_count()
     if comet_logger:
         comet_logger.experiment.log_parameters(model.config)
         comet_logger.experiment.log_table("train.csv", train_annotations)
         comet_logger.experiment.log_table("test.csv", test_annotations)
-        model.create_trainer(logger=comet_logger)
+        model.create_trainer(logger=comet_logger, num_nodes=1, devices=devices)
     else:
-        model.create_trainer()
+        model.create_trainer(num_nodes=1, devices=devices)

     with comet_logger.experiment.context_manager("train_images"):
         non_empty_train_annotations = read_file(model.config["train"]["csv_file"], root_dir=train_image_dir)
         # Sanity check for debug
         n = 5 if non_empty_train_annotations.shape[0] > 5 else non_empty_train_annotations.shape[0]
-        for filename in non_empty_train_annotations.image_path.sample():
+        for filename in non_empty_train_annotations.image_path.sample(n=n).unique():
             sample_train_annotations_for_image = non_empty_train_annotations[non_empty_train_annotations.image_path == filename]
             sample_train_annotations_for_image.root_dir = train_image_dir
             visualize.plot_annotations(sample_train_annotations_for_image, savedir=tmpdir)
@@ -181,7 +184,7 @@ def train(model, train_annotations, test_annotations, train_image_dir, comet_log
with comet_logger.experiment.context_manager("test_images"):
non_empty_validation_annotations = read_file(model.config["validation"]["csv_file"], root_dir=train_image_dir)
n = 5 if non_empty_validation_annotations.shape[0] > 5 else non_empty_validation_annotations.shape[0]
for filename in non_empty_validation_annotations.image_path.head(5):
for filename in non_empty_validation_annotations.image_path.sample(n=n).unique():
sample_validation_annotations_for_image = non_empty_validation_annotations[non_empty_validation_annotations.image_path == filename]
sample_validation_annotations_for_image.root_dir = train_image_dir
visualize.plot_annotations(sample_validation_annotations_for_image, savedir=tmpdir)
@@ -191,7 +194,7 @@ def train(model, train_annotations, test_annotations, train_image_dir, comet_log
     model.trainer.fit(model)

     with comet_logger.experiment.context_manager("post-training prediction"):
-        for image_path in test_annotations.image_path.head(5):
+        for image_path in test_annotations.image_path.unique():
             prediction = model.predict_image(path = os.path.join(train_image_dir, image_path))
             if prediction is None:
                 continue
@@ -305,7 +308,7 @@ def _predict_list_(image_paths, patch_size, patch_overlap, model_path, m=None, c
     if m is None:
         raise ValueError("A model or model_path is required for prediction.")

-    m.create_trainer(fast_dev_run=False)
+    m.create_trainer(fast_dev_run=False, devices=1)
     m.config["batch_size"] = batch_size
     predictions = []
     for image_path in image_paths:
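The sampling fix in train() is easy to miss: pandas Series.sample() defaults to n=1, so the old loop logged a single training image rather than the intended five. A toy illustration of the difference:

    import pandas as pd

    image_path = pd.Series(["a.jpg", "a.jpg", "b.jpg", "c.jpg", "c.jpg", "d.jpg"])

    image_path.sample()              # a single random element
    image_path.sample(n=5).unique()  # up to 5 values, duplicate paths collapsed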
2 changes: 1 addition & 1 deletion src/label_studio.py
@@ -12,7 +12,7 @@ def upload_to_label_studio(images, sftp_client, url, project_name, images_to_ann
     Upload images to Label Studio and import image tasks.
     Args:
-        images (list): List of image paths to upload.
+        images (list): List of image paths to upload, full paths
         url (str): The URL of the Label Studio server.
         sftp_client (paramiko.SFTPClient): The SFTP client for uploading images.
         project_name (str): The name of the Label Studio project.
5 changes: 3 additions & 2 deletions src/pipeline.py
@@ -25,7 +25,7 @@ def __init__(self, cfg: DictConfig):
         self.all_images = glob.glob(os.path.join(self.config.active_learning.image_dir, "*.jpg"))

         self.comet_logger = CometLogger(project_name=self.config.comet.project, workspace=self.config.comet.workspace)
-
+        self.comet_logger.experiment.add_tag("pipeline")

     def save_model(self, model, directory):
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -194,7 +194,8 @@ def run(self):
             chosen_uncertain_images = uncertain_predictions.sort_values(by="score", ascending=False).head(self.config.human_review.n)["image_path"].tolist()
             chosen_preannotations = uncertain_predictions[uncertain_predictions.image_path.isin(chosen_uncertain_images)]
             chosen_preannotations = [group for _, group in chosen_preannotations.groupby("image_path")]
-            label_studio.upload_to_label_studio(images=chosen_uncertain_images,
+            full_image_paths = [os.path.join(self.config.active_learning.image_dir, image) for image in chosen_uncertain_images]
+            label_studio.upload_to_label_studio(images=full_image_paths,
                 sftp_client=self.sftp_client,
                 url=self.config.label_studio.url,
                 project_name=self.config.label_studio.instances.review.project_name,
12 changes: 7 additions & 5 deletions src/pipeline_evaluation.py
@@ -12,7 +12,7 @@
 import os

 class PipelineEvaluation:
-    def __init__(self, model, crop_model, image_dir, detect_ground_truth_dir, classify_ground_truth_dir, detection_true_positive_threshold=0.85, classification_avg_score=0.5, patch_size=450, patch_overlap=0, min_score=0.5, debug=False, batch_size=16, detection_results=None, comet_logger=None):
+    def __init__(self, model, crop_model, image_dir, detect_ground_truth_dir, classify_ground_truth_dir, comet_logger, detection_true_positive_threshold=0.85, classification_avg_score=0.5, patch_size=450, patch_overlap=0, min_score=0.5, debug=False, batch_size=16, detection_results=None):
         """Initialize pipeline evaluation.
Args:
@@ -22,6 +22,7 @@ def __init__(self, model, crop_model, image_dir, detect_ground_truth_dir, classi
             detect_ground_truth_dir (str): Directory containing detection ground truth annotation CSV files
             classify_ground_truth_dir (str): Directory containing confident classification ground truth annotation CSV files
             detection_true_positive_threshold (float): IoU threshold for considering a detection a true positive
+            comet_logger: CometLogger object for logging
             classification_threshold (float): Threshold for classification confidence score
             patch_size (int): Size of image patches for prediction
             patch_overlap (int): Overlap between patches
@@ -246,12 +247,13 @@ def evaluate_detection(self):
combined_predictions["workflow"] = "detection"
self.predictions.append(combined_predictions)

# replace None with 0
combined_predictions = combined_predictions.fillna(0)
combined_predictions["label"] = "Object"
# Remove empty predictions, needs to be confirmed for edge cases
combined_predictions = combined_predictions[~combined_predictions["score"].isna()]

combined_predictions = read_file(combined_predictions, self.image_dir)
ground_truth = read_file(self.detection_annotations, self.image_dir)
ground_truth = self.detection_annotations
if "geometry" not in ground_truth.columns:
ground_truth = read_file(ground_truth, self.image_dir)

iou_results = evaluate_boxes(
combined_predictions,
4 changes: 2 additions & 2 deletions submit.sh
@@ -4,7 +4,7 @@
#SBATCH [email protected] # Where to send mail
#SBATCH --account=ewhite
#SBATCH --nodes=1 # Number of MPI ran
#SBATCH --cpus-per-task=1
#SBATCH --cpus-per-task=10
#SBATCH --mem=150GB
#SBATCH --time=48:00:00 #Time limit hrs:min:sec
#SBATCH --output=/home/b.weinstein/logs/BOEM%j.out # Standard output and error log
@@ -15,4 +15,4 @@
 source activate BOEM

 cd ~/BOEM/
-python main.py check_annotations=True active_learning.pool_limit=10 active_testing.n_images=1 active_learning.n_images=1
+srun python main.py check_annotations=True active_learning.pool_limit=10000 active_testing.n_images=100 active_learning.n_images=200
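The ten CPUs requested above line up with the workers: 10 settings added to the configs. One way to keep the two coupled (an assumption, not part of this commit) is to read the allocation from SLURM's environment:

    import os

    # SLURM_CPUS_PER_TASK is set inside an sbatch/srun allocation when --cpus-per-task is given.
    workers = int(os.environ.get("SLURM_CPUS_PER_TASK", 0))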