Commit

use comet for dashboard

bw4sz committed Mar 1, 2025
1 parent f194dcc commit f36706e
Showing 14 changed files with 208 additions and 338 deletions.
27 changes: 27 additions & 0 deletions USGS_classification.py
@@ -0,0 +1,27 @@
from deepforest import model
import pandas as pd
import os
import comet_ml
from pytorch_lightning.loggers import CometLogger
from src.classification import preprocess_and_train_classification
import hydra
from omegaconf import DictConfig

@hydra.main(config_path="conf/classification_model", config_name="USGS")
def main(cfg: DictConfig):
    classification_cfg = cfg.classification
    savedir = classification_cfg.savedir
    train = pd.read_csv(os.path.join(savedir, "train.csv"))
    test = pd.read_csv(os.path.join(savedir, "test.csv"))

    comet_logger = CometLogger(project_name=classification_cfg.project_name, workspace=classification_cfg.workspace)
    preprocess_and_train_classification(
        config=cfg,
        train_df=train,
        validation_df=test,
        comet_logger=comet_logger
    )

if __name__ == "__main__":
    main()
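As a quick smoke test, the composed config can be resolved outside a training run with Hydra's compose API. This is a sketch, assuming Hydra >= 1.2 and execution from the repository root; note the script reads `cfg.classification`, a subsection assumed to be supplied by the composed config rather than the YAML shown below.

# Minimal sketch: resolve conf/classification_model/USGS.yaml the same way
# the @hydra.main decorator above does, without starting training.
from hydra import compose, initialize

with initialize(version_base=None, config_path="conf/classification_model"):
    cfg = compose(config_name="USGS")
    print(cfg.classification_model.trainer.max_epochs)  # 1 in this commit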

11 changes: 11 additions & 0 deletions conf/classification_model/USGS.yaml
@@ -0,0 +1,11 @@
classification_model:
  checkpoint:
  checkpoint_dir: /blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/classification/checkpoints/
  train_csv_folder:
  train_image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
  crop_image_dir: /blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/classification/
  under_sample_ratio: 0
  trainer:
    fast_dev_run: True
    max_epochs: 1
    lr: 0.00001
11 changes: 11 additions & 0 deletions conf/classification_model/finetune.yaml
@@ -0,0 +1,11 @@
classification_model:
  checkpoint:
  checkpoint_dir: /blue/ewhite/b.weinstein/BOEM/classification/checkpoints
  train_csv_folder: /blue/ewhite/b.weinstein/BOEM/annotations/train
  train_image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
  crop_image_dir: /blue/ewhite/b.weinstein/BOEM/classification/crops/
  under_sample_ratio: 0
  trainer:
    fast_dev_run: True
    max_epochs: 1
    lr: 0.00001
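Together with `USGS.yaml`, this file forms a Hydra config group; the `defaults` entry added to `conf/config.yaml` below selects `finetune.yaml` unless overridden. A sketch of selecting the other group member programmatically, equivalent to passing `classification_model=USGS` on the command line:

# Sketch: compose the top-level config but swap the classification_model group.
from hydra import compose, initialize

with initialize(version_base=None, config_path="conf"):
    cfg = compose(config_name="config", overrides=["classification_model=USGS"])
    print(cfg.classification_model.crop_image_dir)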
31 changes: 8 additions & 23 deletions conf/config.yaml
@@ -1,6 +1,9 @@
defaults:
  - server: serenity

  - classification_model: finetune.yaml

debug: False

comet:
  project: BOEM
  workspace: bw4sz
@@ -28,7 +31,7 @@ label_studio:
  csv_dir: /blue/ewhite/b.weinstein/BOEM/annotations/review
  project_name: "Bureau of Ocean Energy Management - Review"
predict:
  patch_size: 2000
  patch_size: 1000
  patch_overlap: 0
  min_score: 0.4
  batch_size: 48
@@ -60,37 +63,19 @@ detection_model:
  validation:
    val_accuracy_interval: 3

classification_model:
  checkpoint:
  checkpoint_dir: /blue/ewhite/b.weinstein/BOEM/classification/checkpoints
  train_csv_folder: /blue/ewhite/b.weinstein/BOEM/annotations/train
  train_image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
  crop_image_dir: /blue/ewhite/b.weinstein/BOEM/classification/crops/
  under_sample_ratio: 0
  trainer:
    fast_dev_run: True
    max_epochs: 1
    lr: 0.00001

pipeline_evaluation:
  detect_ground_truth_dir: /blue/ewhite/b.weinstein/BOEM/annotations/validation
  classify_ground_truth_dir: /blue/ewhite/b.weinstein/BOEM/annotations/validation
  # This is an average mAP threshold for now, but we may want to add a per-iou threshold in the future
  detection_true_positive_threshold: 0.8
  classification_avg_score: 0.5
  image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
  debug: False

reporting:
  report_dir: /orange/ewhite/web/public/BOEM
  metadata: /blue/ewhite/b.weinstein/BOEM/reporting/metadata/metadata.csv
  thin_factor: 100

active_learning:
  image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27
  strategy: 'target-labels'
  n_images: 50
  patch_size: 2000
  patch_size: 1000
  patch_overlap: 0
  min_score: 0.1
  model_checkpoint:
@@ -108,9 +93,9 @@ active_testing:
  strategy: 'random'
  n_images: 1000
  m:
  patch_size: 2000
  patch_size: 1000
  patch_overlap: 0
  min_score: 0.2

human_review:
  n_images: 10
  n: 10
51 changes: 16 additions & 35 deletions src/active_learning.py
@@ -200,54 +200,30 @@ def update_sys_path():
    chosen_preannotations = preannotations[preannotations["image_path"].isin(chosen_images)]
    return chosen_images, chosen_preannotations

def human_review(detection_model, classification_model, image_paths, patch_size, patch_overlap, confident_threshold, min_score, batch_size):
def human_review(predictions, min_score=0.1, confident_threshold=0.5):
"""
Predict on images and divide into confident and uncertain predictions.
Args:
detection_model (deepforest.deepforest): A trained detection model.
classification_model (deepforest.deepforest): A trained classification model.
image_paths (list): A list of image paths.
patch_size (int): The size of the image patches to predict on.
patch_overlap (float): The amount of overlap between image patches.
confident_threshold (float): The threshold for confident predictions.
min_score (float): The minimum score for a prediction to be included.
batch_size (int): The batch size for prediction.
existing_predictions (pd.DataFrame, optional): A DataFrame of existing predictions. Defaults to None.
min_score (float, optional): The minimum score for a prediction to be included. Defaults to 0.1.
predictions (pd.DataFrame, optional): A DataFrame of existing predictions. Defaults to None.
Returns:
tuple: A tuple of confident and uncertain predictions.
"""
    # Check for existing predictions
    if existing_predictions is not None:
        image_basenames = [os.path.basename(image_path) for image_path in image_paths]
        existing_predictions = existing_predictions[existing_predictions["image_path"].isin(image_basenames)]
        image_paths = [image_path for image_path in image_paths if os.path.basename(image_path) not in existing_predictions["image_path"].unique()]
    if len(image_paths) > 0:
        predictions = detection.predict(
            m=detection_model,
            crop_model=classification_model,
            image_paths=image_paths,
            patch_size=patch_size,
            patch_overlap=patch_overlap,
            batch_size=batch_size
        )
        predictions = pd.concat(predictions)
        combined_predictions = pd.concat([predictions, existing_predictions])
    else:
        combined_predictions = existing_predictions

    combined_predictions[combined_predictions["score"] > min_score]

    predictions = predictions[predictions["score"] > min_score]

    # Split predictions into confident and uncertain
    uncertain_predictions = combined_predictions[
        combined_predictions["score"] <= confident_threshold]
    uncertain_predictions = predictions[
        predictions["score"] <= confident_threshold]

    confident_predictions = combined_predictions[
        ~combined_predictions["image_path"].isin(
    confident_predictions = predictions[
        ~predictions["image_path"].isin(
            uncertain_predictions["image_path"])]

    return confident_predictions, uncertain_predictions
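Since the refactored `human_review` is now pure DataFrame bookkeeping, it can be exercised without any models. A sketch; column names follow the function body above and the scores are invented:

from src.active_learning import human_review
import pandas as pd

# Two boxes on a.jpg, one on b.jpg (illustrative scores only).
predictions = pd.DataFrame({
    "image_path": ["a.jpg", "a.jpg", "b.jpg"],
    "score": [0.9, 0.3, 0.8],
})
confident, uncertain = human_review(predictions, min_score=0.1, confident_threshold=0.5)
# a.jpg has a box scored 0.3 <= 0.5, so all of its rows are routed to review;
# b.jpg has no box at or below the threshold and stays confident.
assert set(confident["image_path"]) == {"b.jpg"}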

def generate_training_pool_predictions(image_dir, patch_size=512, patch_overlap=0.1, min_score=0.1, model=None, model_path=None, dask_client=None, batch_size=16, comet_logger=None):
def generate_training_pool_predictions(image_dir, patch_size=512, patch_overlap=0.1, min_score=0.1, model=None, model_path=None, dask_client=None, batch_size=16, comet_logger=None, pool_limit=1000):
"""
Generate predictions for the training pool.
@@ -261,6 +237,7 @@ def generate_training_pool_predictions(image_dir, patch_size=512, patch_overlap=
        dask_client (dask.distributed.Client, optional): A Dask client for parallel processing. Defaults to None.
        batch_size (int, optional): The batch size for prediction. Defaults to 16.
        comet_logger (CometLogger, optional): A CometLogger object. Defaults to None.
        pool_limit (int, optional): The maximum number of images to consider. Defaults to 1000.
    Returns:
        pd.DataFrame: A DataFrame of predictions.
@@ -270,6 +247,10 @@ def generate_training_pool_predictions(image_dir, patch_size=512, patch_overlap=
    # Remove .csv files from the pool
    pool = [image for image in pool if not image.endswith('.csv')]

    # Subsample the pool if it exceeds pool_limit
    if len(pool) > pool_limit:
        pool = random.sample(pool, pool_limit)

    # Remove crop dir
    try:
        pool.remove(os.path.join(image_dir, "crops"))
@@ -301,7 +282,7 @@ def update_sys_path():
    preannotations = pd.concat(preannotations)

    if comet_logger:
        comet_logger.log_table("active_training_pool", preannotations)
        comet_logger.experiment.log_table("active_training_pool", preannotations)

    # Print the number of preannotations before removing min score
    preannotations = preannotations[preannotations["score"] >= min_score]
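The `comet_logger.experiment.log_table` fix above reflects that table logging is a method of the wrapped `comet_ml` experiment rather than of PyTorch Lightning's `CometLogger` itself. A sketch of the pattern, using this repository's project and workspace from `conf/config.yaml`:

import pandas as pd
from pytorch_lightning.loggers import CometLogger

# Assumes COMET_API_KEY is set in the environment.
comet_logger = CometLogger(project_name="BOEM", workspace="bw4sz")
table = pd.DataFrame({"image_path": ["a.jpg"], "score": [0.9]})
# CometLogger.experiment exposes the underlying comet_ml Experiment;
# a .csv extension on the filename follows comet_ml convention.
comet_logger.experiment.log_table("active_training_pool.csv", table)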
48 changes: 37 additions & 11 deletions src/classification.py
@@ -1,14 +1,12 @@
# Standard library imports
import os
import glob
import warnings
# Third party imports
import pandas as pd
from PIL import Image

from deepforest.model import CropModel

# Local imports
from src.label_studio import gather_data
from pytorch_lightning.loggers import CometLogger

def create_train_test(annotations):
    return annotations.sample(frac=0.8, random_state=1), annotations.drop(
@@ -75,12 +73,38 @@ def train(model, train_dir, val_dir, comet_logger=None, fast_dev_run=False, max_
"""
model.create_trainer(logger=comet_logger, fast_dev_run=fast_dev_run, max_epochs=max_epochs)

# Get the data stored from the write_crops step above.
# Get the data stored from the write_crops processing.
model.load_from_disk(train_dir=train_dir, val_dir=val_dir)

with comet_logger.context_manager("classification"):
# Log the validation dataset images
for image_path, label in model.val_ds.imgs:
label_name = model.numeric_to_label_dict[label]
image_name = os.path.basename(image_path)
comet_logger.experiment.log_image(image_path, name=f"{label_name}_{image_name}")

with comet_logger.experiment.context_manager("classification"):
model.trainer.fit(model)

# Compute confusion matrix and upload to cometml
image_dataset = []
y_true = []
y_predicted = []
for index, (image,label) in enumerate(model.val_ds):
image_path, label = model.val_ds.imgs[index]
original_image = Image.open(image_path)
image_dataset += [original_image]
y_true += [label]
y_predicted += [model(image.unsqueeze(0)).argmax().item()]
labels = model.val_ds.classes

# Log the confusion matrix to Comet
comet_logger.experiment.log_confusion_matrix(
y_true=y_true,
y_predicted=y_predicted,
images=image_dataset,
labels=labels,
)

return model
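One caveat on the confusion-matrix loop above: each `model(image.unsqueeze(0))` call runs with autograd enabled and with the model in whatever mode training left it. A minimal sketch of the same scoring pass under eval mode, as an aside rather than part of the commit (`model` is the trained CropModel from the surrounding function):

import torch

model.eval()  # freeze dropout/batch-norm behavior for scoring
with torch.no_grad():  # no autograd bookkeeping needed for validation-only passes
    y_predicted = [model(image.unsqueeze(0)).argmax().item()
                   for image, _ in model.val_ds]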

def preprocess_images(model, annotations, root_dir, save_dir):
@@ -93,18 +117,21 @@ def preprocess_images(model, annotations, root_dir, save_dir):
    labels = annotations["label"].values
    model.write_crops(boxes=boxes, root_dir=root_dir, images=images, labels=labels, savedir=save_dir)

def preprocess_and_train_classification(config, validation_df=None, comet_logger=None):
def preprocess_and_train_classification(config, train_df=None, validation_df=None, comet_logger=None):
    """Preprocess data and train a crop model.
    Args:
        config: Configuration object containing training parameters
        train_df (pd.DataFrame, optional): A DataFrame of training annotations; gathered from train_csv_folder when None.
        validation_df (pd.DataFrame): A DataFrame containing validation annotations.
        comet_logger: CometLogger object for logging experiments
    Returns:
        trained_model: Trained model object
    """
    # Get and split annotations
    annotations = gather_data(config.classification_model.train_csv_folder)
    if train_df is None:
        annotations = gather_data(config.classification_model.train_csv_folder)
    else:
        annotations = train_df

    num_classes = len(annotations["label"].unique())

    # Remove the empty frames
@@ -128,6 +155,7 @@ def preprocess_and_train_classification(config, validation_df=None, comet_logger

    # Force the label dict, DeepForest will update this soon
    loaded_model.label_dict = {v: k for k, v in enumerate(annotations["label"].unique())}
    loaded_model.numeric_to_label_dict = {v: k for k, v in loaded_model.label_dict.items()}

    # Preprocess train and validation data
    preprocess_images(
@@ -146,11 +174,9 @@ def preprocess_and_train_classification(config, validation_df=None, comet_logger
        train_dir=config.classification_model.crop_image_dir,
        val_dir=config.classification_model.crop_image_dir,
        model=loaded_model,
        comet_workspace=config.comet.workspace,
        comet_project=config.comet.project,
        fast_dev_run=config.classification_model.trainer.fast_dev_run,
        max_epochs=config.classification_model.trainer.max_epochs,
        comet_logger=comet_logger
    )

    return trained_model
1 change: 0 additions & 1 deletion src/detection.py
@@ -12,7 +12,6 @@
import pandas as pd
from deepforest import main, visualize
from deepforest.utilities import read_file
from pytorch_lightning.loggers import CometLogger

# Local imports
from src import data_processing
15 changes: 9 additions & 6 deletions src/label_studio.py
@@ -7,21 +7,23 @@
import shutil
from PIL import Image

def upload_to_label_studio(images, sftp_client, label_studio_project, images_to_annotate_dir, folder_name, preannotations):
def upload_to_label_studio(images, sftp_client, url, project_name, images_to_annotate_dir, folder_name, preannotations):
"""
Upload images to Label Studio and import image tasks.
Args:
images (list): List of image paths to upload.
url (str): The URL of the Label Studio server.
sftp_client (paramiko.SFTPClient): The SFTP client for uploading images.
label_studio_project (label_studio_sdk.Project): The Label Studio project instance.
project_name (str): The name of the Label Studio project.
images_to_annotate_dir (str): The path to the directory of images to annotate.
folder_name (str): The name of the folder to upload images to.
preannotations (list): List of preannotations for the images.
Returns:
None
"""
label_studio_project = connect_to_label_studio(url=url, project_name=project_name)
upload_images(sftp_client=sftp_client, images=images, folder_name=folder_name)
import_image_tasks(label_studio_project=label_studio_project, image_names=images, local_image_dir=images_to_annotate_dir, predictions=preannotations)

@@ -165,6 +167,9 @@ def gather_data(annotation_dir):
    df = []
    for x in csvs:
        df.append(pd.read_csv(x))

    if len(df) == 0:
        return None
    df = pd.concat(df)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
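Since `gather_data` can now return `None` when the directory holds no CSVs, callers that previously assumed a DataFrame need a guard. A sketch with a hypothetical path:

from src.label_studio import gather_data

annotations = gather_data("/path/with/no/csvs")  # hypothetical directory
if annotations is None:
    # No annotation CSVs found; skip downstream steps rather than crash on None.
    print("no annotation CSVs found; nothing to train on")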
@@ -201,7 +206,6 @@ def connect_to_label_studio(url, project_name, label_config=None):

if len(project) == 0:
# Create a project with the specified title and labeling configuration

project = ls.create_project(
title=project_name,
label_config=label_config
@@ -243,8 +247,6 @@ def import_image_tasks(label_studio_project, image_names, local_image_dir, predi
    Returns:
        None
    """
    import os

    tasks = []
    for index, image_name in enumerate(image_names):
        print(f"Importing {image_name} into Label Studio")
@@ -260,7 +262,8 @@ def import_image_tasks(label_studio_project, image_names, local_image_dir, predi
        else:
            upload_dict = {"data": data_dict}
        tasks.append(upload_dict)
    label_studio_project.import_tasks(tasks)
    if len(tasks) > 0:
        label_studio_project.import_tasks(tasks)

def download_completed_tasks(label_studio_project, csv_dir):
    labeled_tasks = label_studio_project.get_labeled_tasks()