refactor comet logger
bw4sz committed Feb 27, 2025
1 parent 3c7188f commit f194dcc
Showing 10 changed files with 347 additions and 268 deletions.
18 changes: 14 additions & 4 deletions conf/config.yaml
@@ -11,15 +11,22 @@ force_upload: False
force_training: True

label_studio:
project_name_train: "Bureau of Ocean Energy Management - Training"
project_name_validation: "Bureau of Ocean Energy Management - Validation"
url: "https://labelstudio.naturecast.org/"
folder_name: "/pgsql/retrieverdash/everglades-label-studio/everglades-data"
images_to_annotate_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27
annotated_images_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
csv_dir_train: /blue/ewhite/b.weinstein/BOEM/annotations/train
csv_dir_validation: /blue/ewhite/b.weinstein/BOEM/annotations/validation

instances:
train:
csv_dir: /blue/ewhite/b.weinstein/BOEM/annotations/train
project_name: "Bureau of Ocean Energy Management - Training"
validation:
csv_dir: /blue/ewhite/b.weinstein/BOEM/annotations/validation
project_name: "Bureau of Ocean Energy Management - Validation"
review:
csv_dir: /blue/ewhite/b.weinstein/BOEM/annotations/review
project_name: "Bureau of Ocean Energy Management - Review"
predict:
patch_size: 2000
patch_overlap: 0
@@ -35,7 +42,7 @@ propagate:
distance_threshold_pixels: 50

detection_model:
checkpoint: "/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/checkpoints/900710af16d9431aaef84bf74034a48c.pl"
checkpoint: "/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/checkpoints/d3a74b0852df4d3ba9a6503660fcc9f1.pl"
checkpoint_dir: /blue/ewhite/b.weinstein/BOEM/detection/checkpoints
train_csv_folder: /blue/ewhite/b.weinstein/BOEM/annotations/train/
train_image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
@@ -104,3 +111,6 @@ active_testing:
patch_size: 2000
patch_overlap: 0
min_score: 0.2

human_review:
n_images: 10
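
The `label_studio` block now groups per-split settings under `instances`, so train, validation, and the new review split each carry their own `csv_dir` and `project_name`. A minimal sketch of consuming that structure, assuming the YAML above is loaded with OmegaConf (the loop is illustrative, not part of this commit):

```python
from omegaconf import OmegaConf

config = OmegaConf.load("conf/config.yaml")
for name, instance in config.label_studio.instances.items():
    # name is "train", "validation", or "review"; each instance carries
    # its own Label Studio project and annotation directory
    print(name, instance.project_name, instance.csv_dir)
```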
140 changes: 125 additions & 15 deletions src/active_learning.py
@@ -5,7 +5,7 @@
import dask.array as da
import pandas as pd

def choose_train_images(evaluation, image_dir, strategy, n=10, patch_size=512, patch_overlap=0.1, min_score=0.1, model=None, model_path=None, classification_model=None, dask_client=None, target_labels=None, pool_limit=1000, batch_size=16):
def choose_train_images(evaluation, image_dir, strategy, n=10, patch_size=512, patch_overlap=0.1, min_score=0.1, model=None, model_path=None, classification_model=None, dask_client=None, target_labels=None, pool_limit=1000, batch_size=16, selected_test_images=[], comet_logger=None):
"""
Choose images to annotate.
Args:
@@ -26,8 +26,11 @@ def choose_train_images(evaluation, image_dir, strategy, n=10, patch_size=512, p
target_labels (list, optional): A list of target labels to filter images by. Defaults to None.
pool_limit (int, optional): The maximum number of images to consider. Defaults to 1000.
batch_size (int, optional): The batch size for prediction. Defaults to 16.
selected_test_images (list, optional): A list of test images that have already been selected. Defaults to [].
comet_logger (CometLogger, optional): A CometLogger object. Defaults to None.
Returns:
tuple: A tuple containing a list of chosen image paths and a DataFrame of preannotations.
list: A list of image paths.
pd.DataFrame: A DataFrame of preannotations.
"""
pool = glob.glob(os.path.join(image_dir,"*.jpg")) # Get all images in the data directory

@@ -44,6 +47,9 @@ def choose_train_images(evaluation, image_dir, strategy, n=10, patch_size=512, p
if len(pool) > pool_limit:
pool = random.sample(pool, pool_limit)

# Don't allow any test images that have just been selected to be chosen
pool = [x for x in pool if x not in selected_test_images]

if strategy=="random":
chosen_images = random.sample(pool, n)
return chosen_images
@@ -75,13 +81,12 @@ def update_sys_path():
preannotations = detection.predict(m=model, image_paths=pool, patch_size=patch_size, patch_overlap=patch_overlap, batch_size=batch_size)
preannotations = pd.concat(preannotations)

if comet_logger:
comet_logger.log_table("active_training_pool", preannotations)

# Print the number of preannotations before removing min score
print("There are {} preannotations before removing min score".format(preannotations.shape[0]))
print("There are {} images before removing min score".format(preannotations["image_path"].nunique()))
preannotations = preannotations[preannotations["score"] >= min_score]
print("There are {} preannotations after removing min score".format(preannotations.shape[0]))
print("There are {} images after removing min score".format(preannotations["image_path"].nunique()))


if strategy == "most-detections":
# Sort images by total number of predictions
chosen_images = preannotations.groupby("image_path").size().sort_values(ascending=False).head(n).index.tolist()
@@ -97,9 +102,12 @@ def update_sys_path():
else:
raise ValueError("Invalid strategy. Must be one of 'random', 'most-detections', or 'target-labels'.")

return chosen_images, preannotations
# Get preannotations for chosen images
chosen_preannotations = preannotations[preannotations["image_path"].isin(chosen_images)]

return chosen_images, chosen_preannotations
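
The new `selected_test_images` argument keeps just-chosen test images out of the training pool. One aside: `selected_test_images=[]` is a mutable default; it is harmless here because the list is only read, but a `None` default is the conventional pattern. A minimal, illustrative helper (not from this commit) showing both the exclusion and that pattern:

```python
def filter_pool(pool, selected_test_images=None):
    """Drop images already chosen for the test split."""
    if selected_test_images is None:  # avoids the mutable-default pitfall
        selected_test_images = []
    return [path for path in pool if path not in selected_test_images]

print(filter_pool(["a.jpg", "b.jpg"], ["b.jpg"]))  # ['a.jpg']
```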

def choose_test_images(image_dir, strategy, n=10, patch_size=512, patch_overlap=0, min_score=0.5, model=None, model_path=None, dask_client=None, target_labels=None, pool_limit=1000, batch_size=1):
def choose_test_images(image_dir, strategy, n=10, patch_size=512, patch_overlap=0, min_score=0.5, model=None, model_path=None, dask_client=None, target_labels=None, pool_limit=1000, batch_size=1, comet_logger=None):
"""Choose images to annotate.
Args:
@@ -118,8 +126,10 @@ def choose_test_images(image_dir, strategy, n=10, patch_size=512, patch_overlap=
target_labels (list, optional): A list of target labels to filter images by. Defaults to None.
pool_limit (int, optional): The maximum number of images to consider. Defaults to 1000.
batch_size (int, optional): The batch size for prediction. Defaults to 1.
comet_logger (CometLogger, optional): A CometLogger object. Defaults to None.
Returns:
list: A list of image paths.
pd.DataFrame: A DataFrame of preannotations.
"""
pool = glob.glob(os.path.join(image_dir,"*")) # Get all images in the data directory
# Remove .csv files from the pool
@@ -138,7 +148,7 @@ def choose_test_images(image_dir, strategy, n=10, patch_size=512, patch_overlap=
if strategy=="random":
chosen_images = random.sample(pool, n)
preannotations = None
return chosen_images
return chosen_images, None
elif strategy in ["most-detections","target-labels"]:
# Predict all images
if model_path is None:
@@ -167,6 +177,9 @@ def update_sys_path():
preannotations = detection.predict(model=model, image_paths=pool, patch_size=patch_size, patch_overlap=patch_overlap, batch_size=batch_size)
preannotations = pd.concat(preannotations)

if comet_logger:
comet_logger.log_table("active_testing_pool", preannotations)

print("There are {} preannotations before removing min score".format(preannotations.shape[0]))
preannotations = preannotations[preannotations["score"] >= min_score]

@@ -183,9 +196,11 @@ def update_sys_path():
else:
raise ValueError("Invalid strategy. Must be one of 'random', 'most-detections', or 'target-labels'.")

return chosen_images, preannotations
# Get preannotations for chosen images
chosen_preannotations = preannotations[preannotations["image_path"].isin(chosen_images)]
return chosen_images, chosen_preannotations
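
Together the two functions now support a test-first ordering: choose test images, then pass them to `choose_train_images` so no image lands in both splits. A hypothetical usage sketch, assuming the module lives at `src/active_learning.py` and that a trained model and Comet logger already exist (all argument values are placeholders):

```python
from src.active_learning import choose_test_images, choose_train_images

test_images, test_preannotations = choose_test_images(
    image_dir="/path/to/images",
    strategy="most-detections",
    n=10,
    model=trained_model,                # assumed trained detection model
    comet_logger=comet_logger,          # optional; logs the prediction pool table
)
train_images, train_preannotations = choose_train_images(
    evaluation=None,                    # placeholder; evaluation metrics dict
    image_dir="/path/to/images",
    strategy="target-labels",
    target_labels=["Bird"],             # hypothetical label
    selected_test_images=test_images,   # new in this commit
    model=trained_model,
    comet_logger=comet_logger,
)
```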

def predict_and_divide(detection_model, classification_model, image_paths, patch_size, patch_overlap, confident_threshold, min_score, batch_size):
def human_review(detection_model, classification_model, image_paths, patch_size, patch_overlap, confident_threshold, min_score, batch_size):
"""
Predict on images and divide into confident and uncertain predictions.
Args:
@@ -201,7 +216,6 @@ def predict_and_divide(detection_model, classification_model, image_paths, patch
Returns:
tuple: A tuple of confident and uncertain predictions.
"""

# Check for existing predictions
if existing_predictions is not None:
image_basenames = [os.path.basename(image_path) for image_path in image_paths]
@@ -226,9 +240,105 @@ def predict_and_divide(detection_model, classification_model, image_paths, patch
# Split predictions into confident and uncertain
uncertain_predictions = combined_predictions[
combined_predictions["score"] <= confident_threshold]

confident_predictions = combined_predictions[
~combined_predictions["image_path"].isin(
uncertain_predictions["image_path"])]

return confident_predictions, uncertain_predictions
return confident_predictions, uncertain_predictions
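
The split operates at image granularity: one box at or below `confident_threshold` routes its entire image to review. A self-contained toy illustration of that rule:

```python
import pandas as pd

predictions = pd.DataFrame({
    "image_path": ["a.jpg", "a.jpg", "b.jpg"],
    "score":      [0.95,    0.40,    0.90],
})
confident_threshold = 0.5

uncertain = predictions[predictions["score"] <= confident_threshold]
confident = predictions[~predictions["image_path"].isin(uncertain["image_path"])]
# "a.jpg" has one box at 0.40, so both of its boxes go to human review;
# "b.jpg" is fully confident.
```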

def generate_training_pool_predictions(image_dir, patch_size=512, patch_overlap=0.1, min_score=0.1, model=None, model_path=None, dask_client=None, batch_size=16, comet_logger=None):
"""
Generate predictions for the training pool.
Args:
image_dir (str): The path to a directory of images.
patch_size (int, optional): The size of the image patches to predict on. Defaults to 512.
patch_overlap (float, optional): The amount of overlap between image patches. Defaults to 0.1.
min_score (float, optional): The minimum score for a prediction to be included. Defaults to 0.1.
model (main.deepforest, optional): A trained deepforest model. Defaults to None.
model_path (str, optional): The path to the model checkpoint file. Defaults to None. Only used in combination with dask.
dask_client (dask.distributed.Client, optional): A Dask client for parallel processing. Defaults to None.
batch_size (int, optional): The batch size for prediction. Defaults to 16.
comet_logger (CometLogger, optional): A CometLogger object. Defaults to None.
Returns:
pd.DataFrame: A DataFrame of predictions.
"""
pool = glob.glob(os.path.join(image_dir, "*.jpg")) # Get all images in the data directory

# Remove .csv files from the pool
pool = [image for image in pool if not image.endswith('.csv')]

# Remove crop dir
try:
pool.remove(os.path.join(image_dir, "crops"))
except ValueError:
pass

if dask_client:
# load model on each client
def update_sys_path():
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
dask_client.run(update_sys_path)

# Load model on each client
dask_pool = da.from_array(pool, chunks=len(pool) // len(dask_client.ncores()))
blocks = dask_pool.to_delayed().ravel()
block_futures = []
for block in blocks:
block_future = dask_client.submit(detection.predict, image_paths=block.compute(), patch_size=patch_size, patch_overlap=patch_overlap, model_path=model_path)
block_futures.append(block_future)
# Get results
dask_results = []
for block_result in block_futures:
block_result = block_result.result()
dask_results.append(pd.concat(block_result))
preannotations = pd.concat(dask_results)
else:
preannotations = detection.predict(m=model, image_paths=pool, patch_size=patch_size, patch_overlap=patch_overlap, batch_size=batch_size)
preannotations = pd.concat(preannotations)

if comet_logger:
comet_logger.log_table("active_training_pool", preannotations)

# Remove predictions below the minimum score
preannotations = preannotations[preannotations["score"] >= min_score]

return preannotations
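
The dask branch above splits the pool into one chunk per worker core, submits `detection.predict` per chunk, and concatenates the results. A runnable toy version of that scatter-and-gather pattern, with a stand-in for `detection.predict`:

```python
from dask.distributed import Client, LocalCluster

def predict_block(image_paths):
    """Stand-in for detection.predict: one fake prediction per image."""
    return [{"image_path": p, "score": 0.9} for p in image_paths]

if __name__ == "__main__":
    client = Client(LocalCluster(n_workers=2, threads_per_worker=1))
    pool = [f"img_{i}.jpg" for i in range(8)]
    n_chunks = len(client.ncores())          # one chunk per worker core
    chunks = [pool[i::n_chunks] for i in range(n_chunks)]
    futures = [client.submit(predict_block, chunk) for chunk in chunks]
    results = [future.result() for future in futures]
    client.close()
```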

def select_train_images(preannotations, strategy, n=10, target_labels=None):
"""
Select images to annotate based on the strategy.
Args:
preannotations (pd.DataFrame): A DataFrame of predictions.
strategy (str): The strategy for choosing images. Available strategies are:
- "random": Choose images randomly from the pool.
- "most-detections": Choose images with the most detections based on predictions.
- "target-labels": Choose images with target labels.
n (int, optional): The number of images to choose. Defaults to 10.
target_labels (list, optional): A list of target labels to filter images by. Defaults to None.
Returns:
list: A list of image paths.
pd.DataFrame: A DataFrame of preannotations for the chosen images.
"""
if strategy == "random":
chosen_images = random.sample(preannotations["image_path"].unique().tolist(), n)
elif strategy == "most-detections":
# Sort images by total number of predictions
chosen_images = preannotations.groupby("image_path").size().sort_values(ascending=False).head(n).index.tolist()
elif strategy == "target-labels":
if target_labels is None:
raise ValueError("Target labels are required for the 'target-labels' strategy.")
# Filter images by target labels
chosen_images = preannotations[preannotations.label.isin(target_labels)].groupby("image_path")["score"].mean().sort_values(ascending=False).head(n).index.tolist()
else:
raise ValueError("Invalid strategy. Must be one of 'random', 'most-detections', or 'target-labels'.")

# Get preannotations for chosen images
chosen_preannotations = preannotations[preannotations["image_path"].isin(chosen_images)]

return chosen_images, chosen_preannotations
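
This pair replaces the old single-call path: `generate_training_pool_predictions` runs inference over the pool once, and `select_train_images` is then a cheap dataframe operation, so multiple strategies can be compared without re-predicting. A hedged usage sketch (placeholder values, assuming a trained model and logger exist):

```python
preannotations = generate_training_pool_predictions(
    image_dir="/path/to/images",
    patch_size=2000,
    min_score=0.2,
    model=detection_model,        # assumed trained detection model
    comet_logger=comet_logger,    # optional
)
# Inference ran once; each selection below is just a dataframe operation.
random_images, _ = select_train_images(preannotations, strategy="random", n=10)
busy_images, _ = select_train_images(preannotations, strategy="most-detections", n=10)
```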
18 changes: 8 additions & 10 deletions src/classification.py
@@ -60,30 +60,26 @@ def load(checkpoint=None, annotations=None, checkpoint_dir=None, lr=0.0001, num_

return loaded_model

def train(model, train_dir, val_dir, comet_workspace=None, comet_project=None, fast_dev_run=False, max_epochs=10):
def train(model, train_dir, val_dir, comet_logger=None, fast_dev_run=False, max_epochs=10):
"""Train a model on labeled images.
Args:
model (CropModel): A CropModel object.
train_dir (str): The directory containing the training images.
val_dir (str): The directory containing the validation images.
fast_dev_run (bool): Whether to run a fast development run.
max_epochs (int): The maximum number of epochs to train for.
comet_logger (CometLogger): A CometLogger object.
Returns:
main.deepforest: A trained deepforest model.
"""

if comet_project:
comet_logger = CometLogger(project_name=comet_project, workspace=comet_workspace)
comet_logger.experiment.add_tags(["classification"])
else:
comet_logger = None

model.create_trainer(logger=comet_logger, fast_dev_run=fast_dev_run, max_epochs=max_epochs)

# Get the data stored from the write_crops step above.
model.load_from_disk(train_dir=train_dir, val_dir=val_dir)
model.trainer.fit(model)

with comet_logger.context_manager("classification"):
model.trainer.fit(model)

return model
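
`train` no longer builds a CometLogger from workspace/project names; the caller constructs one and passes it in, and the fit call runs inside a "classification" Comet context (the new body assumes a logger is supplied). A sketch of wiring that up, assuming pytorch_lightning's `CometLogger` with placeholder workspace and project values:

```python
from pytorch_lightning.loggers import CometLogger

comet_logger = CometLogger(
    workspace="my-workspace",           # placeholder
    project_name="boem-classification", # placeholder
)
comet_logger.experiment.add_tags(["classification"])  # mirrors the old in-function tagging

model = train(
    model=crop_model,                   # assumed CropModel instance
    train_dir="/path/to/crops/train",
    val_dir="/path/to/crops/val",
    comet_logger=comet_logger,
    max_epochs=10,
)
```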

@@ -97,12 +93,13 @@ def preprocess_and_train_classification(config, validation_df=None):
labels = annotations["label"].values
model.write_crops(boxes=boxes, root_dir=root_dir, images=images, labels=labels, savedir=save_dir)

def preprocess_and_train_classification(config, validation_df=None):
def preprocess_and_train_classification(config, validation_df=None, comet_logger=None):
"""Preprocess data and train a crop model.
Args:
config: Configuration object containing training parameters
validation_df (pd.DataFrame): A DataFrame containing validation annotations.
comet_logger: CometLogger object for logging experiments
Returns:
trained_model: Trained model object
"""
@@ -153,6 +150,7 @@ def preprocess_and_train_classification(config, validation_df=None):
comet_project=config.comet.project,
fast_dev_run=config.classification_model.trainer.fast_dev_run,
max_epochs=config.classification_model.trainer.max_epochs,
comet_logger=comet_logger
)

return trained_model