folders

weecology · Dec 9, 2024 · a3019bf · a3019bf
1 parent c7de761
commit a3019bf
Show file tree

Hide file tree

Showing 8 changed files with 195 additions and 132 deletions.
diff --git a/conf/config.yaml b/conf/config.yaml
@@ -9,12 +9,14 @@ check_annotations: true
 # Force upload bypasses the pipeline, useful for debugging and starting a new project
 force_upload: true
 label_studio:
-  project_name: "Bureau of Ocean Energy Management"
+  project_name_train: "Bureau of Ocean Energy Management - Training"
+  project_name_validation: "Bureau of Ocean Energy Management - Validation"
   url: "https://labelstudio.naturecast.org/"
   folder_name: "/pgsql/retrieverdash/everglades-label-studio/everglades-data"
   images_to_annotate_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27
   annotated_images_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
-  csv_dir: /blue/ewhite/b.weinstein/BOEM/annotations
+  csv_dir_train: /blue/ewhite/b.weinstein/BOEM/annotations/train
+  csv_dir_validation: /blue/ewhite/b.weinstein/BOEM/annotations/validation
 
 predict:
   patch_size: 2000
@@ -23,7 +25,7 @@ predict:
 
 pipeline:
   confidence_threshold: 0.5
-  limit_empty_frac: 0.1
+  limit_empty_frac: 0.01
 
 propagate:
   time_threshold_seconds: 5
@@ -33,10 +35,10 @@ detection_model:
   checkpoint: bird
   checkpoint_dir: /blue/ewhite/b.weinstein/BOEM/detection/checkpoints
   validation_csv_path:
-  train_csv_folder: /blue/ewhite/b.weinstein/BOEM/annotations/
+  train_csv_folder: /blue/ewhite/b.weinstein/BOEM/annotations/train/
   train_image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
   crop_image_dir: /blue/ewhite/b.weinstein/BOEM/detection/crops/
-  limit_empty_frac: 0
+  limit_empty_frac: 0.05
   fast_dev_run: false
   labels:
     - "Bird"
@@ -45,31 +47,33 @@ classification_model:
   checkpoint: 
   checkpoint_dir: /blue/ewhite/b.weinstein/BOEM/classification/checkpoints
   validation_csv_path: 
-  train_csv_folder: /blue/ewhite/b.weinstein/BOEM/annotations/
+  train_csv_folder: /blue/ewhite/b.weinstein/BOEM/annotations/train
   train_image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
   crop_image_dir: /blue/ewhite/b.weinstein/BOEM/classification/crops/
   under_sample_ratio: 0
-  fast_dev_run: True
+  trainer:
+    fast_dev_run: false
+    max_epochs: 1
+    lr: 0.001
 
 pipeline_evaluation:
-  detect_ground_truth_dir: 
-  classify_confident_ground_truth_dir:
-  classify_uncertain_ground_truth_dir:
+  detect_ground_truth_dir: /blue/ewhite/b.weinstein/BOEM/annotations/validation
+  classify_confident_ground_truth_dir: /blue/ewhite/b.weinstein/BOEM/annotations/validation
+  classify_uncertain_ground_truth_dir: /blue/ewhite/b.weinstein/BOEM/annotations/validation
   # This is an average mAP threshold for now, but we may want to add a per-IoU threshold in the future
   detection_true_positive_threshold: 0.8
   detection_false_positive_threshold: 0.5
   classification_avg_score: 0.5
-  image_dir:
+  image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
 
 reporting:
-  image_dir: /blue/ewhite/b.weinstein/BOEM/reporting/samples
   report_dir: /blue/ewhite/b.weinstein/BOEM/reporting/reports
   metadata: /blue/ewhite/b.weinstein/BOEM/reporting/metadata.csv
 
 active_learning:
   image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27
   strategy: 'target-labels'
-  n_images: 1
+  n_images: 50
   patch_size: 2000
   patch_overlap: 0
   min_score: 0.3
@@ -80,7 +84,7 @@ active_learning:
   # Optional parameters:
   evaluation: 
   dask_client: 
-  pool_limit: 10
+  pool_limit: 5000
   gpus: 1
 
 active_testing:
@@ -94,6 +98,7 @@ active_testing:
 
 deepforest:
   train:
-    fast_dev_run: True
-    epochs: 1
+    fast_dev_run: False
+    epochs: 10
+    lr: 0.00001
   workers: 0
diff --git a/src/classification.py b/src/classification.py
@@ -10,12 +10,11 @@
 from src.label_studio import gather_data
 from pytorch_lightning.loggers import CometLogger
 
-
 def create_train_test(annotations):
     return annotations.sample(frac=0.8, random_state=1), annotations.drop(
         annotations.sample(frac=0.8, random_state=1).index)
 
-def get_latest_checkpoint(checkpoint_dir, annotations):
+def get_latest_checkpoint(checkpoint_dir, annotations, lr=0.0001):
     # Get the model from the latest checkpoint in checkpoint_dir; if none exists, make a new model
     if os.path.exists(checkpoint_dir):
         checkpoints = glob.glob(os.path.join(checkpoint_dir,"*.ckpt"))
@@ -25,16 +24,16 @@ def get_latest_checkpoint(checkpoint_dir, annotations):
             m = CropModel.load_from_checkpoint(checkpoint)
         else:
             warnings.warn("No checkpoints found in {}".format(checkpoint_dir))
-            m = CropModel(num_classes=len(annotations["label"].unique()))
+            m = CropModel(num_classes=len(annotations["label"].unique()), lr=lr)
     else:
         os.makedirs(checkpoint_dir)
-        m = CropModel(num_classes=len(annotations["label"].unique()))
+        m = CropModel(num_classes=len(annotations["label"].unique()), lr=lr)
 
     return m
 
-def load(checkpoint=None, annotations=None, checkpoint_dir=None):
+def load(checkpoint=None, annotations=None, checkpoint_dir=None, lr=0.0001):
     if checkpoint: 
-        loaded_model = CropModel(checkpoint, num_classes=len(annotations["label"].unique()))
+        loaded_model = CropModel(checkpoint, num_classes=len(annotations["label"].unique()), lr=lr)
     elif checkpoint_dir:
         loaded_model = get_latest_checkpoint(
             checkpoint_dir, annotations)
@@ -43,24 +42,25 @@ def load(checkpoint=None, annotations=None, checkpoint_dir=None):
 
     return loaded_model
 
-def train(model, train_dir, val_dir, comet_project=None, comet_workspace=None, fast_dev_run=False):
+def train(model, train_dir, val_dir, comet_workspace=None, comet_project=None, fast_dev_run=False, max_epochs=10):
     """Train a model on labeled images.
     Args:
         model (CropModel): A CropModel object.
         train_dir (str): The directory containing the training images.
         val_dir (str): The directory containing the validation images.
-        comet_project (str): The comet project name for logging. Defaults to None.
-        comet_workspace (str): The comet workspace for logging. Defaults to None.
+        fast_dev_run (bool): Whether to run a fast development run.
+        max_epochs (int): The maximum number of epochs to train for.
 
     Returns:
         main.deepforest: A trained deepforest model.
     """
-    # Update
+
     if comet_project:
         comet_logger = CometLogger(project_name=comet_project, workspace=comet_workspace)
-        model.create_trainer(logger=comet_logger, fast_dev_run=fast_dev_run)
     else:
-        model.create_trainer(fast_dev_run=fast_dev_run)
+        comet_logger = None
+
+    model.create_trainer(logger=comet_logger, fast_dev_run=fast_dev_run, max_epochs=max_epochs)
 
     # Get the data stored from the write_crops step above.
     model.load_from_disk(train_dir=train_dir, val_dir=val_dir)
@@ -82,6 +82,7 @@ def preprocess_and_train_classification(config, validation_df=None):
     Args:
         config: Configuration object containing training parameters
         validation_df (pd.DataFrame): A DataFrame containing validation annotations.
+        comet_logger (CometLogger): A CometLogger object.
     Returns:
         trained_model: Trained model object
     """
@@ -95,7 +96,7 @@ def preprocess_and_train_classification(config, validation_df=None):
                                isin(validation_df["image_path"])]
 
     # Load existing model
-    loaded_model = load(checkpoint=config.classification_model.checkpoint, checkpoint_dir=config.classification_model.checkpoint_dir, annotations=annotations)
+    loaded_model = load(checkpoint=config.classification_model.checkpoint, checkpoint_dir=config.classification_model.checkpoint_dir, annotations=annotations, lr=config.classification_model.trainer.lr)
 
     # Preprocess train and validation data
     preprocess_images(
@@ -114,8 +115,10 @@ def preprocess_and_train_classification(config, validation_df=None):
         train_dir=config.classification_model.crop_image_dir,
         val_dir=config.classification_model.crop_image_dir,
         model=loaded_model,
-        comet_project=config.comet.project,
         comet_workspace=config.comet.workspace,
-        fast_dev_run=config.classification_model.fast_dev_run)
+        comet_project=config.comet.project,
+        fast_dev_run=config.classification_model.trainer.fast_dev_run,
+        max_epochs=config.classification_model.trainer.max_epochs,
+        )
 
     return trained_model
diff --git a/src/detection.py b/src/detection.py
@@ -153,6 +153,7 @@ def train(model, train_annotations, test_annotations, train_image_dir, comet_pro
 
     if comet_project:
         comet_logger = CometLogger(project_name=comet_project, workspace=comet_workspace)
+        comet_logger.experiment.add_tags(["detection"])
         comet_logger.experiment.log_parameters(model.config)
         comet_logger.experiment.log_table("train.csv", train_annotations)
         comet_logger.experiment.log_table("test.csv", test_annotations)
@@ -161,24 +162,24 @@ def train(model, train_annotations, test_annotations, train_image_dir, comet_pro
     else:
         model.create_trainer()
 
-    with comet_logger.experiment.context_manager("train_images"):
-        non_empty_train_annotations = train_annotations[~(train_annotations.xmax==0)]
-        try:
-            non_empty_train_annotations= gpd.GeoDataFrame(non_empty_train_annotations, geometry=non_empty_train_annotations["geometry"])
-            non_empty_train_annotations.root_dir = train_image_dir
-            non_empty_train_annotations = read_file(non_empty_train_annotations)
-        except: 
-            non_empty_train_annotations = read_file(non_empty_train_annotations, root_dir=train_image_dir)
-
-        if non_empty_train_annotations.empty:
-            pass
-        else:
-            sample_train_annotations = non_empty_train_annotations[non_empty_train_annotations.image_path.isin(non_empty_train_annotations.image_path.head(5))]
-            for filename in sample_train_annotations.image_path:
-                sample_train_annotations_for_image = sample_train_annotations[sample_train_annotations.image_path == filename]
-                sample_train_annotations_for_image.root_dir = train_image_dir
-                visualize.plot_results(sample_train_annotations_for_image, savedir=tmpdir)
-                comet_logger.experiment.log_image(os.path.join(tmpdir, filename))
+    # with comet_logger.experiment.context_manager("train_images"):
+    #     non_empty_train_annotations = train_annotations[~(train_annotations.xmax==0)]
+    #     try:
+    #         non_empty_train_annotations= gpd.GeoDataFrame(non_empty_train_annotations, geometry=non_empty_train_annotations["geometry"])
+    #         non_empty_train_annotations.root_dir = train_image_dir
+    #         non_empty_train_annotations = read_file(non_empty_train_annotations)
+    #     except: 
+    #         non_empty_train_annotations = read_file(non_empty_train_annotations, root_dir=train_image_dir)
+
+    #     if non_empty_train_annotations.empty:
+    #         pass
+    #     else:
+    #         sample_train_annotations = non_empty_train_annotations[non_empty_train_annotations.image_path.isin(non_empty_train_annotations.image_path.head(5))]
+    #         for filename in sample_train_annotations.image_path:
+    #             sample_train_annotations_for_image = sample_train_annotations[sample_train_annotations.image_path == filename]
+    #             sample_train_annotations_for_image.root_dir = train_image_dir
+    #             visualize.plot_results(sample_train_annotations_for_image, savedir=tmpdir)
+    #             comet_logger.experiment.log_image(os.path.join(tmpdir, filename))
 
     model.trainer.fit(model)
 
@@ -214,11 +215,16 @@ def preprocess_and_train(config, validation_df=None, model_type="detection"):
     train_df = data_processing.preprocess_images(train_df,
                                root_dir=config.detection_model.train_image_dir,
                                save_dir=config.detection_model.crop_image_dir)
+
+    non_empty = train_df[train_df.xmin!=0]
+    train_df.loc[train_df.label==0,"label"] = "Bird"
 
     if not validation_df.empty:
         validation_df = data_processing.preprocess_images(validation_df,
                                     root_dir=config.detection_model.train_image_dir,
                                     save_dir=config.detection_model.crop_image_dir)
+        non_empty = validation_df[validation_df.xmin!=0]
+        validation_df.loc[validation_df.label==0,"label"] = "Bird"
 
     # Limit empty frames
     if config.detection_model.limit_empty_frac > 0:
@@ -295,7 +301,6 @@ def predict(image_paths, patch_size, patch_overlap, m=None, model_path=None, das
     Returns:
         list: A list of image predictions.
     """
-
     if dask_client:
         # load model on each client
         def update_sys_path():