refactor evaluate detection
bw4sz committed Feb 25, 2025
1 parent b62a380 commit 3c7188f
Showing 19 changed files with 465 additions and 426 deletions.
29 changes: 0 additions & 29 deletions BOEM.qmd

This file was deleted.

22 changes: 0 additions & 22 deletions Dockerfile

This file was deleted.

101 changes: 57 additions & 44 deletions USGS_backbone.py
@@ -1,64 +1,77 @@
from deepforest import main
import pandas as pd
import os
import tempfile
import comet_ml
from pytorch_lightning.loggers import CometLogger
from pytorch_lightning.profilers.simple import SimpleProfiler
import torch
import argparse
from deepforest.callbacks import images_callback

df = pd.read_csv("/blue/ewhite/b.weinstein/BOEM/UBFAI Annotations/20231118/20231116_cropped_annotations.csv")
df.wat_label.value_counts()
df = df[df.wat_label.isin(["Bird","Cartilaginous Fish","Bony Fish","Mammal","Reptile"])]
# Parse arguments
parser = argparse.ArgumentParser(description="Train DeepForest model")
parser.add_argument("--batch_size", type=int, default=12, help="Batch size for training")
parser.add_argument("--workers", type=int, default=0, help="Number of workers for data loading")
args = parser.parse_args()

# Combine Fish classes
df.loc[df.wat_label.isin(["Cartilaginous Fish","Bony Fish"]),"wat_label"] = "Fish"
# Use parsed arguments
batch_size = args.batch_size
workers = args.workers

# Construct padded crop name
df["image_path"] = df["bname_parent"] +"_" + df["tile_xtl"].astype(str) + "_" + df["tile_ytl"].astype(str) + "_" + df["tile_xbr"].astype(str) + "_" + df["tile_ybr"].astype(str) + ".JPG"
savedir = "/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/crops"
train = pd.read_csv(os.path.join(savedir,"train.csv"))
test = pd.read_csv(os.path.join(savedir,"test.csv"))

# Check if all images exist
df["image_exists"] = df["image_path"].apply(lambda x: os.path.exists(os.path.join("/blue/ewhite/b.weinstein/BOEM/UBFAI Annotations/20231118/padded",x)))
# Initialize DeepForest model
m = main.deepforest()
m.load_model("weecology/deepforest-bird")
m.label_dict = {"Object":0}
m.numeric_to_label_dict = {0:"Object"}

df["xmin"] = df["xtl"]
df["ymin"] = df["ytl"]
df["xmax"] = df["xbr"]
df["ymax"] = df["ybr"]
df["label"] = df["wat_label"]
m.config["train"]["csv_file"] = os.path.join(savedir,"train.csv")
m.config["train"]["root_dir"] = "/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/crops"
m.config["train"]["fast_dev_run"] = False
m.config["validation"]["csv_file"] = os.path.join(savedir,"test.csv")
m.config["validation"]["root_dir"] = "/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/crops"
m.config["batch_size"] = batch_size
m.config["train"]["epochs"] = 100
m.config["workers"] = workers
m.config["validation"]["val_accuracy_interval"] = 10
m.config["train"]["scheduler"]["params"]["eps"] = 0
m.config["train"]["lr"] = 0.0005

# Randomly split 85 - 15 for each class
train = df.groupby("wat_label").sample(frac=0.85)
test = df.drop(train.index)
comet_logger = CometLogger(project_name="BOEM", workspace="bw4sz")

# Write to tmp data directory
tmpdir = tempfile.mkdtemp()
train.to_csv(os.path.join(tmpdir,"train.csv"),index=False)
test.to_csv(os.path.join(tmpdir,"test.csv"),index=False)
im = images_callback(n=20, every_n_epochs=25, savedir=os.path.join(savedir,"images"))

# Initialize the new DeepForest model (the model that you will train) with your classes
m = main.deepforest(config_args={"num_classes":4}, label_dict={"Bird":0,"Fish":1,"Mammal":2,"Reptile":3})
# Log the training and test sets
comet_logger.experiment.log_table("train.csv", train)
comet_logger.experiment.log_table("test.csv", test)

# Initialize the release DeepForest model (the model whose regression head will be reused)
deepforest_release_model = main.deepforest()
deepforest_release_model.load_model("weecology/deepforest-bird")
# Pytorch lightning save checkpoint
#simple_profiler = SimpleProfiler(dirpath=os.path.join(tmpdir,"profiler"), filename="profiler.txt", extended=True)

# Copy the single-class backbone weights, whose features are useful for multi-class classification
m.model.backbone.load_state_dict(deepforest_release_model.model.backbone.state_dict())
# Log the devices
devices = torch.cuda.device_count()
comet_logger.experiment.log_parameter("devices", devices)
comet_logger.experiment.log_parameter("workers", m.config["workers"])
comet_logger.experiment.log_parameter("batch_size", m.config["batch_size"])

# Load the regression head into the new model
m.model.head.regression_head.load_state_dict(deepforest_release_model.model.head.regression_head.state_dict())
# Log data sizes
comet_logger.experiment.log_parameter("train_size", train.shape[0])
comet_logger.experiment.log_parameter("test_size", test.shape[0])

m.config["train"]["csv_file"] = os.path.join(tmpdir,"train.csv")
m.config["train"]["root_dir"] = "/blue/ewhite/b.weinstein/BOEM/UBFAI Annotations/20231118/padded"
m.config["train"]["fast_dev_run"] = False
m.config["validation"]["csv_file"] = os.path.join(tmpdir,"test.csv")
m.config["validation"]["root_dir"] = "/blue/ewhite/b.weinstein/BOEM/UBFAI Annotations/20231118/padded"
m.config["batch_size"] = 6
m.config["train"]["epochs"] = 25
m.config["validation"]["val_accuracy_interval"] = 5
m.config["train"]["scheduler"]["params"]["eps"] = 0
comet_logger = CometLogger(project_name="BOEM", workspace="bw4sz")

m.create_trainer(logger=comet_logger)
m.create_trainer(callbacks=[im], logger=comet_logger, accelerator="gpu", strategy="ddp", num_nodes=1, devices=devices)
m.trainer.fit(m)
results = m.evaluate(m.config["validation"]["csv_file"],m.config["validation"]["root_dir"])
print(results)

# Gather the number of steps taken from all GPUs
global_steps = torch.tensor(m.trainer.global_step, dtype=torch.int32, device=m.device)
comet_logger.experiment.log_metric("global_steps", global_steps)

# Save profiler to comet
#comet_logger.experiment.log_asset(os.path.join(tmpdir,"profiler","profiler.txt"))

# Save the model
m.trainer.save_checkpoint("/blue/ewhite/b.weinstein/BOEM/UBFAI Annotations/checkpoints/{}.pl".format(comet_logger.experiment.id))
m.trainer.save_checkpoint("/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/checkpoints/{}.pl".format(comet_logger.experiment.id))
9 changes: 4 additions & 5 deletions conf/config.yaml
@@ -24,6 +24,7 @@ predict:
patch_size: 2000
patch_overlap: 0
min_score: 0.4
+ batch_size: 48

pipeline:
confidence_threshold: 0.9
@@ -34,14 +35,14 @@ propagate:
distance_threshold_pixels: 50

detection_model:
checkpoint: "/blue/ewhite/b.weinstein/BOEM/UBFAI Annotations/checkpoints/5420a9c3f27d4299992094a7b9b49cb7.pl"
checkpoint: "/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/checkpoints/900710af16d9431aaef84bf74034a48c.pl"
checkpoint_dir: /blue/ewhite/b.weinstein/BOEM/detection/checkpoints
train_csv_folder: /blue/ewhite/b.weinstein/BOEM/annotations/train/
train_image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
crop_image_dir: /blue/ewhite/b.weinstein/BOEM/detection/crops/
limit_empty_frac: 0.01
labels:
- "Bird"
- "Object"
trainer:
batch_size: 4
train:
@@ -69,7 +70,6 @@ pipeline_evaluation:
classify_ground_truth_dir: /blue/ewhite/b.weinstein/BOEM/annotations/validation
# This is an average mAP threshold for now, but we may want to add a per-iou threshold in the future
detection_true_positive_threshold: 0.8
- detection_false_positive_threshold: 0.5
classification_avg_score: 0.5
image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
debug: False
@@ -79,7 +79,6 @@ reporting:
metadata: /blue/ewhite/b.weinstein/BOEM/reporting/metadata/metadata.csv
thin_factor: 100


active_learning:
image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27
strategy: 'target-labels'
@@ -89,7 +88,7 @@ active_learning:
min_score: 0.1
model_checkpoint:
target_labels:
- "Bird"
- "Object"

# Optional parameters:
evaluation:
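
For context, a minimal sketch of how these fields might be consumed downstream; the conf/ layout suggests the pipeline reads this file with OmegaConf or Hydra, but that is an assumption rather than something shown in this commit.

from omegaconf import OmegaConf
from deepforest import main

cfg = OmegaConf.load("conf/config.yaml")

# The refactor points detection at the new single-class checkpoint and renames the label from "Bird" to "Object".
m = main.deepforest.load_from_checkpoint(cfg.detection_model.checkpoint)
m.label_dict = {label: i for i, label in enumerate(cfg.detection_model.labels)}
m.numeric_to_label_dict = {v: k for k, v in m.label_dict.items()}

# Prediction settings from the predict block, including the newly added batch_size.
m.config["batch_size"] = cfg.predict.batch_size
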
25 changes: 0 additions & 25 deletions container.sh

This file was deleted.

118 changes: 118 additions & 0 deletions prepare_USGS.py
@@ -0,0 +1,118 @@
# Prepare USGS backbone
import pandas as pd
import os
import glob
from deepforest.preprocess import split_raster
import torch
import argparse
import random
import numpy as np
from src.cluster import start
from dask.distributed import as_completed

# Parse arguments
parser = argparse.ArgumentParser(description="Train DeepForest model")
parser.add_argument("--batch_size", type=int, default=12, help="Batch size for training")
parser.add_argument("--workers", type=int, default=0, help="Number of workers for data loading")
args = parser.parse_args()

# Use parsed arguments
batch_size = args.batch_size
workers = args.workers

# Set random seeds for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

df = pd.read_csv("/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/20250203_total.csv")
df.label.value_counts()

# Construct padded crop name
df["image_path"] = df["bname_parent"] + ".JPG"

# Check that all images exist and drop any that do not
df["image_exists"] = df["image_path"].apply(lambda x: os.path.exists(os.path.join("/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/images_parent",x)))
df = df[df["image_exists"]]

df["xmin"] = df["left"]
df["ymin"] = df["top"]
df["xmax"] = df["left"] + df["width"]
df["ymax"] = df["top"] + df["height"]

os.makedirs("/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/crops", exist_ok=True)
crop_annotations =[]
regenerate_crops = True
if regenerate_crops:
client = start(cpus=5, mem_size="40GB")
futures = []

def process_image(image_annotations):
x = image_annotations.image_path.unique()[0]
filename = os.path.join("/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/crops", x.replace(".JPG", ".csv"))
if os.path.exists(filename):
return pd.read_csv(filename)
try:
split_raster(
annotations_file=image_annotations,
patch_size=1000,
patch_overlap=0,
path_to_raster=os.path.join("/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/images_parent", x),
root_dir="/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/images_parent",
base_dir="/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/crops",
allow_empty=False)
return filename
except Exception as e:
print(f"Error processing {x}: {e}")
return None

for x in df.image_path.unique():
image_annotations = df[df["image_path"] == x]
futures.append(client.submit(process_image, image_annotations))

for future in as_completed(futures):
result = future.result()
if result is not None:
crop_annotations.append(result)

crop_annotations = glob.glob("/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/crops/*.csv")
crop_annotations = [pd.read_csv(x) for x in crop_annotations]

crop_annotations = pd.concat(crop_annotations)
# Background classes (Algae, Boat, Buoy) become empty-frame negatives: zeroed boxes labeled "Object"
crop_annotations.loc[crop_annotations['label'].isin(["Algae", "Boat", "Buoy"]), ['xmin', 'xmax', 'ymin', 'ymax', 'label']] = [0, 0, 0, 0, "Object"]

# Temporarily mark all other labels as "FalsePositive" for de-duplication below
crop_annotations.loc[~crop_annotations['label'].isin(["Algae", "Boat", "Buoy"]), 'label'] = "FalsePositive"

# Drop duplicates for False Positives only
falsepositives = crop_annotations[crop_annotations['label'] == "FalsePositive"]
falsepositives = falsepositives.drop_duplicates(subset=['xmin', 'xmax', 'ymin', 'ymax'])

# Drop any false-positive rows whose image_path also contains a true positive
true_positives = crop_annotations[crop_annotations['label'] != "FalsePositive"]
falsepositives = falsepositives[~falsepositives['image_path'].isin(true_positives['image_path'])]
crop_annotations = pd.concat([crop_annotations[crop_annotations['label'] != "FalsePositive"], falsepositives])
crop_annotations["label"] = "Object"

# Randomly split by image_path
images = crop_annotations.image_path.unique()
random.shuffle(images)
train_images = images[:int(len(images)*0.90)]
test_images = images[int(len(images)*0.90):]

train = crop_annotations[crop_annotations["image_path"].isin(train_images)]
test = crop_annotations[crop_annotations["image_path"].isin(test_images)]

# Write the train/test splits to the crops directory
savedir = "/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/crops"
# create images directory
os.makedirs(os.path.join(savedir,"images"), exist_ok=True)

train.to_csv(os.path.join(savedir,"train.csv"),index=False)
test.to_csv(os.path.join(savedir,"test.csv"),index=False)
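
A small sanity check on the split written above may be useful; this sketch only uses paths and columns already defined in the script.

import os
import pandas as pd

savedir = "/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/crops"
train = pd.read_csv(os.path.join(savedir, "train.csv"))
test = pd.read_csv(os.path.join(savedir, "test.csv"))

# The split is made by image_path, so no crop image should appear in both sets.
assert set(train.image_path).isdisjoint(set(test.image_path))
print(f"{train.image_path.nunique()} train images, {test.image_path.nunique()} test images")
print(train.label.value_counts())  # every remaining row should be labeled "Object"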
