Commit

use comet for dashboard

bw4sz committed Mar 1, 2025
1 parent f194dcc commit f36706e
Showing 14 changed files with 208 additions and 338 deletions.
27 changes: 27 additions & 0 deletions USGS_classification.py
@@ -0,0 +1,27 @@
from deepforest import model
import pandas as pd
import os
import comet_ml
from pytorch_lightning.loggers import CometLogger
from src.classification import preprocess_and_train_classification
import hydra
from omegaconf import DictConfig

@hydra.main(config_path="conf/classification_model", config_name="USGS")
def main(cfg: DictConfig):
    classification_cfg = cfg.classification
    savedir = classification_cfg.savedir
    train = pd.read_csv(os.path.join(savedir, "train.csv"))
    test = pd.read_csv(os.path.join(savedir, "test.csv"))

    comet_logger = CometLogger(project_name=classification_cfg.project_name, workspace=classification_cfg.workspace)
    preprocess_and_train_classification(
        config=cfg,
        train_df=train,
        validation_df=test,
        comet_logger=comet_logger
    )

if __name__ == "__main__":
    main()
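As a quick smoke test, the composed config can be resolved outside a training run with Hydra's compose API. This is a sketch, assuming Hydra >= 1.2 and execution from the repository root; note the script reads `cfg.classification`, a subsection assumed to be supplied by the composed config rather than the YAML shown below.

# Minimal sketch: resolve conf/classification_model/USGS.yaml the same way
# the @hydra.main decorator above does, without starting training.
from hydra import compose, initialize

with initialize(version_base=None, config_path="conf/classification_model"):
    cfg = compose(config_name="USGS")
    print(cfg.classification_model.trainer.max_epochs)  # 1 in this commit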

11 changes: 11 additions & 0 deletions conf/classification_model/USGS.yaml
@@ -0,0 +1,11 @@
classification_model:
  checkpoint:
  checkpoint_dir: /blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/classification/checkpoints/
  train_csv_folder:
  train_image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
  crop_image_dir: /blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/classification/
  under_sample_ratio: 0
  trainer:
    fast_dev_run: True
    max_epochs: 1
    lr: 0.00001
11 changes: 11 additions & 0 deletions conf/classification_model/finetune.yaml
@@ -0,0 +1,11 @@
classification_model:
  checkpoint:
  checkpoint_dir: /blue/ewhite/b.weinstein/BOEM/classification/checkpoints
  train_csv_folder: /blue/ewhite/b.weinstein/BOEM/annotations/train
  train_image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
  crop_image_dir: /blue/ewhite/b.weinstein/BOEM/classification/crops/
  under_sample_ratio: 0
  trainer:
    fast_dev_run: True
    max_epochs: 1
    lr: 0.00001
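Together with `USGS.yaml`, this file forms a Hydra config group; the `defaults` entry added to `conf/config.yaml` below selects `finetune.yaml` unless overridden. A sketch of selecting the other group member programmatically, equivalent to passing `classification_model=USGS` on the command line:

# Sketch: compose the top-level config but swap the classification_model group.
from hydra import compose, initialize

with initialize(version_base=None, config_path="conf"):
    cfg = compose(config_name="config", overrides=["classification_model=USGS"])
    print(cfg.classification_model.crop_image_dir)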
31 changes: 8 additions & 23 deletions conf/config.yaml
@@ -1,6 +1,9 @@
defaults:
  - server: serenity

  - classification_model: finetune.yaml

debug: False

comet:
  project: BOEM
  workspace: bw4sz
@@ -28,7 +31,7 @@ label_studio:
  csv_dir: /blue/ewhite/b.weinstein/BOEM/annotations/review
  project_name: "Bureau of Ocean Energy Management - Review"
predict:
  patch_size: 2000
  patch_size: 1000
  patch_overlap: 0
  min_score: 0.4
  batch_size: 48
@@ -60,37 +63,19 @@ detection_model:
  validation:
    val_accuracy_interval: 3

classification_model:
  checkpoint:
  checkpoint_dir: /blue/ewhite/b.weinstein/BOEM/classification/checkpoints
  train_csv_folder: /blue/ewhite/b.weinstein/BOEM/annotations/train
  train_image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
  crop_image_dir: /blue/ewhite/b.weinstein/BOEM/classification/crops/
  under_sample_ratio: 0
  trainer:
    fast_dev_run: True
    max_epochs: 1
    lr: 0.00001

pipeline_evaluation:
  detect_ground_truth_dir: /blue/ewhite/b.weinstein/BOEM/annotations/validation
  classify_ground_truth_dir: /blue/ewhite/b.weinstein/BOEM/annotations/validation
  # This is an average mAP threshold for now, but we may want to add a per-iou threshold in the future
  detection_true_positive_threshold: 0.8
  classification_avg_score: 0.5
  image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
  debug: False

reporting:
  report_dir: /orange/ewhite/web/public/BOEM
  metadata: /blue/ewhite/b.weinstein/BOEM/reporting/metadata/metadata.csv
  thin_factor: 100

active_learning:
  image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27
  strategy: 'target-labels'
  n_images: 50
  patch_size: 2000
  patch_size: 1000
  patch_overlap: 0
  min_score: 0.1
  model_checkpoint:
@@ -108,9 +93,9 @@ active_testing:
  strategy: 'random'
  n_images: 1000
  m:
  patch_size: 2000
  patch_size: 1000
  patch_overlap: 0
  min_score: 0.2

human_review:
  n_images: 10
  n: 10
51 changes: 16 additions & 35 deletions src/active_learning.py
@@ -200,54 +200,30 @@ def update_sys_path():
    chosen_preannotations = preannotations[preannotations["image_path"].isin(chosen_images)]
    return chosen_images, chosen_preannotations

def human_review(detection_model, classification_model, image_paths, patch_size, patch_overlap, confident_threshold, min_score, batch_size):
def human_review(predictions, min_score=0.1, confident_threshold=0.5):
"""
Predict on images and divide into confident and uncertain predictions.
Args:
detection_model (deepforest.deepforest): A trained detection model.
classification_model (deepforest.deepforest): A trained classification model.
image_paths (list): A list of image paths.
patch_size (int): The size of the image patches to predict on.
patch_overlap (float): The amount of overlap between image patches.
confident_threshold (float): The threshold for confident predictions.
min_score (float): The minimum score for a prediction to be included.
batch_size (int): The batch size for prediction.
existing_predictions (pd.DataFrame, optional): A DataFrame of existing predictions. Defaults to None.
min_score (float, optional): The minimum score for a prediction to be included. Defaults to 0.1.
predictions (pd.DataFrame, optional): A DataFrame of existing predictions. Defaults to None.
Returns:
tuple: A tuple of confident and uncertain predictions.
"""
    # Check for existing predictions
    if existing_predictions is not None:
        image_basenames = [os.path.basename(image_path) for image_path in image_paths]
        existing_predictions = existing_predictions[existing_predictions["image_path"].isin(image_basenames)]
        image_paths = [image_path for image_path in image_paths if os.path.basename(image_path) not in existing_predictions["image_path"].unique()]
    if len(image_paths) > 0:
        predictions = detection.predict(
            m=detection_model,
            crop_model=classification_model,
            image_paths=image_paths,
            patch_size=patch_size,
            patch_overlap=patch_overlap,
            batch_size=batch_size
        )
        predictions = pd.concat(predictions)
        combined_predictions = pd.concat([predictions, existing_predictions])
    else:
        combined_predictions = existing_predictions

    combined_predictions[combined_predictions["score"] > min_score]

    predictions = predictions[predictions["score"] > min_score]

    # Split predictions into confident and uncertain
    uncertain_predictions = combined_predictions[
        combined_predictions["score"] <= confident_threshold]
    uncertain_predictions = predictions[
        predictions["score"] <= confident_threshold]

    confident_predictions = combined_predictions[
        ~combined_predictions["image_path"].isin(
    confident_predictions = predictions[
        ~predictions["image_path"].isin(
            uncertain_predictions["image_path"])]

    return confident_predictions, uncertain_predictions
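Since the refactored `human_review` is now pure DataFrame bookkeeping, it can be exercised without any models. A sketch; column names follow the function body above and the scores are invented:

from src.active_learning import human_review
import pandas as pd

# Two boxes on a.jpg, one on b.jpg (illustrative scores only).
predictions = pd.DataFrame({
    "image_path": ["a.jpg", "a.jpg", "b.jpg"],
    "score": [0.9, 0.3, 0.8],
})
confident, uncertain = human_review(predictions, min_score=0.1, confident_threshold=0.5)
# a.jpg has a box scored 0.3 <= 0.5, so all of its rows are routed to review;
# b.jpg has no box at or below the threshold and stays confident.
assert set(confident["image_path"]) == {"b.jpg"}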

def generate_training_pool_predictions(image_dir, patch_size=512, patch_overlap=0.1, min_score=0.1, model=None, model_path=None, dask_client=None, batch_size=16, comet_logger=None):
def generate_training_pool_predictions(image_dir, patch_size=512, patch_overlap=0.1, min_score=0.1, model=None, model_path=None, dask_client=None, batch_size=16, comet_logger=None, pool_limit=1000):
"""
Generate predictions for the training pool.
@@ -261,6 +237,7 @@ def generate_training_pool_predictions(image_dir, patch_size=512, patch_overlap=
        dask_client (dask.distributed.Client, optional): A Dask client for parallel processing. Defaults to None.
        batch_size (int, optional): The batch size for prediction. Defaults to 16.
        comet_logger (CometLogger, optional): A CometLogger object. Defaults to None.
        pool_limit (int, optional): The maximum number of images to consider. Defaults to 1000.
    Returns:
        pd.DataFrame: A DataFrame of predictions.
@@ -270,6 +247,10 @@ def generate_training_pool_predictions(image_dir, patch_size=512, patch_overlap=
    # Remove .csv files from the pool
    pool = [image for image in pool if not image.endswith('.csv')]

    # Subsample the pool if it exceeds pool_limit
    if len(pool) > pool_limit:
        pool = random.sample(pool, pool_limit)

    # Remove crop dir
    try:
        pool.remove(os.path.join(image_dir, "crops"))
@@ -301,7 +282,7 @@ def update_sys_path():
    preannotations = pd.concat(preannotations)

    if comet_logger:
        comet_logger.log_table("active_training_pool", preannotations)
        comet_logger.experiment.log_table("active_training_pool", preannotations)

    # Print the number of preannotations before removing min score
    preannotations = preannotations[preannotations["score"] >= min_score]
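The `comet_logger.experiment.log_table` fix above reflects that table logging is a method of the wrapped `comet_ml` experiment rather than of PyTorch Lightning's `CometLogger` itself. A sketch of the pattern, using this repository's project and workspace from `conf/config.yaml`:

import pandas as pd
from pytorch_lightning.loggers import CometLogger

# Assumes COMET_API_KEY is set in the environment.
comet_logger = CometLogger(project_name="BOEM", workspace="bw4sz")
table = pd.DataFrame({"image_path": ["a.jpg"], "score": [0.9]})
# CometLogger.experiment exposes the underlying comet_ml Experiment;
# a .csv extension on the filename follows comet_ml convention.
comet_logger.experiment.log_table("active_training_pool.csv", table)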
48 changes: 37 additions & 11 deletions src/classification.py
@@ -1,14 +1,12 @@
# Standard library imports
import os
import glob
import warnings
# Third party imports
import pandas as pd
from PIL import Image

from deepforest.model import CropModel

# Local imports
from src.label_studio import gather_data
from pytorch_lightning.loggers import CometLogger

def create_train_test(annotations):
    return annotations.sample(frac=0.8, random_state=1), annotations.drop(
@@ -75,12 +73,38 @@ def train(model, train_dir, val_dir, comet_logger=None, fast_dev_run=False, max_
"""
model.create_trainer(logger=comet_logger, fast_dev_run=fast_dev_run, max_epochs=max_epochs)

# Get the data stored from the write_crops step above.
# Get the data stored from the write_crops processing.
model.load_from_disk(train_dir=train_dir, val_dir=val_dir)

with comet_logger.context_manager("classification"):
# Log the validation dataset images
for image_path, label in model.val_ds.imgs:
label_name = model.numeric_to_label_dict[label]
image_name = os.path.basename(image_path)
comet_logger.experiment.log_image(image_path, name=f"{label_name}_{image_name}")

with comet_logger.experiment.context_manager("classification"):
model.trainer.fit(model)

# Compute confusion matrix and upload to cometml
image_dataset = []
y_true = []
y_predicted = []
for index, (image,label) in enumerate(model.val_ds):
image_path, label = model.val_ds.imgs[index]
original_image = Image.open(image_path)
image_dataset += [original_image]
y_true += [label]
y_predicted += [model(image.unsqueeze(0)).argmax().item()]
labels = model.val_ds.classes

# Log the confusion matrix to Comet
comet_logger.experiment.log_confusion_matrix(
y_true=y_true,
y_predicted=y_predicted,
images=image_dataset,
labels=labels,
)

return model
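One caveat on the confusion-matrix loop above: each `model(image.unsqueeze(0))` call runs with autograd enabled and with the model in whatever mode training left it. A minimal sketch of the same scoring pass under eval mode, as an aside rather than part of the commit (`model` is the trained CropModel from the surrounding function):

import torch

model.eval()  # freeze dropout/batch-norm behavior for scoring
with torch.no_grad():  # no autograd bookkeeping needed for validation-only passes
    y_predicted = [model(image.unsqueeze(0)).argmax().item()
                   for image, _ in model.val_ds]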

def preprocess_images(model, annotations, root_dir, save_dir):
@@ -93,18 +117,21 @@ def preprocess_images(model, annotations, root_dir, save_dir):
    labels = annotations["label"].values
    model.write_crops(boxes=boxes, root_dir=root_dir, images=images, labels=labels, savedir=save_dir)

def preprocess_and_train_classification(config, validation_df=None, comet_logger=None):
def preprocess_and_train_classification(config, train_df=None, validation_df=None, comet_logger=None):
    """Preprocess data and train a crop model.
    Args:
        config: Configuration object containing training parameters
        train_df (pd.DataFrame, optional): A DataFrame of training annotations; gathered from train_csv_folder when None.
        validation_df (pd.DataFrame): A DataFrame containing validation annotations.
        comet_logger: CometLogger object for logging experiments
    Returns:
        trained_model: Trained model object
    """
    # Get and split annotations
    annotations = gather_data(config.classification_model.train_csv_folder)
    if train_df is None:
        annotations = gather_data(config.classification_model.train_csv_folder)
    else:
        annotations = train_df

    num_classes = len(annotations["label"].unique())

    # Remove the empty frames
@@ -128,6 +155,7 @@ def preprocess_and_train_classification(config, validation_df=None, comet_logger

    # Force the label dict, DeepForest will update this soon
    loaded_model.label_dict = {v: k for k, v in enumerate(annotations["label"].unique())}
    loaded_model.numeric_to_label_dict = {v: k for k, v in loaded_model.label_dict.items()}

    # Preprocess train and validation data
    preprocess_images(
@@ -146,11 +174,9 @@ def preprocess_and_train_classification(config, validation_df=None, comet_logger
        train_dir=config.classification_model.crop_image_dir,
        val_dir=config.classification_model.crop_image_dir,
        model=loaded_model,
        comet_workspace=config.comet.workspace,
        comet_project=config.comet.project,
        fast_dev_run=config.classification_model.trainer.fast_dev_run,
        max_epochs=config.classification_model.trainer.max_epochs,
        comet_logger=comet_logger
    )

    return trained_model
1 change: 0 additions & 1 deletion src/detection.py
@@ -12,7 +12,6 @@
import pandas as pd
from deepforest import main, visualize
from deepforest.utilities import read_file
from pytorch_lightning.loggers import CometLogger

# Local imports
from src import data_processing
15 changes: 9 additions & 6 deletions src/label_studio.py
@@ -7,21 +7,23 @@
import shutil
from PIL import Image

def upload_to_label_studio(images, sftp_client, label_studio_project, images_to_annotate_dir, folder_name, preannotations):
def upload_to_label_studio(images, sftp_client, url, project_name, images_to_annotate_dir, folder_name, preannotations):
"""
Upload images to Label Studio and import image tasks.
Args:
images (list): List of image paths to upload.
url (str): The URL of the Label Studio server.
sftp_client (paramiko.SFTPClient): The SFTP client for uploading images.
label_studio_project (label_studio_sdk.Project): The Label Studio project instance.
project_name (str): The name of the Label Studio project.
images_to_annotate_dir (str): The path to the directory of images to annotate.
folder_name (str): The name of the folder to upload images to.
preannotations (list): List of preannotations for the images.
Returns:
None
"""
label_studio_project = connect_to_label_studio(url=url, project_name=project_name)
upload_images(sftp_client=sftp_client, images=images, folder_name=folder_name)
import_image_tasks(label_studio_project=label_studio_project, image_names=images, local_image_dir=images_to_annotate_dir, predictions=preannotations)

@@ -165,6 +167,9 @@ def gather_data(annotation_dir):
    df = []
    for x in csvs:
        df.append(pd.read_csv(x))

    if len(df) == 0:
        return None
    df = pd.concat(df)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
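Since `gather_data` can now return `None` when the directory holds no CSVs, callers that previously assumed a DataFrame need a guard. A sketch with a hypothetical path:

from src.label_studio import gather_data

annotations = gather_data("/path/with/no/csvs")  # hypothetical directory
if annotations is None:
    # No annotation CSVs found; skip downstream steps rather than crash on None.
    print("no annotation CSVs found; nothing to train on")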
@@ -201,7 +206,6 @@ def connect_to_label_studio(url, project_name, label_config=None):

if len(project) == 0:
# Create a project with the specified title and labeling configuration

project = ls.create_project(
title=project_name,
label_config=label_config
@@ -243,8 +247,6 @@ def import_image_tasks(label_studio_project, image_names, local_image_dir, predi
    Returns:
        None
    """
    import os

    tasks = []
    for index, image_name in enumerate(image_names):
        print(f"Importing {image_name} into Label Studio")
@@ -260,7 +262,8 @@ def import_image_tasks(label_studio_project, image_names, local_image_dir, predi
        else:
            upload_dict = {"data": data_dict}
        tasks.append(upload_dict)
    label_studio_project.import_tasks(tasks)
    if len(tasks) > 0:
        label_studio_project.import_tasks(tasks)

def download_completed_tasks(label_studio_project, csv_dir):
    labeled_tasks = label_studio_project.get_labeled_tasks()