classification trains, pipeline tests need to be updated
bw4sz committed Mar 4, 2025
1 parent f06f611 commit 3660d7b
Showing 16 changed files with 188 additions and 143 deletions.
50 changes: 40 additions & 10 deletions USGS_classification.py
@@ -1,29 +1,59 @@
 from deepforest import model
 import pandas as pd
 import os
+import glob
 import comet_ml
 from pytorch_lightning.loggers import CometLogger
 from src.classification import preprocess_and_train_classification
 import hydra
 from omegaconf import DictConfig

+# Create train test split, split each class into 90% train and 10% test with a minimum of 10 images per class for test and a max of 100
+def train_test_split(df, test_size=0.1, min_test_images=10, max_test_images=100):
+    train_df = pd.DataFrame()
+    test_df = pd.DataFrame()
+
+    for label in df['label'].unique():
+        class_df = df[df['label'] == label]
+        test_count = max(min_test_images, int(len(class_df) * test_size))
+        test_count = min(test_count, max_test_images)
+
+        test_class_df = class_df.sample(n=test_count)
+        train_class_df = class_df.drop(test_class_df.index)
+
+        train_df = pd.concat([train_df, train_class_df])
+        test_df = pd.concat([test_df, test_class_df])
+
+    return train_df, test_df
+
 @hydra.main(config_path="conf", config_name="config")
 def main(cfg: DictConfig):
     # Override the classification_model config with USGS.yaml
     cfg = hydra.compose(config_name="config", overrides=["classification_model=USGS"])

     classification_cfg = cfg.classification_model

     # From the detection script
     savedir = "/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/crops"
-    train = pd.read_csv(os.path.join(savedir, "train.csv"))
-    test = pd.read_csv(os.path.join(savedir, "test.csv"))
+    crop_annotations = glob.glob("/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/crops/*.csv")
+    crop_annotations = [pd.read_csv(x) for x in crop_annotations]
+    crop_annotations = pd.concat(crop_annotations)
+
+    # Keep labels with more than 100 images
+    crop_annotations = crop_annotations.groupby("label").filter(lambda x: len(x) > 100)
+
+    # Only keep two word labels
+    crop_annotations = crop_annotations[crop_annotations["label"].str.contains(" ")]
+
-    comet_logger = CometLogger(project_name=cfg.project, workspace=cfg.workspace)
+    # Expand bounding boxes by 30 pixels on all sides
+    crop_annotations["xmin"] -= 30
+    crop_annotations["ymin"] -= 30
+    crop_annotations["xmax"] += 30
+    crop_annotations["ymax"] += 30
+
+    train_df, validation_df = train_test_split(crop_annotations)
+
+    comet_logger = CometLogger(project_name=cfg.comet.project, workspace=cfg.comet.workspace)
     preprocess_and_train_classification(
         config=cfg,
-        train_df=train,
-        validation_df=test,
+        train_df=train_df,
+        validation_df=validation_df,
         comet_logger=comet_logger
     )
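A quick sanity check of the new per-class split on toy data (hypothetical labels and counts; note that the script's filter to labels with more than 100 crops also keeps class_df.sample(n=test_count) from requesting more rows than a small class holds):

    import pandas as pd

    # Toy annotations: 120 crops of one label, 300 of another.
    df = pd.DataFrame({
        "label": ["Common Tern"] * 120 + ["Brown Pelican"] * 300,
        "image_path": [f"crop_{i}.png" for i in range(420)],
    })

    # train_test_split as defined in USGS_classification.py above.
    train_df, test_df = train_test_split(df)

    # 10% of 120 -> 12, clipped to [10, 100] -> 12 test images
    # 10% of 300 -> 30, clipped to [10, 100] -> 30 test images
    print(test_df["label"].value_counts())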

24 changes: 12 additions & 12 deletions conf/classification_model/USGS.yaml
@@ -1,12 +1,12 @@
-classification_model:
-  checkpoint:
-  checkpoint_dir: /blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/classification/checkpoints/
-  train_csv_folder:
-  train_image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
-  crop_image_dir: /blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/classification/
-  under_sample_ratio: 0
-  trainer:
-    fast_dev_run: True
-    max_epochs: 1
-    lr: 0.00001
-    batch_size: 16
+checkpoint:
+checkpoint_dir: /blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/classification/checkpoints/
+train_csv_folder:
+train_image_dir: /blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/crops
+crop_image_dir: /blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/classification/crops/
+under_sample_ratio: 0
+trainer:
+  fast_dev_run: False
+  max_epochs: 100
+  lr: 0.00001
+  batch_size: 16
+  workers: 10
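The reindentation here matters for Hydra: with the default packaging in Hydra 1.1+, a file in conf/classification_model/ is already placed under the classification_model key, so the old top-level classification_model: header nested the values twice. A minimal sketch of composing this file the way USGS_classification.py does (assuming conf/ is the config root; the version_base argument assumes Hydra >= 1.2):

    from hydra import compose, initialize

    with initialize(config_path="conf", version_base=None):
        cfg = compose(config_name="config", overrides=["classification_model=USGS"])

    print(cfg.classification_model.trainer.max_epochs)  # 100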
24 changes: 12 additions & 12 deletions conf/classification_model/finetune.yaml
@@ -1,12 +1,12 @@
-classification_model:
-  checkpoint:
-  checkpoint_dir: /blue/ewhite/b.weinstein/BOEM/classification/checkpoints
-  train_csv_folder: /blue/ewhite/b.weinstein/BOEM/annotations/train
-  train_image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
-  crop_image_dir: /blue/ewhite/b.weinstein/BOEM/classification/crops/
-  under_sample_ratio: 0
-  trainer:
-    fast_dev_run: True
-    max_epochs: 1
-    lr: 0.00001
-    batch_size: 16
+checkpoint:
+checkpoint_dir: /blue/ewhite/b.weinstein/BOEM/classification/checkpoints
+train_csv_folder: /blue/ewhite/b.weinstein/BOEM/annotations/train
+train_image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
+crop_image_dir: /blue/ewhite/b.weinstein/BOEM/classification/crops/
+under_sample_ratio: 0
+trainer:
+  fast_dev_run: False
+  max_epochs: 1
+  lr: 0.00001
+  batch_size: 16
+  workers: 10
12 changes: 6 additions & 6 deletions conf/config.yaml
@@ -34,7 +34,7 @@ predict:
   patch_size: 1000
   patch_overlap: 0
   min_score: 0.4
-  batch_size: 48
+  batch_size: 32

 pipeline:
   confidence_threshold: 0.9
@@ -54,14 +54,14 @@ detection_model:
   labels:
     - "Object"
   trainer:
-    batch_size: 4
+    batch_size: 12
     train:
       fast_dev_run: False
-      epochs: 10
-      lr: 0.000001
-      workers: 0
+      epochs: 20
+      lr: 0.00001
+      workers: 10
     validation:
-      val_accuracy_interval: 3
+      val_accuracy_interval: 5

pipeline_evaluation:
   detect_ground_truth_dir: /blue/ewhite/b.weinstein/BOEM/annotations/validation
44 changes: 29 additions & 15 deletions src/classification.py
@@ -4,6 +4,7 @@
 from PIL import Image

 from deepforest.model import CropModel
+import torch

 # Local imports
 from src.label_studio import gather_data
@@ -58,7 +59,7 @@ def load(checkpoint=None, annotations=None, checkpoint_dir=None, lr=0.0001, num_

     return loaded_model

-def train(model, train_dir, val_dir, comet_logger=None, fast_dev_run=False, max_epochs=10, batch_size=4):
+def train(model, train_dir, val_dir, comet_logger=None, fast_dev_run=False, max_epochs=10, batch_size=4, workers=0):
     """Train a model on labeled images.
     Args:
         model (CropModel): A CropModel object.
@@ -73,19 +74,32 @@ def train(model, train_dir, val_dir, comet_logger=None, fast_dev_run=False, max_
         main.deepforest: A trained deepforest model.
     """
     model.batch_size = batch_size
-    model.create_trainer(logger=comet_logger, fast_dev_run=fast_dev_run, max_epochs=max_epochs)
+    model.num_workers = workers
+
+    devices = torch.cuda.device_count()
+    model.create_trainer(logger=comet_logger, fast_dev_run=fast_dev_run, max_epochs=max_epochs, num_nodes=1, devices=devices)

     # Get the data stored from the write_crops processing.
     model.load_from_disk(train_dir=train_dir, val_dir=val_dir)

-    # Log the validation dataset images
+    model.label_dict = model.train_ds.class_to_idx
+    model.numeric_to_label = {v: k for k, v in model.train_ds.class_to_idx.items()}
+
+    # Log the validation dataset images, max 10 per class
+    label_count = {}
+    numeric_to_label = {v: k for k, v in model.val_ds.class_to_idx.items()}
     for image_path, label in model.val_ds.imgs:
-        label_name = model.numeric_to_label_dict[label]
-        image_name = os.path.basename(image_path)
-        comet_logger.experiment.log_image(image_path, name=f"{label_name}_{image_name}")
+        label_name = numeric_to_label[label]
+        if label_name not in label_count:
+            label_count[label_name] = 0
+        if label_count[label_name] < 10:
+            image_name = os.path.basename(image_path)
+            comet_logger.experiment.log_image(image_path, name=f"{label_name}_{image_name}")
+            label_count[label_name] += 1

-    with comet_logger.experiment.context_manager("classification"):
-        model.trainer.fit(model)
+    #with comet_logger.experiment.context_manager("classification"):
+    model.trainer.fit(model)

     # Compute confusion matrix and upload to cometml
     image_dataset = []
@@ -112,6 +126,7 @@ def train(model, train_dir, val_dir, comet_logger=None, fast_dev_run=False, max_
 def preprocess_images(model, annotations, root_dir, save_dir):
     # Remove any annotations with empty boxes
     annotations = annotations[(annotations['xmin'] != 0) & (annotations['ymin'] != 0) & (annotations['xmax'] != 0) & (annotations['ymax'] != 0)]
+
     # Remove any negative values
     annotations = annotations[(annotations['xmin'] >= 0) & (annotations['ymin'] >= 0) & (annotations['xmax'] >= 0) & (annotations['ymax'] >= 0)]
     boxes = annotations[['xmin', 'ymin', 'xmax', 'ymax']].values.tolist()
@@ -131,9 +146,11 @@ def preprocess_and_train_classification(config, train_df=None, validation_df=Non
         trained_model: Trained model object
     """
     # Get and split annotations
-    if train_df is not None:
+    if train_df is None:
         annotations = gather_data(config.classification_model.train_csv_folder)
+    else:
+        annotations = train_df

     num_classes = len(annotations["label"].unique())

     # Remove the empty frames
@@ -155,10 +172,6 @@ def preprocess_and_train_classification(config, train_df=None, validation_df=Non
         num_classes=num_classes
     )

-    # Force the label dict, DeepForest will update this soon
-    loaded_model.label_dict = {v:k for k,v in enumerate(annotations["label"].unique())}
-    loaded_model.numeric_to_label_dict = {v:k for k,v in loaded_model.label_dict.items()}
-
     # Preprocess train and validation data
     preprocess_images(
         model=loaded_model,
@@ -179,7 +192,8 @@ def preprocess_and_train_classification(config, train_df=None, validation_df=Non
         model=loaded_model,
         fast_dev_run=config.classification_model.trainer.fast_dev_run,
         max_epochs=config.classification_model.trainer.max_epochs,
-        comet_logger=comet_logger
+        comet_logger=comet_logger,
+        workers=config.classification_model.trainer.workers
     )

     return trained_model
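The label_dict wiring in train() leans on torchvision's ImageFolder convention: class_to_idx and .imgs are ImageFolder attributes, and indices follow the sorted class-folder names. A small sketch of the mapping and its inverse (hypothetical folder layout):

    from torchvision.datasets import ImageFolder

    # One folder per label, e.g. crops/train/Brown Pelican/, crops/train/Common Tern/
    train_ds = ImageFolder("crops/train")

    label_dict = train_ds.class_to_idx                        # {'Brown Pelican': 0, 'Common Tern': 1}
    numeric_to_label = {v: k for k, v in label_dict.items()}  # {0: 'Brown Pelican', 1: 'Common Tern'}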
17 changes: 10 additions & 7 deletions src/detection.py
@@ -12,6 +12,7 @@
 import pandas as pd
 from deepforest import main, visualize
 from deepforest.utilities import read_file
+import torch

 # Local imports
 from src import data_processing
@@ -29,7 +30,8 @@ def evaluate(model, test_csv, image_root_dir):
         dict: A dictionary of evaluation metrics.
     """
     # create trainer
-    model.create_trainer()
+    devices = torch.cuda.device_count()
+    model.create_trainer(num_nodes=1, devices=devices)
     model.config["validation"]["csv_file"] = test_csv
     model.config["validation"]["root_dir"] = image_root_dir
     results = model.trainer.validate(model)
@@ -160,19 +162,20 @@ def train(model, train_annotations, test_annotations, train_image_dir, comet_log
     else:
         model.config[key] = value

+    devices = torch.cuda.device_count()
     if comet_logger:
         comet_logger.experiment.log_parameters(model.config)
         comet_logger.experiment.log_table("train.csv", train_annotations)
         comet_logger.experiment.log_table("test.csv", test_annotations)
-        model.create_trainer(logger=comet_logger)
+        model.create_trainer(logger=comet_logger, num_nodes=1, devices=devices)
     else:
-        model.create_trainer()
+        model.create_trainer(num_nodes=1, devices=devices)

     with comet_logger.experiment.context_manager("train_images"):
         non_empty_train_annotations = read_file(model.config["train"]["csv_file"], root_dir=train_image_dir)
         # Sanity check for debug
         n = 5 if non_empty_train_annotations.shape[0] > 5 else non_empty_train_annotations.shape[0]
-        for filename in non_empty_train_annotations.image_path.sample():
+        for filename in non_empty_train_annotations.image_path.sample(n=n).unique():
             sample_train_annotations_for_image = non_empty_train_annotations[non_empty_train_annotations.image_path == filename]
             sample_train_annotations_for_image.root_dir = train_image_dir
             visualize.plot_annotations(sample_train_annotations_for_image, savedir=tmpdir)
@@ -181,7 +184,7 @@ def train(model, train_annotations, test_annotations, train_image_dir, comet_log
with comet_logger.experiment.context_manager("test_images"):
non_empty_validation_annotations = read_file(model.config["validation"]["csv_file"], root_dir=train_image_dir)
n = 5 if non_empty_validation_annotations.shape[0] > 5 else non_empty_validation_annotations.shape[0]
for filename in non_empty_validation_annotations.image_path.head(5):
for filename in non_empty_validation_annotations.image_path.sample(n=n).unique():
sample_validation_annotations_for_image = non_empty_validation_annotations[non_empty_validation_annotations.image_path == filename]
sample_validation_annotations_for_image.root_dir = train_image_dir
visualize.plot_annotations(sample_validation_annotations_for_image, savedir=tmpdir)
@@ -191,7 +194,7 @@ def train(model, train_annotations, test_annotations, train_image_dir, comet_log
     model.trainer.fit(model)

     with comet_logger.experiment.context_manager("post-training prediction"):
-        for image_path in test_annotations.image_path.head(5):
+        for image_path in test_annotations.image_path.unique():
             prediction = model.predict_image(path = os.path.join(train_image_dir, image_path))
             if prediction is None:
                 continue
@@ -305,7 +308,7 @@ def _predict_list_(image_paths, patch_size, patch_overlap, model_path, m=None, c
     if m is None:
         raise ValueError("A model or model_path is required for prediction.")

-    m.create_trainer(fast_dev_run=False)
+    m.create_trainer(fast_dev_run=False, devices=1)
     m.config["batch_size"] = batch_size
     predictions = []
     for image_path in image_paths:
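The sampling fix in train() is easy to miss: pandas Series.sample() defaults to n=1, so the old loop logged a single training image rather than the intended five. A toy illustration of the difference:

    import pandas as pd

    image_path = pd.Series(["a.jpg", "a.jpg", "b.jpg", "c.jpg", "c.jpg", "d.jpg"])

    image_path.sample()              # a single random element
    image_path.sample(n=5).unique()  # up to 5 values, duplicate paths collapsed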
2 changes: 1 addition & 1 deletion src/label_studio.py
@@ -12,7 +12,7 @@ def upload_to_label_studio(images, sftp_client, url, project_name, images_to_ann
     Upload images to Label Studio and import image tasks.
     Args:
-        images (list): List of image paths to upload.
+        images (list): List of image paths to upload, full paths
         url (str): The URL of the Label Studio server.
         sftp_client (paramiko.SFTPClient): The SFTP client for uploading images.
         project_name (str): The name of the Label Studio project.
5 changes: 3 additions & 2 deletions src/pipeline.py
@@ -25,7 +25,7 @@ def __init__(self, cfg: DictConfig):
         self.all_images = glob.glob(os.path.join(self.config.active_learning.image_dir, "*.jpg"))

         self.comet_logger = CometLogger(project_name=self.config.comet.project, workspace=self.config.comet.workspace)
-
+        self.comet_logger.experiment.add_tag("pipeline")

     def save_model(self, model, directory):
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -194,7 +194,8 @@ def run(self):
             chosen_uncertain_images = uncertain_predictions.sort_values(by="score", ascending=False).head(self.config.human_review.n)["image_path"].tolist()
             chosen_preannotations = uncertain_predictions[uncertain_predictions.image_path.isin(chosen_uncertain_images)]
             chosen_preannotations = [group for _, group in chosen_preannotations.groupby("image_path")]
-            label_studio.upload_to_label_studio(images=chosen_uncertain_images,
+            full_image_paths = [os.path.join(self.config.active_learning.image_dir, image) for image in chosen_uncertain_images]
+            label_studio.upload_to_label_studio(images=full_image_paths,
                 sftp_client=self.sftp_client,
                 url=self.config.label_studio.url,
                 project_name=self.config.label_studio.instances.review.project_name,
12 changes: 7 additions & 5 deletions src/pipeline_evaluation.py
@@ -12,7 +12,7 @@
 import os

 class PipelineEvaluation:
-    def __init__(self, model, crop_model, image_dir, detect_ground_truth_dir, classify_ground_truth_dir, detection_true_positive_threshold=0.85, classification_avg_score=0.5, patch_size=450, patch_overlap=0, min_score=0.5, debug=False, batch_size=16, detection_results=None, comet_logger=None):
+    def __init__(self, model, crop_model, image_dir, detect_ground_truth_dir, classify_ground_truth_dir, comet_logger, detection_true_positive_threshold=0.85, classification_avg_score=0.5, patch_size=450, patch_overlap=0, min_score=0.5, debug=False, batch_size=16, detection_results=None):
         """Initialize pipeline evaluation.
Args:
@@ -22,6 +22,7 @@ def __init__(self, model, crop_model, image_dir, detect_ground_truth_dir, classi
             detect_ground_truth_dir (str): Directory containing detection ground truth annotation CSV files
             classify_ground_truth_dir (str): Directory containing confident classification ground truth annotation CSV files
             detection_true_positive_threshold (float): IoU threshold for considering a detection a true positive
+            comet_logger: CometLogger object for logging
             classification_threshold (float): Threshold for classification confidence score
             patch_size (int): Size of image patches for prediction
             patch_overlap (int): Overlap between patches
@@ -246,12 +247,13 @@ def evaluate_detection(self):
combined_predictions["workflow"] = "detection"
self.predictions.append(combined_predictions)

# replace None with 0
combined_predictions = combined_predictions.fillna(0)
combined_predictions["label"] = "Object"
# Remove empty predictions, needs to be confirmed for edge cases
combined_predictions = combined_predictions[~combined_predictions["score"].isna()]

combined_predictions = read_file(combined_predictions, self.image_dir)
ground_truth = read_file(self.detection_annotations, self.image_dir)
ground_truth = self.detection_annotations
if "geometry" not in ground_truth.columns:
ground_truth = read_file(ground_truth, self.image_dir)

iou_results = evaluate_boxes(
combined_predictions,
4 changes: 2 additions & 2 deletions submit.sh
@@ -4,7 +4,7 @@
#SBATCH [email protected] # Where to send mail
#SBATCH --account=ewhite
#SBATCH --nodes=1 # Number of MPI ran
#SBATCH --cpus-per-task=1
#SBATCH --cpus-per-task=10
#SBATCH --mem=150GB
#SBATCH --time=48:00:00 #Time limit hrs:min:sec
#SBATCH --output=/home/b.weinstein/logs/BOEM%j.out # Standard output and error log
@@ -15,4 +15,4 @@
 source activate BOEM

 cd ~/BOEM/
-python main.py check_annotations=True active_learning.pool_limit=10 active_testing.n_images=1 active_learning.n_images=1
+srun python main.py check_annotations=True active_learning.pool_limit=10000 active_testing.n_images=100 active_learning.n_images=200
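The ten CPUs requested above line up with the workers: 10 settings added to the configs. One way to keep the two coupled (an assumption, not part of this commit) is to read the allocation from SLURM's environment:

    import os

    # SLURM_CPUS_PER_TASK is set inside an sbatch/srun allocation when --cpus-per-task is given.
    workers = int(os.environ.get("SLURM_CPUS_PER_TASK", 0))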