refactor comet logger
bw4sz committed Feb 27, 2025
1 parent 3c7188f commit f194dcc
Showing 10 changed files with 347 additions and 268 deletions.
18 changes: 14 additions & 4 deletions conf/config.yaml
@@ -11,15 +11,22 @@ force_upload: False
force_training: True

label_studio:
project_name_train: "Bureau of Ocean Energy Management - Training"
project_name_validation: "Bureau of Ocean Energy Management - Validation"
url: "https://labelstudio.naturecast.org/"
folder_name: "/pgsql/retrieverdash/everglades-label-studio/everglades-data"
images_to_annotate_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27
annotated_images_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
csv_dir_train: /blue/ewhite/b.weinstein/BOEM/annotations/train
csv_dir_validation: /blue/ewhite/b.weinstein/BOEM/annotations/validation

instances:
train:
csv_dir: /blue/ewhite/b.weinstein/BOEM/annotations/train
project_name: "Bureau of Ocean Energy Management - Training"
validation:
csv_dir: /blue/ewhite/b.weinstein/BOEM/annotations/validation
project_name: "Bureau of Ocean Energy Management - Validation"
review:
csv_dir: /blue/ewhite/b.weinstein/BOEM/annotations/review
project_name: "Bureau of Ocean Energy Management - Review"
predict:
patch_size: 2000
patch_overlap: 0
@@ -35,7 +42,7 @@ propagate:
distance_threshold_pixels: 50

detection_model:
checkpoint: "/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/checkpoints/900710af16d9431aaef84bf74034a48c.pl"
checkpoint: "/blue/ewhite/b.weinstein/BOEM/UBFAI Images with Detection Data/checkpoints/d3a74b0852df4d3ba9a6503660fcc9f1.pl"
checkpoint_dir: /blue/ewhite/b.weinstein/BOEM/detection/checkpoints
train_csv_folder: /blue/ewhite/b.weinstein/BOEM/annotations/train/
train_image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
@@ -104,3 +111,6 @@ active_testing:
patch_size: 2000
patch_overlap: 0
min_score: 0.2

human_review:
n_images: 10
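
The `label_studio` block now groups per-split settings under `instances`, so train, validation, and the new review split each carry their own `csv_dir` and `project_name`. A minimal sketch of consuming that structure, assuming the YAML above is loaded with OmegaConf (the loop is illustrative, not part of this commit):

```python
from omegaconf import OmegaConf

config = OmegaConf.load("conf/config.yaml")
for name, instance in config.label_studio.instances.items():
    # name is "train", "validation", or "review"; each instance carries
    # its own Label Studio project and annotation directory
    print(name, instance.project_name, instance.csv_dir)
```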
140 changes: 125 additions & 15 deletions src/active_learning.py
@@ -5,7 +5,7 @@
import dask.array as da
import pandas as pd

def choose_train_images(evaluation, image_dir, strategy, n=10, patch_size=512, patch_overlap=0.1, min_score=0.1, model=None, model_path=None, classification_model=None, dask_client=None, target_labels=None, pool_limit=1000, batch_size=16):
def choose_train_images(evaluation, image_dir, strategy, n=10, patch_size=512, patch_overlap=0.1, min_score=0.1, model=None, model_path=None, classification_model=None, dask_client=None, target_labels=None, pool_limit=1000, batch_size=16, selected_test_images=[], comet_logger=None):
"""
Choose images to annotate.
Args:
@@ -26,8 +26,11 @@ def choose_train_images(evaluation, image_dir, strategy, n=10, patch_size=512, p
target_labels (list, optional): A list of target labels to filter images by. Defaults to None.
pool_limit (int, optional): The maximum number of images to consider. Defaults to 1000.
batch_size (int, optional): The batch size for prediction. Defaults to 16.
selected_test_images (list, optional): A list of test images that have already been selected. Defaults to [].
comet_logger (CometLogger, optional): A CometLogger object. Defaults to None.
Returns:
tuple: A tuple containing a list of chosen image paths and a DataFrame of preannotations.
list: A list of image paths.
pd.DataFrame: A DataFrame of preannotations.
"""
pool = glob.glob(os.path.join(image_dir,"*.jpg")) # Get all images in the data directory

@@ -44,6 +47,9 @@ def choose_train_images(evaluation, image_dir, strategy, n=10, patch_size=512, p
if len(pool) > pool_limit:
pool = random.sample(pool, pool_limit)

# Don't allow any test images that have just been selected to be chosen
pool = [x for x in pool if x not in selected_test_images]

if strategy=="random":
chosen_images = random.sample(pool, n)
return chosen_images
@@ -75,13 +81,12 @@ def update_sys_path():
preannotations = detection.predict(m=model, image_paths=pool, patch_size=patch_size, patch_overlap=patch_overlap, batch_size=batch_size)
preannotations = pd.concat(preannotations)

if comet_logger:
comet_logger.log_table("active_training_pool", preannotations)

# Print the number of preannotations before removing min score
print("There are {} preannotations before removing min score".format(preannotations.shape[0]))
print("There are {} images before removing min score".format(preannotations["image_path"].nunique()))
preannotations = preannotations[preannotations["score"] >= min_score]
print("There are {} preannotations after removing min score".format(preannotations.shape[0]))
print("There are {} images after removing min score".format(preannotations["image_path"].nunique()))


if strategy == "most-detections":
# Sort images by total number of predictions
chosen_images = preannotations.groupby("image_path").size().sort_values(ascending=False).head(n).index.tolist()
@@ -97,9 +102,12 @@ def update_sys_path():
else:
raise ValueError("Invalid strategy. Must be one of 'random', 'most-detections', or 'target-labels'.")

return chosen_images, preannotations
# Get preannotations for chosen images
chosen_preannotations = preannotations[preannotations["image_path"].isin(chosen_images)]

return chosen_images, chosen_preannotations
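
The new `selected_test_images` argument keeps just-chosen test images out of the training pool. One aside: `selected_test_images=[]` is a mutable default; it is harmless here because the list is only read, but a `None` default is the conventional pattern. A minimal, illustrative helper (not from this commit) showing both the exclusion and that pattern:

```python
def filter_pool(pool, selected_test_images=None):
    """Drop images already chosen for the test split."""
    if selected_test_images is None:  # avoids the mutable-default pitfall
        selected_test_images = []
    return [path for path in pool if path not in selected_test_images]

print(filter_pool(["a.jpg", "b.jpg"], ["b.jpg"]))  # ['a.jpg']
```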

def choose_test_images(image_dir, strategy, n=10, patch_size=512, patch_overlap=0, min_score=0.5, model=None, model_path=None, dask_client=None, target_labels=None, pool_limit=1000, batch_size=1):
def choose_test_images(image_dir, strategy, n=10, patch_size=512, patch_overlap=0, min_score=0.5, model=None, model_path=None, dask_client=None, target_labels=None, pool_limit=1000, batch_size=1, comet_logger=None):
"""Choose images to annotate.
Args:
@@ -118,8 +126,10 @@ def choose_test_images(image_dir, strategy, n=10, patch_size=512, patch_overlap=
target_labels (list, optional): A list of target labels to filter images by. Defaults to None.
pool_limit (int, optional): The maximum number of images to consider. Defaults to 1000.
batch_size (int, optional): The batch size for prediction. Defaults to 1.
comet_logger (CometLogger, optional): A CometLogger object. Defaults to None.
Returns:
list: A list of image paths.
pd.DataFrame: A DataFrame of preannotations.
"""
pool = glob.glob(os.path.join(image_dir,"*")) # Get all images in the data directory
# Remove .csv files from the pool
@@ -138,7 +148,7 @@ def choose_test_images(image_dir, strategy, n=10, patch_size=512, patch_overlap=
if strategy=="random":
chosen_images = random.sample(pool, n)
preannotations = None
return chosen_images
return chosen_images, None
elif strategy in ["most-detections","target-labels"]:
# Predict all images
if model_path is None:
@@ -167,6 +177,9 @@ def update_sys_path():
preannotations = detection.predict(model=model, image_paths=pool, patch_size=patch_size, patch_overlap=patch_overlap, batch_size=batch_size)
preannotations = pd.concat(preannotations)

if comet_logger:
comet_logger.log_table("active_testing_pool", preannotations)

print("There are {} preannotations before removing min score".format(preannotations.shape[0]))
preannotations = preannotations[preannotations["score"] >= min_score]

@@ -183,9 +196,11 @@ def update_sys_path():
else:
raise ValueError("Invalid strategy. Must be one of 'random', 'most-detections', or 'target-labels'.")

return chosen_images, preannotations
# Get preannotations for chosen images
chosen_preannotations = preannotations[preannotations["image_path"].isin(chosen_images)]
return chosen_images, chosen_preannotations
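
Together the two functions now support a test-first ordering: choose test images, then pass them to `choose_train_images` so no image lands in both splits. A hypothetical usage sketch, assuming the module lives at `src/active_learning.py` and that a trained model and Comet logger already exist (all argument values are placeholders):

```python
from src.active_learning import choose_test_images, choose_train_images

test_images, test_preannotations = choose_test_images(
    image_dir="/path/to/images",
    strategy="most-detections",
    n=10,
    model=trained_model,                # assumed trained detection model
    comet_logger=comet_logger,          # optional; logs the prediction pool table
)
train_images, train_preannotations = choose_train_images(
    evaluation=None,                    # placeholder; evaluation metrics dict
    image_dir="/path/to/images",
    strategy="target-labels",
    target_labels=["Bird"],             # hypothetical label
    selected_test_images=test_images,   # new in this commit
    model=trained_model,
    comet_logger=comet_logger,
)
```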

def predict_and_divide(detection_model, classification_model, image_paths, patch_size, patch_overlap, confident_threshold, min_score, batch_size):
def human_review(detection_model, classification_model, image_paths, patch_size, patch_overlap, confident_threshold, min_score, batch_size):
"""
Predict on images and divide into confident and uncertain predictions.
Args:
@@ -201,7 +216,6 @@ def predict_and_divide(detection_model, classification_model, image_paths, patch
Returns:
tuple: A tuple of confident and uncertain predictions.
"""

# Check for existing predictions
if existing_predictions is not None:
image_basenames = [os.path.basename(image_path) for image_path in image_paths]
@@ -226,9 +240,105 @@ def predict_and_divide(detection_model, classification_model, image_paths, patch
# Split predictions into confident and uncertain
uncertain_predictions = combined_predictions[
combined_predictions["score"] <= confident_threshold]

confident_predictions = combined_predictions[
~combined_predictions["image_path"].isin(
uncertain_predictions["image_path"])]

return confident_predictions, uncertain_predictions
return confident_predictions, uncertain_predictions
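
The split operates at image granularity: one box at or below `confident_threshold` routes its entire image to review. A self-contained toy illustration of that rule:

```python
import pandas as pd

predictions = pd.DataFrame({
    "image_path": ["a.jpg", "a.jpg", "b.jpg"],
    "score":      [0.95,    0.40,    0.90],
})
confident_threshold = 0.5

uncertain = predictions[predictions["score"] <= confident_threshold]
confident = predictions[~predictions["image_path"].isin(uncertain["image_path"])]
# "a.jpg" has one box at 0.40, so both of its boxes go to human review;
# "b.jpg" is fully confident.
```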

def generate_training_pool_predictions(image_dir, patch_size=512, patch_overlap=0.1, min_score=0.1, model=None, model_path=None, dask_client=None, batch_size=16, comet_logger=None):
"""
Generate predictions for the training pool.
Args:
image_dir (str): The path to a directory of images.
patch_size (int, optional): The size of the image patches to predict on. Defaults to 512.
patch_overlap (float, optional): The amount of overlap between image patches. Defaults to 0.1.
min_score (float, optional): The minimum score for a prediction to be included. Defaults to 0.1.
model (main.deepforest, optional): A trained deepforest model. Defaults to None.
model_path (str, optional): The path to the model checkpoint file. Defaults to None. Only used in combination with dask.
dask_client (dask.distributed.Client, optional): A Dask client for parallel processing. Defaults to None.
batch_size (int, optional): The batch size for prediction. Defaults to 16.
comet_logger (CometLogger, optional): A CometLogger object. Defaults to None.
Returns:
pd.DataFrame: A DataFrame of predictions.
"""
pool = glob.glob(os.path.join(image_dir, "*.jpg")) # Get all images in the data directory

# Remove .csv files from the pool
pool = [image for image in pool if not image.endswith('.csv')]

# Remove crop dir
try:
pool.remove(os.path.join(image_dir, "crops"))
except ValueError:
pass

if dask_client:
# load model on each client
def update_sys_path():
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
dask_client.run(update_sys_path)

# Load model on each client
dask_pool = da.from_array(pool, chunks=len(pool) // len(dask_client.ncores()))
blocks = dask_pool.to_delayed().ravel()
block_futures = []
for block in blocks:
block_future = dask_client.submit(detection.predict, image_paths=block.compute(), patch_size=patch_size, patch_overlap=patch_overlap, model_path=model_path)
block_futures.append(block_future)
# Get results
dask_results = []
for block_result in block_futures:
block_result = block_result.result()
dask_results.append(pd.concat(block_result))
preannotations = pd.concat(dask_results)
else:
preannotations = detection.predict(m=model, image_paths=pool, patch_size=patch_size, patch_overlap=patch_overlap, batch_size=batch_size)
preannotations = pd.concat(preannotations)

if comet_logger:
comet_logger.log_table("active_training_pool", preannotations)

# Remove predictions below the minimum score
preannotations = preannotations[preannotations["score"] >= min_score]

return preannotations
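
The dask branch above splits the pool into one chunk per worker core, submits `detection.predict` per chunk, and concatenates the results. A runnable toy version of that scatter-and-gather pattern, with a stand-in for `detection.predict`:

```python
from dask.distributed import Client, LocalCluster

def predict_block(image_paths):
    """Stand-in for detection.predict: one fake prediction per image."""
    return [{"image_path": p, "score": 0.9} for p in image_paths]

if __name__ == "__main__":
    client = Client(LocalCluster(n_workers=2, threads_per_worker=1))
    pool = [f"img_{i}.jpg" for i in range(8)]
    n_chunks = len(client.ncores())          # one chunk per worker core
    chunks = [pool[i::n_chunks] for i in range(n_chunks)]
    futures = [client.submit(predict_block, chunk) for chunk in chunks]
    results = [future.result() for future in futures]
    client.close()
```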

def select_train_images(preannotations, strategy, n=10, target_labels=None):
"""
Select images to annotate based on the strategy.
Args:
preannotations (pd.DataFrame): A DataFrame of predictions.
strategy (str): The strategy for choosing images. Available strategies are:
- "random": Choose images randomly from the pool.
- "most-detections": Choose images with the most detections based on predictions.
- "target-labels": Choose images with target labels.
n (int, optional): The number of images to choose. Defaults to 10.
target_labels (list, optional): A list of target labels to filter images by. Defaults to None.
Returns:
list: A list of image paths.
pd.DataFrame: A DataFrame of preannotations for the chosen images.
"""
if strategy == "random":
chosen_images = random.sample(preannotations["image_path"].unique().tolist(), n)
elif strategy == "most-detections":
# Sort images by total number of predictions
chosen_images = preannotations.groupby("image_path").size().sort_values(ascending=False).head(n).index.tolist()
elif strategy == "target-labels":
if target_labels is None:
raise ValueError("Target labels are required for the 'target-labels' strategy.")
# Filter images by target labels
chosen_images = preannotations[preannotations.label.isin(target_labels)].groupby("image_path")["score"].mean().sort_values(ascending=False).head(n).index.tolist()
else:
raise ValueError("Invalid strategy. Must be one of 'random', 'most-detections', or 'target-labels'.")

# Get preannotations for chosen images
chosen_preannotations = preannotations[preannotations["image_path"].isin(chosen_images)]

return chosen_images, chosen_preannotations
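
This pair replaces the old single-call path: `generate_training_pool_predictions` runs inference over the pool once, and `select_train_images` is then a cheap dataframe operation, so multiple strategies can be compared without re-predicting. A hedged usage sketch (placeholder values, assuming a trained model and logger exist):

```python
preannotations = generate_training_pool_predictions(
    image_dir="/path/to/images",
    patch_size=2000,
    min_score=0.2,
    model=detection_model,        # assumed trained detection model
    comet_logger=comet_logger,    # optional
)
# Inference ran once; each selection below is just a dataframe operation.
random_images, _ = select_train_images(preannotations, strategy="random", n=10)
busy_images, _ = select_train_images(preannotations, strategy="most-detections", n=10)
```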
18 changes: 8 additions & 10 deletions src/classification.py
@@ -60,30 +60,26 @@ def load(checkpoint=None, annotations=None, checkpoint_dir=None, lr=0.0001, num_

return loaded_model

def train(model, train_dir, val_dir, comet_workspace=None, comet_project=None, fast_dev_run=False, max_epochs=10):
def train(model, train_dir, val_dir, comet_logger=None, fast_dev_run=False, max_epochs=10):
"""Train a model on labeled images.
Args:
model (CropModel): A CropModel object.
train_dir (str): The directory containing the training images.
val_dir (str): The directory containing the validation images.
fast_dev_run (bool): Whether to run a fast development run.
max_epochs (int): The maximum number of epochs to train for.
comet_logger (CometLogger): A CometLogger object.
Returns:
main.deepforest: A trained deepforest model.
"""

if comet_project:
comet_logger = CometLogger(project_name=comet_project, workspace=comet_workspace)
comet_logger.experiment.add_tags(["classification"])
else:
comet_logger = None

model.create_trainer(logger=comet_logger, fast_dev_run=fast_dev_run, max_epochs=max_epochs)

# Get the data stored from the write_crops step above.
model.load_from_disk(train_dir=train_dir, val_dir=val_dir)
model.trainer.fit(model)

with comet_logger.context_manager("classification"):
model.trainer.fit(model)

return model
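
`train` no longer builds a CometLogger from workspace/project names; the caller constructs one and passes it in, and the fit call runs inside a "classification" Comet context (the new body assumes a logger is supplied). A sketch of wiring that up, assuming pytorch_lightning's `CometLogger` with placeholder workspace and project values:

```python
from pytorch_lightning.loggers import CometLogger

comet_logger = CometLogger(
    workspace="my-workspace",           # placeholder
    project_name="boem-classification", # placeholder
)
comet_logger.experiment.add_tags(["classification"])  # mirrors the old in-function tagging

model = train(
    model=crop_model,                   # assumed CropModel instance
    train_dir="/path/to/crops/train",
    val_dir="/path/to/crops/val",
    comet_logger=comet_logger,
    max_epochs=10,
)
```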

@@ -97,12 +93,13 @@ def preprocess_and_train_classification(config, validation_df=None):
labels = annotations["label"].values
model.write_crops(boxes=boxes, root_dir=root_dir, images=images, labels=labels, savedir=save_dir)

def preprocess_and_train_classification(config, validation_df=None):
def preprocess_and_train_classification(config, validation_df=None, comet_logger=None):
"""Preprocess data and train a crop model.
Args:
config: Configuration object containing training parameters
validation_df (pd.DataFrame): A DataFrame containing validation annotations.
comet_logger: CometLogger object for logging experiments
Returns:
trained_model: Trained model object
"""
@@ -153,6 +150,7 @@ def preprocess_and_train_classification(config, validation_df=None):
comet_project=config.comet.project,
fast_dev_run=config.classification_model.trainer.fast_dev_run,
max_epochs=config.classification_model.trainer.max_epochs,
comet_logger=comet_logger
)

return trained_model