refactor reporting code

weecology · Feb 20, 2025 · b62a380 · b62a380
1 parent 71adaf8
commit b62a380
Show file tree

Hide file tree

Showing 5 changed files with 73 additions and 32 deletions.
diff --git a/conf/config.yaml b/conf/config.yaml
@@ -75,9 +75,9 @@ pipeline_evaluation:
   debug: False
 
 reporting:
-  report_dir: /blue/ewhite/b.weinstein/BOEM/reporting/reports
+  report_dir: /orange/ewhite/web/public/BOEM
   metadata: /blue/ewhite/b.weinstein/BOEM/reporting/metadata/metadata.csv
-  thin_factor: 500
+  thin_factor: 100
 
 
 active_learning:

diff --git a/src/pipeline.py b/src/pipeline.py
@@ -81,32 +81,31 @@ def run(self):
             detection_checkpoint_path = self.save_model(trained_detection_model,
                             self.config.detection_model.checkpoint_dir)
             classification_checkpoint_path = self.save_model(trained_classification_model,
-                            self.config.classification_model.checkpoint_dir)
-
-            pipeline_monitor = PipelineEvaluation(
-                model=trained_detection_model,
-                crop_model=trained_classification_model,
-                **self.config.pipeline_evaluation)
-
-            performance = pipeline_monitor.evaluate()
-
-            if pipeline_monitor.check_success():
-                print("Pipeline performance is satisfactory, exiting")
-                return None
+                            self.config.classification_model.checkpoint_dir)      
+
         else:
+            detection_checkpoint_path = self.config.detection_model.checkpoint
             trained_detection_model = detection.load(
                 checkpoint = self.config.detection_model.checkpoint)
 
-            if self.config.classification_model.checkpoint:
+            if self.config.classification_model.checkpoint is not None:
                 trained_classification_model = classification.load(
-                self.config.classification_model.checkpoint, checkpoint_dir=self.config.classification_model.checkpoint_dir, annotations=None)
+                    self.config.classification_model.checkpoint, checkpoint_dir=self.config.classification_model.checkpoint_dir, annotations=None)
             else:
-                trained_classification_model = None
-
-            performance = None
-            pipeline_monitor = None
-            detection_checkpoint_path = None
+                annotations = label_studio.gather_data(self.config.classification_model.train_csv_folder)
+                trained_classification_model = classification.load(
+                    checkpoint = None, checkpoint_dir=self.config.classification_model.checkpoint_dir, annotations=annotations)
+
+        pipeline_monitor = PipelineEvaluation(
+            model=trained_detection_model,
+            crop_model=trained_classification_model,
+            **self.config.pipeline_evaluation)
+
+        performance = pipeline_monitor.evaluate()
 
+        if pipeline_monitor.check_success():
+            print("Pipeline performance is satisfactory, exiting")
+            return None
         if self.config.active_learning.gpus > 1:
             dask_client = start(gpus=self.config.active_learning.gpus, mem_size="70GB")
         else:

diff --git a/src/reporting.py b/src/reporting.py
@@ -31,7 +31,6 @@ def __init__(self, report_dir, image_dir, metadata_csv, pipeline_monitor=None, m
         self.report_dir = os.path.join(report_dir, timestamp)
         self.report_file = f"{self.report_dir}/report.csv"
         self.image_dir = image_dir
-        self.sample_prediction_dir = f"{self.report_dir}/samples"
         self.model = model
         self.classification_model = classification_model
         self.patch_overlap = patch_overlap
@@ -47,7 +46,6 @@ def __init__(self, report_dir, image_dir, metadata_csv, pipeline_monitor=None, m
 
         # Check the dirs exist
         os.makedirs(self.report_dir, exist_ok=True)
-        os.makedirs(self.sample_prediction_dir, exist_ok=True)
 
         self.pipeline_monitor = pipeline_monitor
 
@@ -60,31 +58,64 @@ def concat_predictions(self):
         self.all_predictions = pd.concat(self.pipeline_monitor.predictions, ignore_index=True)
 
     def generate_report(self, create_video=False):
-        """Generate a report"""
+        """Generate a report and zip the contents
+        
+        Args:
+            create_video (bool): Whether to create visualization video
+        
+        Returns:
+            str: Path to the zipped report file
+        """
+        import zipfile
+        import os
 
+        # Generate report contents
         if self.pipeline_monitor:
             self.concat_predictions()
             self.write_predictions()
         self.write_metrics()
         if create_video:
             self.generate_video()
 
+        # Create zip file path
+        zip_path = f"{self.report_dir}.zip"
+
+        # Create zip file with just basenames
+        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+            for root, dirs, files in os.walk(self.report_dir):
+                for file in files:
+                    file_path = os.path.join(root, file)
+                    arcname = os.path.basename(file_path)
+                    zipf.write(file_path, arcname=arcname)
+
+        return zip_path
+
     def write_predictions(self):
         """Write predictions to a csv file"""
         self.concat_predictions()
-        self.all_predictions.to_csv(f"{self.report_dir}/predictions.csv", index=False)
         self.all_predictions['timestamp'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         self.all_predictions["unique_image"] = self.all_predictions["image_path"].apply(lambda x: os.path.splitext(os.path.basename(x))[0])
 
         # Connect with metadata on location
         metadata_df = pd.read_csv(self.metadata)
         merged_predictions = self.all_predictions.merge(metadata_df[["unique_image", "flight_name","date","lat","long"]], on='unique_image')
-        merged_predictions.to_csv(f"{self.report_dir}/predictions.csv", index=False)
+        merged_predictions.to_csv(f"{self.report_dir}/validation_predictions.csv", index=False)
 
         # Create shapefile
         gpd.GeoDataFrame(merged_predictions, geometry=gpd.points_from_xy(merged_predictions.long, merged_predictions.lat)).to_file(f"{self.report_dir}/predictions.shp")
 
-        return f"{self.report_dir}/predictions.csv"
+        return f"{self.report_dir}/validation_predictions.csv"
+
+    def crop_images(self, CropModel, annotations, root_dir, save_dir):
+        """Write an image crop for every usable box in ``annotations``.
+
+        This is a regular instance method: callers invoke it as
+        ``self.crop_images(CropModel=..., ...)`` with keyword arguments only,
+        so ``self`` has to be bound by the instance. (Declaring it as a
+        ``@staticmethod`` while keeping the ``self`` parameter makes that call
+        fail with a missing-argument ``TypeError``.)
+
+        Args:
+            CropModel: Object exposing ``write_crops(boxes, root_dir, images,
+                labels, savedir)``; the crops are written through it.
+            annotations: DataFrame with ``xmin``, ``ymin``, ``xmax``, ``ymax``,
+                ``image_path`` and ``label`` columns.
+            root_dir: Forwarded to ``write_crops`` as ``root_dir``.
+            save_dir: Forwarded to ``write_crops`` as ``savedir``.
+
+        Returns:
+            None. The only effect is the call to ``CropModel.write_crops``.
+        """
+        # Remove any annotations with empty boxes. A row is kept only when all
+        # four coordinates are non-zero, so a box with any zero coordinate is dropped too.
+        annotations = annotations[(annotations['xmin'] != 0) & (annotations['ymin'] != 0) & (annotations['xmax'] != 0) & (annotations['ymax'] != 0)]
+        # Remove any negative values
+        annotations = annotations[(annotations['xmin'] >= 0) & (annotations['ymin'] >= 0) & (annotations['xmax'] >= 0) & (annotations['ymax'] >= 0)]
+        # Boxes, image paths and labels stay row-aligned because they are all
+        # taken from the same filtered frame.
+        boxes = annotations[['xmin', 'ymin', 'xmax', 'ymax']].values.tolist()
+        images = annotations["image_path"].values
+        labels = annotations["label"].values
+        CropModel.write_crops(boxes=boxes, root_dir=root_dir, images=images, labels=labels, savedir=save_dir)
 
     def select_images_for_video(self):
         all_images = glob.glob(self.image_dir + "/*.jpg")
@@ -108,6 +139,12 @@ def predict_video_images(self, images):
 
         predictions = predictions[predictions.score > self.min_score]
 
+        # Save predictions
+        predictions.to_csv(f"{self.report_dir}/video_predictions.csv", index=False)
+
+        # Crop the images to the predictions
+        self.crop_images(CropModel=self.classification_model, annotations=predictions, root_dir=self.image_dir, save_dir=self.report_dir)
+
         return predictions
 
     def get_coco_datasets(self):
@@ -119,7 +156,10 @@ def generate_video(self):
         images = self.select_images_for_video()
         video_predictions = self.predict_video_images(images)
         visualizer = PredictionVisualizer(video_predictions, self.report_dir)
-        output_path = f"{self.report_dir}/predictions.mp4"
+
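+        # Name the video after the flight, i.e. the last path component of image_dir. NOTE(review): the output_path built here is overwritten by create_visualization's return value and is not passed to it; confirm the flight name is actually used.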
+        flightname = self.image_dir.split("/")[-1]
+        output_path = f"{self.report_dir}/{flightname}.mp4"
         output_path = visualizer.create_visualization(images=images)
 
         return output_path

diff --git a/src/visualization.py b/src/visualization.py
@@ -126,11 +126,13 @@ def create_visualization(
 
         height, width = first_image.shape[:2]
 
-        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+        # Use H.264 codec and lower frame rate for better compatibility
+        fourcc = cv2.VideoWriter_fourcc(*'avc1')  # Changed from mp4v to avc1
+        fps = 5  # Reduced from 30 to 5 for slower playback
         video_writer = cv2.VideoWriter(
             output_path,
             fourcc,
-            self.fps,
+            fps,
             (width, height)
         )
 
@@ -226,4 +228,4 @@ def create_summary_image(
                 2
             )
 
-        return summary 
+        return summary
diff --git a/submit.sh b/submit.sh
@@ -15,4 +15,4 @@
 source activate BOEM
 
 cd ~/BOEM/
-python main.py check_annotations=True active_learning.pool_limit=10000 active_testing.n_images=1 active_learning.n_images=100 pipeline_evaluation.debug=False
+python main.py check_annotations=True active_learning.pool_limit=10 active_testing.n_images=1 active_learning.n_images=1 pipeline_evaluation.debug=True force_training=False