
Put back Letterbox transform #212

Merged: 9 commits, Jun 23, 2024
48 changes: 47 additions & 1 deletion pyroengine/utils.py
@@ -4,10 +4,11 @@
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.


+import cv2  # type: ignore[import-untyped]
import numpy as np
from tqdm import tqdm  # type: ignore[import-untyped]

-__all__ = ["nms", "xywh2xyxy", "DownloadProgressBar"]
+__all__ = ["nms", "xywh2xyxy", "DownloadProgressBar", "letterbox"]


def xywh2xyxy(x: np.ndarray):
@@ -19,6 +20,51 @@
    return y


+def letterbox(
+    im: np.ndarray, new_shape: tuple = (640, 640), color: tuple = (114, 114, 114), auto: bool = False, stride: int = 32
+):
+    """Letterbox image transform for YOLO models
+
+    Args:
+        im (np.ndarray): Input image
+        new_shape (tuple, optional): Image size. Defaults to (640, 640).
+        color (tuple, optional): Pixel fill value for the area outside the transformed image.
+            Defaults to (114, 114, 114).
+        auto (bool, optional): auto padding. Defaults to False.
+        stride (int, optional): padding stride. Defaults to 32.
+
+    Returns:
+        tuple: Letterboxed image and the (left, top) padding offsets
+    """
+    # Resize and pad image while meeting stride-multiple constraints
+    im = np.array(im)
+    shape = im.shape[:2]  # current shape [height, width]
+    if isinstance(new_shape, int):
+        new_shape = (new_shape, new_shape)
+
+    # Scale ratio (new / old)
+    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+
+    # Compute padding
+    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
+    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
+
+    if auto:  # minimum rectangle
+        dw, dh = np.mod(dw, stride), np.mod(dh, stride)  # wh padding
+
+    dw /= 2  # divide padding into 2 sides
+    dh /= 2
+
+    if shape[::-1] != new_unpad:  # resize
+        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
+    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
+    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
+    # add border
+    h, w = im.shape[:2]
+    im_b = np.zeros((h + top + bottom, w + left + right, 3)) + color
+    im_b[top : top + h, left : left + w, :] = im
+
+    return im_b.astype("uint8"), (left, top)

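For context, a minimal usage sketch of the new transform (the 1280x720 frame is an illustrative assumption, not part of the PR):

import numpy as np

from pyroengine.utils import letterbox

# A dummy 720p RGB frame; any HxWx3 uint8 array works.
frame = np.zeros((720, 1280, 3), dtype=np.uint8)

# Resize to 640x640 with gray (114) padding; the second value is the (left, top) offset.
out, (left, top) = letterbox(frame, new_shape=(640, 640))
assert out.shape == (640, 640, 3) and out.dtype == np.uint8
# r = min(640/720, 640/1280) = 0.5, so the content becomes 640x360 and the
# remaining 280 px of height are split into 140 px bands top and bottom.
assert (left, top) == (0, 140)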

def box_iou(box1: np.ndarray, box2: np.ndarray, eps: float = 1e-7):
    """
    Calculate intersection-over-union (IoU) of boxes.
33 changes: 16 additions & 17 deletions pyroengine/vision.py
@@ -8,13 +8,12 @@
from typing import Optional, Tuple
from urllib.request import urlretrieve

-import cv2  # type: ignore[import-untyped]
import numpy as np
import onnxruntime
from huggingface_hub import HfApi  # type: ignore[import-untyped]
from PIL import Image

-from .utils import DownloadProgressBar, nms, xywh2xyxy
+from .utils import DownloadProgressBar, letterbox, nms, xywh2xyxy

__all__ = ["Classifier"]

@@ -41,7 +40,7 @@ class Classifier:
        model_path: model path
    """

-    def __init__(self, model_path: Optional[str] = "data/model.onnx", base_img_size: int = 640) -> None:
+    def __init__(self, model_path: Optional[str] = "data/model.onnx", img_size: tuple = (640, 640)) -> None:
        if model_path is None:
            model_path = "data/model.onnx"

@@ -67,7 +66,7 @@ def __init__(self, model_path: Optional[str] = "data/model.onnx", base_img_size:
            self.download_model(model_path, expected_sha256)

        self.ort_session = onnxruntime.InferenceSession(model_path)
-        self.base_img_size = base_img_size
+        self.img_size = img_size

    def get_sha(self, siblings):
        # Extract the SHA256 hash from the model files metadata
@@ -99,7 +98,7 @@ def load_metadata(self, metadata_path):
            return json.load(f)
        return None

-    def preprocess_image(self, pil_img: Image.Image, new_img_size: list) -> Tuple[np.ndarray, Tuple[int, int]]:
+    def preprocess_image(self, pil_img: Image.Image) -> Tuple[np.ndarray, Tuple[int, int]]:
"""Preprocess an image for inference

Args:
Expand All @@ -111,20 +110,15 @@ def preprocess_image(self, pil_img: Image.Image, new_img_size: list) -> Tuple[np
- Padding information as a tuple of integers (pad_height, pad_width).
"""

np_img = cv2.resize(np.array(pil_img), new_img_size, interpolation=cv2.INTER_LINEAR)
np_img, pad = letterbox(np.array(pil_img), self.img_size) # Applies letterbox resize with padding
np_img = np.expand_dims(np_img.astype("float"), axis=0) # Add batch dimension
np_img = np.ascontiguousarray(np_img.transpose((0, 3, 1, 2))) # Convert from BHWC to BCHW format
np_img = np_img.astype("float32") / 255 # Normalize to [0, 1]

return np_img
return np_img, pad
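A quick shape check of the new preprocessing path, mirroring the updated test further down (the 1280x720 input is again an assumption):

import numpy as np
from PIL import Image

from pyroengine.vision import Classifier

clf = Classifier()  # downloads data/model.onnx on first use
img = Image.fromarray(np.zeros((720, 1280, 3), dtype=np.uint8))
tensor, (left, top) = clf.preprocess_image(img)
# Letterboxed to the default (640, 640), then BHWC -> BCHW and scaled to [0, 1]
assert tensor.shape == (1, 3, 640, 640) and tensor.dtype == np.float32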

    def __call__(self, pil_img: Image.Image, occlusion_mask: Optional[np.ndarray] = None) -> np.ndarray:

-        w, h = pil_img.size
-        ratio = self.base_img_size / max(w, h)
-        new_img_size = [int(ratio * w), int(ratio * h)]
-        new_img_size = [x - x % 32 for x in new_img_size]  # size need to be a multiple of 32 to fit the model
-        np_img = self.preprocess_image(pil_img, new_img_size)
+        np_img, pad = self.preprocess_image(pil_img)

        # ONNX inference
        y = self.ort_session.run(["output0"], {"images": np_img})[0][0]
@@ -136,12 +130,17 @@ def __call__(self, pil_img: Image.Image, occlusion_mask: Optional[np.ndarray] =
        # Sort by confidence
        y = y[y[:, 4].argsort()]
        y = nms(y)
+        y = y[::-1]

        # Normalize preds
        if len(y) > 0:
-            # Normalize Output
-            y[:, :4:2] /= new_img_size[0]
-            y[:, 1:4:2] /= new_img_size[1]
+            # Remove padding
+            left_pad, top_pad = pad
+            y[:, :4:2] -= left_pad
+            y[:, 1:4:2] -= top_pad
+            y[:, :4:2] /= self.img_size[1] - 2 * left_pad
+            y[:, 1:4:2] /= self.img_size[0] - 2 * top_pad
+            y = np.clip(y, 0, 1)
        else:
            y = np.zeros((0, 5))  # normalize output

@@ -162,4 +161,4 @@ def __call__(self, pil_img: Image.Image, occlusion_mask: Optional[np.ndarray] =

        y = y[keep]

-        return np.clip(y, 0, 1)
+        return y
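The padding-removal arithmetic above maps boxes from the padded 640x640 model space back to [0, 1] fractions of the original content. A small sanity check, reusing the (left, top) = (0, 140) case from the letterbox sketch above (hypothetical box coordinates):

# A box spanning the full un-padded content area of the 640x640 model input:
left_pad, top_pad = 0, 140
x1, y1, x2, y2 = 0.0, 140.0, 640.0, 500.0
# Subtract the offsets, then divide by the un-padded content size on each axis.
x1, x2 = (x1 - left_pad) / (640 - 2 * left_pad), (x2 - left_pad) / (640 - 2 * left_pad)
y1, y2 = (y1 - top_pad) / (640 - 2 * top_pad), (y2 - top_pad) / (640 - 2 * top_pad)
assert (x1, y1, x2, y2) == (0.0, 0.0, 1.0, 1.0)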
11 changes: 7 additions & 4 deletions tests/test_vision.py
@@ -30,22 +30,25 @@ def test_classifier(mock_wildfire_image):
    # Instantiate the ONNX model
    model = Classifier()
    # Check preprocessing
-    out = model.preprocess_image(mock_wildfire_image, (640, 384))
+    out, pad = model.preprocess_image(mock_wildfire_image)
    assert isinstance(out, np.ndarray) and out.dtype == np.float32
-    assert out.shape == (1, 3, 384, 640)
+    assert out.shape == (1, 3, 640, 640)
+    assert isinstance(pad, tuple)
    # Check inference
    out = model(mock_wildfire_image)
    assert out.shape == (1, 5)
    conf = np.max(out[:, 4])
    assert conf >= 0 and conf <= 1

    # Test mask
-    mask = np.ones((640, 384))
+    mask = np.ones((384, 640))
    out = model(mock_wildfire_image, mask)
+    print(out)
    assert out.shape == (1, 5)

-    mask = np.zeros((640, 384))
+    mask = np.zeros((384, 640))
    out = model(mock_wildfire_image, mask)
+    print(out)
    assert out.shape == (0, 5)
    os.remove(model_path)
    os.remove(METADATA_PATH)
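To exercise these assertions locally, something along these lines should work (assuming the repository's standard pytest setup):

pytest tests/test_vision.py -k test_classifier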