feat: Add image classifier models (#1421)

* feat: add support for image classifier models

  Integration for Ultralytics v8 classification models through Triton

* chore: upgrade to latest version of Triton server
openfoodfacts · Sep 30, 2024 · bd369da · bd369da
1 parent 89f20ab
commit bd369da
Show file tree

Hide file tree

Showing 6 changed files with 219 additions and 29 deletions.
diff --git a/Makefile b/Makefile
@@ -17,6 +17,8 @@ HOSTS=127.0.0.1 robotoff.openfoodfacts.localhost
 DOCKER_COMPOSE=docker compose --env-file=${ENV_FILE}
 DOCKER_COMPOSE_TEST=COMPOSE_PROJECT_NAME=robotoff_test COMMON_NET_NAME=po_test docker compose --env-file=${ENV_FILE}
 ML_OBJECT_DETECTION_MODELS := tf-universal-logo-detector tf-nutrition-table tf-nutriscore
+# Use bash shell for variable substitution
+SHELL := /bin/bash
 
 # Spellcheck
 SPELLCHECK_IMAGE_NAME = spellcheck-batch-vllm
@@ -148,6 +150,17 @@ dl-ingredient-detection-model:
 	tar -xzvf onnx.tar.gz --strip-component=1; \
 	rm onnx.tar.gz
 
+# Download the ONNX weights of every image classification model listed in the
+# `for` loop below. For each asset name, weights/best.onnx is fetched from the
+# matching openfoodfacts Hugging Face repository and written to
+# models/triton/<asset_name with '-' replaced by '_'>/1/model.onnx.
+# The `$${asset_name//-/_}` substitution is a bash feature (not POSIX sh).
+# NOTE(review): the `1` sub-directory is presumably the Triton model version
+# directory - confirm against the Triton model repository layout.
+# NOTE(review): wget streams to stdout (`-O -`) and the shell redirection creates
+# model.onnx before the download starts, so a failed download may leave an empty
+# or partial file behind - confirm whether this needs handling.
+dl-image-clf-models:
+	@echo "⏬ Downloading image classification model files …"
+	mkdir -p models/triton; \
+	cd models/triton; \
+	for asset_name in 'price-proof-classification'; \
+		do \
+			dir=$${asset_name//-/_}/1; \
+			mkdir -p $${dir}; \
+			wget -cO - https://huggingface.co/openfoodfacts/$${asset_name}/resolve/main/weights/best.onnx > $${dir}/model.onnx; \
+	done;
+
 init-elasticsearch:
 	@echo "Initializing elasticsearch indices"
 	${DOCKER_COMPOSE} up -d elasticsearch 2>&1

diff --git a/docker/ml-gpu.yml b/docker/ml-gpu.yml
@@ -7,11 +7,11 @@
 services:
   triton:
     restart: $RESTART_POLICY
-    image: nvcr.io/nvidia/tritonserver:24.01-py3
+    image: nvcr.io/nvidia/tritonserver:24.08-py3
     ports:
-     - ${TRITON_EXPOSE_HTTP:-8000}:8000
-     - ${TRITON_EXPOSE_GRPC:-8001}:8001
-     - ${TRITON_EXPOSE_METRICS:-8002}:8002
+      - ${TRITON_EXPOSE_HTTP:-8000}:8000
+      - ${TRITON_EXPOSE_GRPC:-8001}:8001
+      - ${TRITON_EXPOSE_METRICS:-8002}:8002
     volumes:
       - ${TRITON_MODELS_DIR:-../models/triton}:/models
     # We need to add nvidia_entrypoint.sh for the GPU to be correctly detected

diff --git a/docker/ml.yml b/docker/ml.yml
@@ -1,13 +1,7 @@
 services:
   triton:
     restart: $RESTART_POLICY
-    # This is a custom built of Triton with:
-    # - GRPC/HTTP support
-    # - CPU only (we don't have GPU in production)
-    # - Tensorflow 2 SavedModel and ONNX support
-    # This allows us to reduce significantly the image size
-    # See https://gist.github.com/raphael0202/091e521f2c79a8db8c6e9aceafb6e0b9 for build script
-    image: ghcr.io/openfoodfacts/triton:cpu
+    image: nvcr.io/nvidia/tritonserver:24.08-py3
     ports:
       - ${TRITON_EXPOSE_HTTP:-8000}:8000
       - ${TRITON_EXPOSE_GRPC:-8001}:8001

diff --git a/robotoff/app/api.py b/robotoff/app/api.py
@@ -68,14 +68,15 @@
     generate_json_ocr_url,
     get_barcode_from_url,
 )
-from robotoff.prediction import ingredient_list
+from robotoff.prediction import image_classifier, ingredient_list
 from robotoff.prediction.category import predict_category
 from robotoff.prediction.langid import predict_lang
 from robotoff.prediction.object_detection import ObjectDetectionModelRegistry
 from robotoff.products import get_image_id, get_product, get_product_dataset_etag
 from robotoff.taxonomy import is_prefixed_value, match_taxonomized_value
 from robotoff.types import (
     BatchJobType,
+    ImageClassificationModel,
     InsightType,
     JSONType,
     NeuralCategoryClassifierModel,
@@ -858,27 +859,36 @@ def on_get(self, req: falcon.Request, resp: falcon.Response):
         image_url = req.get_param("image_url", required=True)
         models: list[str] = req.get_param_as_list("models", required=True)
 
-        available_models = ObjectDetectionModelRegistry.get_available_models()
+        available_object_detection_models = (
+            ObjectDetectionModelRegistry.get_available_models()
+        )
+        available_clf_models = list(ImageClassificationModel.__members__.keys())
+        available_models = available_object_detection_models + available_clf_models
 
         for model_name in models:
             if model_name not in available_models:
                 raise falcon.HTTPBadRequest(
                     "invalid_model",
-                    "unknown model {}, available models: {}"
-                    "".format(model_name, ", ".join(available_models)),
+                    f"unknown model {model_name}, available models: {', '.join(available_models)}",
                 )
 
         output_image = req.get_param_as_bool("output_image")
 
         if output_image is None:
             output_image = False
 
-        if output_image and len(models) != 1:
-            raise falcon.HTTPBadRequest(
-                "invalid_request",
-                "a single model must be specified with the `models` parameter "
-                "when `output_image` is True",
-            )
+        if output_image:
+            if len(models) != 1:
+                raise falcon.HTTPBadRequest(
+                    "invalid_request",
+                    "a single model must be specified with the `models` parameter "
+                    "when `output_image` is True",
+                )
+            if models[0] not in available_object_detection_models:
+                raise falcon.HTTPBadRequest(
+                    "invalid_request",
+                    f"model {models[0]} does not support image output",
+                )
 
         image = get_image_from_url(
             image_url, session=http_session, error_raise=False, use_cache=True
@@ -890,15 +900,26 @@ def on_get(self, req: falcon.Request, resp: falcon.Response):
         predictions = {}
 
         for model_name in models:
-            model = ObjectDetectionModelRegistry.get(model_name)
-            result = model.detect_from_image(image, output_image=output_image)
-
-            if output_image:
-                boxed_image = cast(Image.Image, result.boxed_image)
-                image_response(boxed_image, resp)
-                return
+            if model_name in available_object_detection_models:
+                model = ObjectDetectionModelRegistry.get(model_name)
+                result = model.detect_from_image(image, output_image=output_image)
+
+                if output_image:
+                    boxed_image = cast(Image.Image, result.boxed_image)
+                    image_response(boxed_image, resp)
+                    return
+                else:
+                    predictions[model_name] = result.to_json()
             else:
-                predictions[model_name] = result.to_json()
+                model_enum = ImageClassificationModel[model_name]
+                classifier = image_classifier.ImageClassifier(
+                    model_enum.name,
+                    label_names=image_classifier.LABEL_NAMES[model_enum],
+                )
+                predictions[model_name] = [
+                    {"label": label, "score": score}
+                    for label, score in classifier.predict(image)
+                ]
 
         resp.media = {"predictions": predictions}
 

diff --git a/robotoff/prediction/image_classifier.py b/robotoff/prediction/image_classifier.py
@@ -0,0 +1,158 @@
+import math
+import time
+import typing
+
+import numpy as np
+from PIL import Image, ImageOps
+from tritonclient.grpc import service_pb2
+
+from robotoff.triton import get_triton_inference_stub
+from robotoff.types import ImageClassificationModel
+from robotoff.utils import get_logger
+
+logger = get_logger(__name__)
+
+
+# Class labels of each supported image classification model, keyed by model.
+# `ImageClassifier.predict` looks labels up by their index in the model output,
+# so the order of each list is significant.
+# NOTE(review): the order presumably mirrors the class order of the exported
+# model - confirm against the training configuration before editing.
+LABEL_NAMES = {
+    ImageClassificationModel.price_proof_classification: [
+        "OTHER",
+        "PRICE_TAG",
+        "PRODUCT_WITH_PRICE",
+        "RECEIPT",
+        "SHELF",
+        "WEB_PRINT",
+    ]
+}
+
+
+def classify_transforms(
+    img: Image.Image,
+    size: int = 224,
+    mean=(0.0, 0.0, 0.0),
+    std=(1.0, 1.0, 1.0),
+    interpolation=Image.Resampling.BILINEAR,
+    crop_fraction: float = 1.0,
+) -> np.ndarray:
+    """
+    Applies a series of image transformations including resizing, center cropping,
+    normalization, and conversion to a NumPy array.
+
+    Transformation steps is based on the one used in the Ultralytics library:
+    https://github.com/ultralytics/ultralytics/blob/main/ultralytics/data/augment.py#L2319
+
+    :param img: Input Pillow image.
+    :param size: The target size for the transformed image (shortest edge).
+    :param mean: Mean values for each RGB channel used in normalization.
+    :param std: Standard deviation values for each RGB channel used in normalization.
+    :param interpolation: Interpolation method from PIL (Image.Resampling.NEAREST,
+        Image.Resampling.BILINEAR, Image.Resampling.BICUBIC).
+    :param crop_fraction: Fraction of the image to be cropped.
+    :return: The transformed image as a NumPy array.
+    """
+    if img.mode != "RGB":
+        img = img.convert("RGB")
+
+    # Rotate the image based on the EXIF orientation if needed
+    img = typing.cast(Image.Image, ImageOps.exif_transpose(img))
+
+    # Step 1: Resize while preserving the aspect ratio
+    width, height = img.size
+
+    # Calculate scale size while preserving aspect ratio
+    scale_size = math.floor(size / crop_fraction)
+
+    aspect_ratio = width / height
+    if width < height:
+        new_width = scale_size
+        new_height = int(new_width / aspect_ratio)
+    else:
+        new_height = scale_size
+        new_width = int(new_height * aspect_ratio)
+
+    img = img.resize((new_width, new_height), interpolation)
+
+    # Step 2: Center crop
+    left = (new_width - size) // 2
+    top = (new_height - size) // 2
+    right = left + size
+    bottom = top + size
+    img = img.crop((left, top, right, bottom))
+
+    # Step 3: Convert the image to a NumPy array and scale pixel values to [0, 1]
+    img_array = np.array(img).astype(np.float32) / 255.0
+
+    # Step 4: Normalize the image
+    mean = np.array(mean, dtype=np.float32).reshape(1, 1, 3)
+    std = np.array(std, dtype=np.float32).reshape(1, 1, 3)
+    img_array = (img_array - mean) / std
+
+    # Step 5: Change the order of dimensions from (H, W, C) to (C, H, W)
+    img_array = np.transpose(img_array, (2, 0, 1))
+    return img_array
+
+
+class ImageClassifier:
+    def __init__(self, name: str, label_names: list[str]):
+        self.name: str = name
+        self.label_names = label_names
+
+    def predict(
+        self,
+        image: Image.Image,
+        triton_uri: str | None = None,
+    ) -> list[tuple[str, float]]:
+        """Run an image classification model on an image.
+
+        The model is expected to have been trained with Ultralytics library (Yolov8).
+
+        :param image: the input Pillow image
+        :param triton_uri: URI of the Triton Inference Server, defaults to
+            None. If not provided, the default value from settings is used.
+        :return: the prediction results as a list of tuples (label, confidence)
+        """
+        image_array = classify_transforms(image)
+        image_array = np.expand_dims(image_array, axis=0)
+
+        grpc_stub = get_triton_inference_stub(triton_uri)
+        request = service_pb2.ModelInferRequest()
+        request.model_name = self.name
+
+        image_input = service_pb2.ModelInferRequest().InferInputTensor()
+        image_input.name = "images"
+
+        image_input.datatype = "FP32"
+
+        image_input.shape.extend([1, 3, 224, 224])
+        request.inputs.extend([image_input])
+
+        output = service_pb2.ModelInferRequest().InferRequestedOutputTensor()
+        output.name = "output0"
+        request.outputs.extend([output])
+
+        request.raw_input_contents.extend([image_array.tobytes()])
+        start_time = time.monotonic()
+        response = grpc_stub.ModelInfer(request)
+        latency = time.monotonic() - start_time
+
+        logger.debug("Inference time for %s: %s", self.name, latency)
+
+        start_time = time.monotonic()
+        if len(response.outputs) != 1:
+            raise Exception(f"expected 1 output, got {len(response.outputs)}")
+
+        if len(response.raw_output_contents) != 1:
+            raise Exception(
+                f"expected 1 raw output content, got {len(response.raw_output_contents)}"
+            )
+
+        output_index = {output.name: i for i, output in enumerate(response.outputs)}
+        output = np.frombuffer(
+            response.raw_output_contents[output_index["output0"]],
+            dtype=np.float32,
+        ).reshape((1, len(self.label_names)))[0]
+
+        score_indices = np.argsort(-output)
+
+        latency = time.monotonic() - start_time
+        logger.debug("Post-processing time for %s: %s", self.name, latency)
+        return [(self.label_names[i], float(output[i])) for i in score_indices]
diff --git a/robotoff/types.py b/robotoff/types.py
@@ -35,6 +35,10 @@ def get_type(self) -> str:
         return "universal-logo-detector"
 
 
+class ImageClassificationModel(str, enum.Enum):
+    """Image classification models supported by Robotoff.
+
+    Models are referred to by their member *name*: the image prediction API
+    lists `__members__` as the accepted model names and passes `.name` to the
+    classifier as the Triton model name.
+    """
+
+    # NOTE(review): with the `str` mixin, `enum.auto()` presumably yields the
+    # stringified counter ("1") as value rather than the member name - confirm
+    # before relying on `.value` anywhere.
+    price_proof_classification = enum.auto()
+
+
 @enum.unique
 class NeuralCategoryClassifierModel(enum.Enum):
     keras_image_embeddings_3_0 = "keras-image-embeddings-3.0"