-import time
import argparse
+import time
from pathlib import Path

import cv2
@@ -10,14 +10,18 @@

from imagenet_classes import IMAGENET2012_CLASSES

+
def parse_arguments():
    parser = argparse.ArgumentParser(description="Video inference with TensorRT")
    parser.add_argument("--output_video", type=str, help="Path to output video file")
    parser.add_argument("--input_video", type=str, help="Path to input video file")
    parser.add_argument("--webcam", action="store_true", help="Use webcam as input")
-    parser.add_argument("--live", action="store_true", help="View video live during inference")
+    parser.add_argument(
+        "--live", action="store_true", help="View video live during inference"
+    )
    return parser.parse_args()

+
def get_ort_session(model_path):
    providers = [
        (
@@ -38,6 +42,7 @@ def get_ort_session(model_path):
    ]
    return ort.InferenceSession(model_path, providers=providers)

+
def preprocess_frame(frame):
    # Use cv2 for resizing instead of PIL for better performance
    resized = cv2.resize(frame, (448, 448), interpolation=cv2.INTER_LINEAR)
@@ -46,46 +51,93 @@ def preprocess_frame(frame):
    img_numpy = np.expand_dims(img_numpy, axis=0)
    return img_numpy

+
def get_top_predictions(output, top_k=5):
    output = torch.from_numpy(output)
    probabilities, class_indices = torch.topk(output.softmax(dim=1) * 100, k=top_k)
    im_classes = list(IMAGENET2012_CLASSES.values())
    class_names = [im_classes[i] for i in class_indices[0]]
    return list(zip(class_names, probabilities[0].tolist()))

+
def draw_predictions(frame, predictions, fps):
-    # Draw FPS in the top right corner
-    cv2.putText(frame, f"FPS: {fps:.2f}", (frame.shape[1] - 150, 30),
-                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
-
+    # Draw FPS in the top right corner with dark blue background
+    fps_text = f"FPS: {fps:.2f}"
+    (text_width, text_height), _ = cv2.getTextSize(
+        fps_text, cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2
+    )
+    text_offset_x = frame.shape[1] - text_width - 10
+    text_offset_y = 30
+    box_coords = (
+        (text_offset_x - 5, text_offset_y + 5),
+        (text_offset_x + text_width + 5, text_offset_y - text_height - 5),
+    )
+    cv2.rectangle(
+        frame, box_coords[0], box_coords[1], (139, 0, 0), cv2.FILLED
+    )  # Dark blue background
+    cv2.putText(
+        frame,
+        fps_text,
+        (text_offset_x, text_offset_y),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        0.7,
+        (255, 255, 255),  # White text
+        2,
+    )
+
    # Draw predictions
    for i, (name, prob) in enumerate(predictions):
        text = f"{name}: {prob:.2f}%"
-        cv2.putText(frame, text, (10, 30 + i * 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
-
-    # Draw model name at the bottom of the frame
+        cv2.putText(
+            frame,
+            text,
+            (10, 30 + i * 30),
+            cv2.FONT_HERSHEY_SIMPLEX,
+            0.7,
+            (0, 255, 0),
+            2,
+        )
+
+    # Draw model name at the bottom of the frame with red background
    model_name = "Model: eva02_large_patch14_448"
-    text_size = cv2.getTextSize(model_name, cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2)[0]
-    text_x = (frame.shape[1] - text_size[0]) // 2
+    (text_width, text_height), _ = cv2.getTextSize(
+        model_name, cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2
+    )
+    text_x = (frame.shape[1] - text_width) // 2
    text_y = frame.shape[0] - 20
-    cv2.putText(frame, model_name, (text_x, text_y),
-                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
+    box_coords = (
+        (text_x - 5, text_y + 5),
+        (text_x + text_width + 5, text_y - text_height - 5),
+    )
+    cv2.rectangle(
+        frame, box_coords[0], box_coords[1], (0, 0, 255), cv2.FILLED
+    )  # Red background
+    cv2.putText(
+        frame,
+        model_name,
+        (text_x, text_y),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        0.7,
+        (255, 255, 255),  # White text
+        2,
+    )

    return frame

+
def process_video(input_path, output_path, session, live_view=False, use_webcam=False):
    if use_webcam:
        cap = cv2.VideoCapture(0)
    else:
        cap = cv2.VideoCapture(input_path)
-
+
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
-
+
    out = None
    if output_path:
-        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    input_name = session.get_inputs()[0].name
@@ -101,29 +153,31 @@ def process_video(input_path, output_path, session, live_view=False, use_webcam=
            break

        start_time = time.time()
-
+
        preprocessed = preprocess_frame(frame)
        output = session.run([output_name], {input_name: preprocessed})
        predictions = get_top_predictions(output[0])
-
+
        end_time = time.time()
        frame_time = end_time - start_time
        current_fps = 1 / frame_time
-
+
        frame_with_predictions = draw_predictions(frame, predictions, current_fps)
-
+
        if out:
            out.write(frame_with_predictions)
-
+
        if live_view:
-            cv2.imshow('Inference', frame_with_predictions)
-            if cv2.waitKey(1) & 0xFF == ord('q'):
+            cv2.imshow("Inference", frame_with_predictions)
+            if cv2.waitKey(1) & 0xFF == ord("q"):
                break

        total_time += frame_time
        frame_count += 1

-        print(f"Processed frame {frame_count}, Time: {frame_time:.3f}s, FPS: {current_fps:.2f}")
+        print(
+            f"Processed frame {frame_count}, Time: {frame_time:.3f}s, FPS: {current_fps:.2f}"
+        )

    cap.release()
    if out:
@@ -134,10 +188,11 @@ def process_video(input_path, output_path, session, live_view=False, use_webcam=
    print(f"Average processing time per frame: {avg_time:.3f}s")
    print(f"Average FPS: {1 / avg_time:.2f}")

+
def main():
    args = parse_arguments()
    session = get_ort_session("merged_model_compose.onnx")
-
+
    if args.webcam:
        process_video(None, args.output_video, session, args.live, use_webcam=True)
    elif args.input_video:
@@ -146,5 +201,6 @@ def main():
        print("Error: Please specify either --input_video or --webcam")
        return

+
if __name__ == "__main__":
-    main()
+    main()
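
For a quick sanity check of the reformatted pipeline, the helper functions above can be exercised on a single image instead of a full video. The sketch below is illustrative only: it assumes the script is saved as video_inference.py (the filename is not shown in this diff), that merged_model_compose.onnx sits in the working directory as in main(), and that sample.jpg is any local test image.

# Minimal smoke test: push one frame through the same preprocess -> ONNX Runtime
# -> top-k pipeline that process_video() uses. Filenames below are assumptions.
import cv2

from video_inference import get_ort_session, get_top_predictions, preprocess_frame

session = get_ort_session("merged_model_compose.onnx")
input_name = session.get_inputs()[0].name
output_name = session.get_outputs()[0].name

frame = cv2.imread("sample.jpg")  # any BGR image stands in for a video frame
batch = preprocess_frame(frame)  # 448x448 resize + batch dimension, as defined above
scores = session.run([output_name], {input_name: batch})[0]

for class_name, prob in get_top_predictions(scores):
    print(f"{class_name}: {prob:.2f}%")

The end-to-end equivalent, assuming the same filename, would be python video_inference.py --input_video input.mp4 --output_video output.mp4 --live, or --webcam in place of --input_video.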