From f08e8b3f36246121c85e52f8e9eae30592c4236c Mon Sep 17 00:00:00 2001
From: Abhishek Kumar
Date: Sun, 25 Aug 2024 19:29:26 +0530
Subject: [PATCH 1/3] Add 701.image-captioning benchmark and its data to the benchmarks-data submodule

Signed-off-by: Abhishek Kumar
---
 benchmarks-data | 2 +-
 .../701.image-captioning/config.json | 6 ++
 .../700.image/701.image-captioning/input.py | 40 +++++++++++
 .../701.image-captioning/python/function.py | 67 +++++++++++++++++++
 .../python/requirements.txt | 3 +
 5 files changed, 117 insertions(+), 1 deletion(-)
 create mode 100644 benchmarks/700.image/701.image-captioning/config.json
 create mode 100644 benchmarks/700.image/701.image-captioning/input.py
 create mode 100644 benchmarks/700.image/701.image-captioning/python/function.py
 create mode 100644 benchmarks/700.image/701.image-captioning/python/requirements.txt

diff --git a/benchmarks-data b/benchmarks-data
index 6a17a460..f407c248 160000
--- a/benchmarks-data
+++ b/benchmarks-data
@@ -1 +1 @@
-Subproject commit 6a17a460f289e166abb47ea6298fb939e80e8beb
+Subproject commit f407c24814f623f77dcb535d882c241909ae7588
diff --git a/benchmarks/700.image/701.image-captioning/config.json b/benchmarks/700.image/701.image-captioning/config.json
new file mode 100644
index 00000000..a9c11904
--- /dev/null
+++ b/benchmarks/700.image/701.image-captioning/config.json
@@ -0,0 +1,6 @@
+{
+    "timeout": 60,
+    "memory": 256,
+    "languages": ["python"]
+}
+
diff --git a/benchmarks/700.image/701.image-captioning/input.py b/benchmarks/700.image/701.image-captioning/input.py
new file mode 100644
index 00000000..d371deac
--- /dev/null
+++ b/benchmarks/700.image/701.image-captioning/input.py
@@ -0,0 +1,40 @@
+import glob
+import os
+
+def buckets_count():
+    return (1, 1)
+
+'''
+    Generate test, small, and large workloads for the image-captioning benchmark.
+
+    :param data_dir: Directory where benchmark data is placed
+    :param size: Workload size
+    :param benchmarks_bucket: Storage container for the benchmark
+    :param input_paths: List of input paths
+    :param output_paths: List of output paths
+    :param upload_func: Upload function taking three params (bucket_idx, key, filepath)
+'''
+def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func):
+    input_files = glob.glob(os.path.join(data_dir, '*.jpg')) + glob.glob(os.path.join(data_dir, '*.png')) + glob.glob(os.path.join(data_dir, '*.jpeg'))
+
+    if not input_files:
+        raise ValueError("No input files found in the provided directory.")
+
+    for file in input_files:
+        img = os.path.relpath(file, data_dir)
+        upload_func(0, img, file)
+
+    input_config = {
+        'object': {
+            'key': img,
+            'width': 200,
+            'height': 200
+        },
+        'bucket': {
+            'bucket': benchmarks_bucket,
+            'input': input_paths[0],
+            'output': output_paths[0]
+        }
+    }
+
+    return input_config
diff --git a/benchmarks/700.image/701.image-captioning/python/function.py b/benchmarks/700.image/701.image-captioning/python/function.py
new file mode 100644
index 00000000..89d28fd7
--- /dev/null
+++ b/benchmarks/700.image/701.image-captioning/python/function.py
@@ -0,0 +1,67 @@
+import datetime
+import io
+import os
+from urllib.parse import unquote_plus
+from PIL import Image
+import torch
+from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+from . import storage
+
+# Load the pre-trained ViT-GPT2 model
+# Model URL: https://huggingface.co/nlpconnect/vit-gpt2-image-captioning
+# License: Apache 2.0 License (https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md)
+model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+image_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+
+model.eval()
+
+client = storage.storage.get_instance()
+
+def generate_caption(image_bytes):
+    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
+    pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
+
+    with torch.no_grad():
+        generated_ids = model.generate(pixel_values, max_length=16, num_beams=4)
+        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+
+    return generated_text
+
+def handler(event):
+    bucket = event.get('bucket').get('bucket')
+    input_prefix = event.get('bucket').get('input')
+    output_prefix = event.get('bucket').get('output')
+    key = unquote_plus(event.get('object').get('key'))
+
+    download_begin = datetime.datetime.now()
+    img = client.download_stream(bucket, os.path.join(input_prefix, key))
+    download_end = datetime.datetime.now()
+
+    process_begin = datetime.datetime.now()
+    caption = generate_caption(img)
+    process_end = datetime.datetime.now()
+
+    upload_begin = datetime.datetime.now()
+    caption_file_name = os.path.splitext(key)[0] + '.txt'
+    caption_file_path = os.path.join(output_prefix, caption_file_name)
+    client.upload_stream(bucket, caption_file_path, io.BytesIO(caption.encode('utf-8')))
+    upload_end = datetime.datetime.now()
+
+    download_time = (download_end - download_begin) / datetime.timedelta(microseconds=1)
+    upload_time = (upload_end - upload_begin) / datetime.timedelta(microseconds=1)
+    process_time = (process_end - process_begin) / datetime.timedelta(microseconds=1)
+
+    return {
+        'result': {
+            'bucket': bucket,
+            'key': caption_file_path
+        },
+        'measurement': {
+            'download_time': download_time,
+            'download_size': len(img),
+            'upload_time': upload_time,
+            'upload_size': len(caption.encode('utf-8')),
+            'compute_time': process_time
+        }
+    }
diff --git a/benchmarks/700.image/701.image-captioning/python/requirements.txt b/benchmarks/700.image/701.image-captioning/python/requirements.txt
new file mode 100644
index 00000000..8ddcfdf7
--- /dev/null
+++ b/benchmarks/700.image/701.image-captioning/python/requirements.txt
@@ -0,0 +1,3 @@
+transformers==4.44.2
+torch==2.4.0
+pillow==10.4.0

From 412b1b9b2b3a7dca8d68e06c907d01f94609e7c6 Mon Sep 17 00:00:00 2001
From: Abhishek Kumar
Date: Mon, 26 Aug 2024 16:55:47 +0530
Subject: [PATCH 2/3] Move benchmark to 400.inference/421.image-captioning

Signed-off-by: Abhishek Kumar
---
 .../421.image-captioning}/config.json | 0
 .../421.image-captioning}/input.py | 0
 .../421.image-captioning}/python/function.py | 0
 .../421.image-captioning}/python/requirements.txt | 0
 4 files changed, 0 insertions(+), 0 deletions(-)
 rename benchmarks/{700.image/701.image-captioning => 400.inference/421.image-captioning}/config.json (100%)
 rename benchmarks/{700.image/701.image-captioning => 400.inference/421.image-captioning}/input.py (100%)
 rename benchmarks/{700.image/701.image-captioning => 400.inference/421.image-captioning}/python/function.py (100%)
 rename benchmarks/{700.image/701.image-captioning => 400.inference/421.image-captioning}/python/requirements.txt (100%)

diff --git a/benchmarks/700.image/701.image-captioning/config.json b/benchmarks/400.inference/421.image-captioning/config.json
similarity index 100%
rename from benchmarks/700.image/701.image-captioning/config.json
rename to benchmarks/400.inference/421.image-captioning/config.json
diff --git a/benchmarks/700.image/701.image-captioning/input.py b/benchmarks/400.inference/421.image-captioning/input.py
similarity index 100%
rename from benchmarks/700.image/701.image-captioning/input.py
rename to benchmarks/400.inference/421.image-captioning/input.py
diff --git a/benchmarks/700.image/701.image-captioning/python/function.py b/benchmarks/400.inference/421.image-captioning/python/function.py
similarity index 100%
rename from benchmarks/700.image/701.image-captioning/python/function.py
rename to benchmarks/400.inference/421.image-captioning/python/function.py
diff --git a/benchmarks/700.image/701.image-captioning/python/requirements.txt b/benchmarks/400.inference/421.image-captioning/python/requirements.txt
similarity index 100%
rename from benchmarks/700.image/701.image-captioning/python/requirements.txt
rename to benchmarks/400.inference/421.image-captioning/python/requirements.txt

From 2c2f62b5e499789600c0553d907d49bf0325137f Mon Sep 17 00:00:00 2001
From: Abhishek Kumar
Date: Wed, 28 Aug 2024 15:38:24 +0530
Subject: [PATCH 3/3] Return caption directly instead of uploading it

Signed-off-by: Abhishek Kumar
---
 .../400.inference/421.image-captioning/input.py | 4 +++-
 .../421.image-captioning/python/function.py | 15 +--------------
 2 files changed, 4 insertions(+), 15 deletions(-)

diff --git a/benchmarks/400.inference/421.image-captioning/input.py b/benchmarks/400.inference/421.image-captioning/input.py
index d371deac..0aa63175 100644
--- a/benchmarks/400.inference/421.image-captioning/input.py
+++ b/benchmarks/400.inference/421.image-captioning/input.py
@@ -15,7 +15,9 @@ def buckets_count():
     :param upload_func: Upload function taking three params (bucket_idx, key, filepath)
 '''
 def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func):
-    input_files = glob.glob(os.path.join(data_dir, '*.jpg')) + glob.glob(os.path.join(data_dir, '*.png')) + glob.glob(os.path.join(data_dir, '*.jpeg'))
+    input_files = []
+    for ext in ['*.jpg', '*.jpeg', '*.png']:
+        input_files.extend(glob.glob(os.path.join(data_dir, ext)))
 
     if not input_files:
         raise ValueError("No input files found in the provided directory.")
diff --git a/benchmarks/400.inference/421.image-captioning/python/function.py b/benchmarks/400.inference/421.image-captioning/python/function.py
index 89d28fd7..b9ee4934 100644
--- a/benchmarks/400.inference/421.image-captioning/python/function.py
+++ b/benchmarks/400.inference/421.image-captioning/python/function.py
@@ -8,8 +8,6 @@
 from . import storage
 
 # Load the pre-trained ViT-GPT2 model
-# Model URL: https://huggingface.co/nlpconnect/vit-gpt2-image-captioning
-# License: Apache 2.0 License (https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md)
 model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
 image_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
 tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
@@ -31,7 +29,6 @@ def generate_caption(image_bytes):
 def handler(event):
     bucket = event.get('bucket').get('bucket')
     input_prefix = event.get('bucket').get('input')
-    output_prefix = event.get('bucket').get('output')
     key = unquote_plus(event.get('object').get('key'))
 
     download_begin = datetime.datetime.now()
@@ -42,26 +39,16 @@ def handler(event):
     caption = generate_caption(img)
     process_end = datetime.datetime.now()
 
-    upload_begin = datetime.datetime.now()
-    caption_file_name = os.path.splitext(key)[0] + '.txt'
-    caption_file_path = os.path.join(output_prefix, caption_file_name)
-    client.upload_stream(bucket, caption_file_path, io.BytesIO(caption.encode('utf-8')))
-    upload_end = datetime.datetime.now()
-
     download_time = (download_end - download_begin) / datetime.timedelta(microseconds=1)
-    upload_time = (upload_end - upload_begin) / datetime.timedelta(microseconds=1)
     process_time = (process_end - process_begin) / datetime.timedelta(microseconds=1)
 
     return {
         'result': {
-            'bucket': bucket,
-            'key': caption_file_path
+            'caption': caption,
         },
         'measurement': {
             'download_time': download_time,
             'download_size': len(img),
-            'upload_time': upload_time,
-            'upload_size': len(caption.encode('utf-8')),
             'compute_time': process_time
         }
     }
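
Note for reviewers: after PATCH 3/3 the handler no longer writes anything to the
output prefix; it returns the caption inside the result payload. The event it
consumes is the input_config dictionary built by generate_input() in input.py.
A minimal sketch of such an event; the bucket name, prefixes, and object key
below are hypothetical placeholders, not values from this patch series:

    # Hypothetical event mirroring input_config from input.py.
    event = {
        'object': {
            'key': 'sample.jpg',   # placeholder object key
            'width': 200,          # carried in the config, unused by handler()
            'height': 200
        },
        'bucket': {
            'bucket': 'benchmarks-bucket',  # placeholder bucket name
            'input': 'input',               # placeholder input prefix
            'output': 'output'              # still generated, unused after PATCH 3/3
        }
    }
    # handler(event) downloads <input prefix>/sample.jpg through the storage
    # wrapper, captions it, and returns
    # {'result': {'caption': ...}, 'measurement': {...}}.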
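The captioning pipeline itself can be smoke-tested outside the serverless
harness. Below is a self-contained sketch under the pinned requirements
(transformers 4.44.2, torch 2.4.0, pillow 10.4.0); the local image path is an
assumption, and the checkpoint is fetched from the Hugging Face hub on first
run:

    import io

    import torch
    from PIL import Image
    from transformers import AutoTokenizer, VisionEncoderDecoderModel, ViTImageProcessor

    # Same checkpoint function.py pins (Apache 2.0, per the PATCH 1/3 comments).
    MODEL_ID = "nlpconnect/vit-gpt2-image-captioning"

    model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID)
    image_processor = ViTImageProcessor.from_pretrained(MODEL_ID)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model.eval()

    def caption(image_bytes):
        # Mirrors generate_caption() in function.py: decode the image,
        # preprocess, then beam-search decode up to 16 tokens.
        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        with torch.no_grad():
            generated_ids = model.generate(pixel_values, max_length=16, num_beams=4)
        return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    if __name__ == "__main__":
        with open("sample.jpg", "rb") as f:  # hypothetical local test image
            print(caption(f.read()))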