pytorch
diff --git a/‎extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift‎
Lines changed: 38 additions & 3 deletions b/‎extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift‎
Lines changed: 38 additions & 3 deletions
diff --git a/‎extension/llm/apple/ExecuTorchLLM/__tests__/resources/IMG_0005.JPG‎
1.77 MB b/‎extension/llm/apple/ExecuTorchLLM/__tests__/resources/IMG_0005.JPG‎
1.77 MB
@@ -9,11 +9,43 @@
 import ExecuTorchLLM
 import XCTest
 
+extension UIImage {
+  /// Converts the image into a planar 8-bit RGB `Image`.
+  ///
+  /// The backing `CGImage` is drawn into a 4-bytes-per-pixel RGBA bitmap, the
+  /// fourth (alpha) byte of every pixel is dropped, and the remaining bytes are
+  /// rearranged channel-major: all red values first, then all green values,
+  /// then all blue values.
+  ///
+  /// Traps with a descriptive message if the image has no `CGImage` backing or
+  /// the bitmap context cannot be created.
+  ///
+  /// NOTE(review): `cgImage` presumably does not reflect `imageOrientation`, so
+  /// photos carrying an orientation flag may come out unrotated — confirm this
+  /// is acceptable for the fixtures in use.
+  ///
+  /// - Returns: An `Image` with `channels == 3` and the pixel dimensions of the
+  ///   backing `CGImage`.
+  func asImage() -> Image {
+    guard let cgImage = cgImage else {
+      preconditionFailure("UIImage has no CGImage backing")
+    }
+    let width = cgImage.width
+    let height = cgImage.height
+    let pixelCount = width * height
+    let bytesPerPixel = 4
+    let bytesPerRow = bytesPerPixel * width
+    var pixelBytes = [UInt8](repeating: 0, count: pixelCount * bytesPerPixel)
+    // The context keeps the data pointer and writes through it during `draw`,
+    // so both the context creation and the draw must happen while the array's
+    // storage is pinned. A pointer obtained via `&pixelBytes` is only
+    // guaranteed to be valid for the duration of the initializer call itself.
+    pixelBytes.withUnsafeMutableBytes { buffer -> Void in
+      guard let context = CGContext(
+        data: buffer.baseAddress,
+        width: width,
+        height: height,
+        bitsPerComponent: 8,
+        bytesPerRow: bytesPerRow,
+        space: CGColorSpaceCreateDeviceRGB(),
+        bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue | CGBitmapInfo.byteOrder32Big.rawValue
+      ) else {
+        preconditionFailure("Failed to create a \(width)x\(height) RGBA bitmap context")
+      }
+      context.draw(cgImage, in: CGRect(x: 0, y: 0, width: width, height: height))
+    }
+    // Repack interleaved RGBA into three consecutive planes (R, G, B).
+    var rgbBytes = [UInt8](repeating: 0, count: pixelCount * 3)
+    for i in 0..<pixelCount {
+      let pixelOffset = i * bytesPerPixel
+      rgbBytes[i] = pixelBytes[pixelOffset]
+      rgbBytes[i + pixelCount] = pixelBytes[pixelOffset + 1]
+      rgbBytes[i + pixelCount * 2] = pixelBytes[pixelOffset + 2]
+    }
+    return Image(data: Data(rgbBytes), width: width, height: height, channels: 3)
+  }
+}
+
 class MultimodalRunnerTest: XCTestCase {
+  /// Feeds a text prompt plus the bundled `IMG_0005.JPG` to the runner and
+  /// checks that the generated text mentions "waterfall".
+  ///
+  /// Fails early if the model, tokenizer, or image resource is missing from
+  /// the test bundle, or if the image cannot be decoded.
+  ///
+  /// NOTE(review): `sequenceLength` is still 2, carried over from the
+  /// text-only prompt this test used before — confirm it is large enough for
+  /// an image prompt to yield the expected word.
   func test() {
     let bundle = Bundle(for: type(of: self))
     guard let modelPath = bundle.path(forResource: "llava", ofType: "pte"),
-          let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "bin") else {
+          let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "bin"),
+          let imagePath = bundle.path(forResource: "IMG_0005", ofType: "JPG"),
+          let image = UIImage(contentsOfFile: imagePath) else {
-      XCTFail("Couldn't find model or tokenizer files")
+      XCTFail("Couldn't find model or tokenizer files, or couldn't load the test image")
       return
     }
@@ -22,12 +54,15 @@ class MultimodalRunnerTest: XCTestCase {
     var text = ""
 
     do {
-      try runner.generate([MultimodalInput("hello")], sequenceLength: 2) { token in
+      try runner.generate([
+        MultimodalInput("What's this?"),
+        MultimodalInput(image.asImage()),
+      ], sequenceLength: 2) { token in
         text += token
       }
     } catch {
       XCTFail("Failed to generate text with error \(error)")
     }
-    XCTAssertEqual("hello,", text.lowercased())
+    XCTAssertTrue(text.lowercased().contains("waterfall"), "Expected the output to mention a waterfall, got: \(text)")
   }
 }