Rename image_encoder to vision_encoder to match HF naming convention (#14473)

pytorchbot · larryliu0820 · web-flow · commit 573f30dcff84 · 2025-09-22T11:23:50.000-07:00
Summary: As titled. We want to align with the `optimum-executorch` naming convention (which comes from HF `transformers`): https://github.com/huggingface/optimum-executorch/blob/main/optimum/exporters/executorch/tasks/multimodal_text_to_text.py#L238 Differential Revision: D82677835 Co-authored-by: Mengwei Liu <larryliu0820@users.noreply.github.com>
diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py
@@ -224,12 +224,12 @@ def export_all(llava_model: LlavaModel):
 
     lowered_and_edge = to_edge_transform_and_lower(
         {
-            "image_encoder": image_encoder_ep,
+            "vision_encoder": image_encoder_ep,
             "token_embedding": token_embedding_ep,
             "text_decoder": text_model_ep,
         },
         partitioner={
-            "image_encoder": [XnnpackPartitioner()],
+            "vision_encoder": [XnnpackPartitioner()],
             "text_decoder": [
                 # First partition the DQLinear nodes, then partition the rest of the nodes,
                 # to avoid multiple DQLinear nodes in the same partition,
@@ -254,7 +254,7 @@ def export_all(llava_model: LlavaModel):
             ],
             memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
             sym_shape_eval_pass={
-                "image_encoder": ConstraintBasedSymShapeEvalPass(),
+                "vision_encoder": ConstraintBasedSymShapeEvalPass(),
                 "text_decoder": ConstraintBasedSymShapeEvalPass(),
                 "token_embedding": HintBasedSymShapeEvalPass(),
             },
diff --git a/examples/models/llava/test/test_llava.py b/examples/models/llava/test/test_llava.py
@@ -105,7 +105,7 @@ def test_llava_export(self):
         start_pos += pte_embeds_before_img.shape[1]
 
         # pte prefill image
-        pte_embeds_img = llava_module.run_method("image_encoder", (resized,))[0]
+        pte_embeds_img = llava_module.run_method("vision_encoder", (resized,))[0]
         llava_module.run_method(
             "text_decoder",
             (
diff --git a/examples/models/llava/test/test_pte.py b/examples/models/llava/test/test_pte.py
@@ -56,7 +56,7 @@ def main():
 
     # pte prefill image
     logging.warning("Image encoder started")
-    pte_embeds_img = llava_module.run_method("image_encoder", (resized,))[0]
+    pte_embeds_img = llava_module.run_method("vision_encoder", (resized,))[0]
     logging.warning("Image encoder finished")
     logging.warning("Image token prefill started")
     pte_prefill_img = llava_module.run_method(
diff --git a/extension/llm/runner/constants.h b/extension/llm/runner/constants.h
@@ -20,7 +20,7 @@ inline constexpr auto kUseKVCache = "use_kv_cache";
 inline constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
 
 // Multimodal method name conventions
-inline constexpr auto kImageEncoderMethod = "image_encoder";
+inline constexpr auto kVisionEncoderMethod = "vision_encoder";
 inline constexpr auto kAudioEncoderMethod = "audio_encoder";
 inline constexpr auto kTokenEmbeddingMethod = "token_embedding";
 inline constexpr auto kTextModelMethod = "text_decoder";
diff --git a/extension/llm/runner/multimodal_prefiller.cpp b/extension/llm/runner/multimodal_prefiller.cpp
@@ -41,14 +41,46 @@ Result<uint64_t> MultimodalPrefiller::prefill(
   ::executorch::runtime::EValue encoder_output;
   if (input.is_image()) {
     Image image = input.get_image();
-    auto image_tensor = executorch::extension::from_blob(
-        image.data.data(),
-        {3, image.height, image.width},
-        ::executorch::aten::ScalarType::Byte);
+
+    auto method_meta = ET_UNWRAP(
+        module_->method_meta(kVisionEncoderMethod),
+        "Failed to get method_meta for %s",
+        kVisionEncoderMethod);
+
+    ET_CHECK_MSG(
+        method_meta.num_inputs() > 0,
+        "Image encoder should have at least 1 input");
+    auto input_meta = ET_UNWRAP(
+        method_meta.input_tensor_meta(0),
+        "Cannot get input tensor meta at index 0");
+    auto expected_dtype = input_meta.scalar_type();
+
+    if (expected_dtype == ::executorch::aten::ScalarType::Float) {
+      ET_CHECK_MSG(
+          image.is_float(),
+          "Model expects float image data, but image has uint8_t data.");
+    } else if (expected_dtype == ::executorch::aten::ScalarType::Byte) {
+      ET_CHECK_MSG(
+          image.is_uint8(),
+          "Model expects uint8_t image data, but image has float data.");
+    } else {
+      ET_LOG(
+          Error,
+          "Unsupported image encoder input dtype: %s",
+          ::executorch::runtime::toString(expected_dtype));
+      return ::executorch::runtime::Error::NotSupported;
+    }
+
+    // The model might expect a 4D tensor (NCHW), but toTensor() returns a 3D
+    // tensor (CHW). Add a batch dimension of 1 if needed.
+    auto expected_dims = input_meta.sizes();
+    auto image_tensor = ET_UNWRAP(
+        image.toTensor(/*with_batch*/ expected_dims.size() == 4),
+        "Failed to convert image to tensor");
 
     // Run image encoder
     auto image_encoder_outputs =
-        ET_UNWRAP(module_->execute(kImageEncoderMethod, image_tensor));
+        ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));
 
     encoder_output = image_encoder_outputs[0];
   } else if (input.is_audio()) {
@@ -143,8 +175,8 @@ ::executorch::runtime::Error MultimodalPrefiller::load() {
       ET_UNWRAP(module_->method_names(), "Failed to get method names");
 
   // Load image_encoder method if exists.
-  if (methods.find(kImageEncoderMethod) != methods.end()) {
-    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kImageEncoderMethod));
+  if (methods.find(kVisionEncoderMethod) != methods.end()) {
+    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kVisionEncoderMethod));
   }
 
   if (methods.find(kAudioEncoderMethod) != methods.end()) {
@@ -171,8 +203,8 @@ bool MultimodalPrefiller::is_method_loaded() {
     ET_CHECK_MSG(false, "Failed to get method names");
   }
   std::unordered_set<std::string> methods = methods_res.get();
-  if (methods.find(kImageEncoderMethod) != methods.end()) {
-    return module_->is_method_loaded(kImageEncoderMethod);
+  if (methods.find(kVisionEncoderMethod) != methods.end()) {
+    return module_->is_method_loaded(kVisionEncoderMethod);
   }
   return true;
 }

Original file line number	Diff line number	Diff line change
`@@ -105,7 +105,7 @@ def test_llava_export(self):`
`105`	`105`	`start_pos += pte_embeds_before_img.shape[1]`
`106`	`106`
`107`	`107`	`# pte prefill image`
`108`		`- pte_embeds_img = llava_module.run_method("image_encoder", (resized,))[0]`
	`108`	`+ pte_embeds_img = llava_module.run_method("vision_encoder", (resized,))[0]`
`109`	`109`	`llava_module.run_method(`
`110`	`110`	`"text_decoder",`
`111`	`111`	`(`