Updated InternVL feature size and moved constants to utils (#321)

quic-rishinr · web-flow · commit faa6e18a9265 · 2025-03-24T20:04:30.000+05:30
Signed-off-by: Rishin Raj <quic_rishinr@quicinc.com>
diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py
@@ -32,14 +32,11 @@ def __init__(self, model):
         self.language_model = self.model.language_model
 
     def forward(self, input_ids, vit_embeds, position_ids, past_key_values):
-        # TODO: Check if Hardcoding this is okay, i.e. check if this value is common for all intern models
-        IMG_CONTEXT_TOKEN = 151667
-
         input_embeds = self.model.language_model.get_input_embeddings()(input_ids)
         B, N, C = input_embeds.shape
         image_input_embeds = input_embeds.reshape(B * N, C)
         image_input_ids = input_ids.reshape(B * N)
-        selected = image_input_ids == IMG_CONTEXT_TOKEN
+        selected = image_input_ids == constants.INTERN_IMG_CONTEXT_TOKEN
         indices1 = selected.unsqueeze(0).to(torch.int64).cumsum(1) - 1
         indices0 = torch.arange(selected.unsqueeze(0).shape[0]).view(-1, 1)
         image_features_expanded = vit_embeds.reshape(-1, C).unsqueeze(0)[indices0, indices1]
@@ -73,16 +70,16 @@ def get_specializations(
             logger.warning(
                 "User should pass `num_patches` to compile API to fix the dynamic axes `pixel_values`, you can get more info by calling get_inputs_info function!, Since its not found setting its value to 13"
             )
-            num_patches = 13
+            num_patches = constants.INTERN_NUM_PATCHES
 
-        prefill_seq_len = prefill_seq_len if prefill_seq_len else 3840  # 4096-256
-        ctx_len = ctx_len if ctx_len else 4096
+        prefill_seq_len = prefill_seq_len if prefill_seq_len else constants.INTERN_PREFILL_SEQ_LEN  # 4096-256
+        ctx_len = ctx_len if ctx_len else constants.INTERN_CTX_LEN
         if img_size is None and hasattr(self.config.vision_config, "image_size"):
             img_size = getattr(self.config.vision_config, "image_size")
         elif img_size is None:
-            img_size = 448
+            img_size = constants.INTERN_IMG_SIZE
             logger.warning("Setting img_size to be 448, as it was neither passed nor found in vision_config")
-        if img_size != 448 and kv_offload:
+        if img_size != constants.INTERN_IMG_SIZE and kv_offload:
             raise NotImplementedError("Image Size other than 448 is not supported for Intern models yet.")
         vision = [
             {
@@ -159,31 +156,40 @@ def get_output_names(self, kv_offload: bool = False):
         return output_names
 
     def get_dummy_inputs(self, kv_offload: bool = False):
-        num_patches = 13
-        C = 3
         if vis_cfg := getattr(self.config, "vision_config", None):
-            img_size = getattr(vis_cfg, "image_size", 448)
+            img_size = getattr(vis_cfg, "image_size", constants.INTERN_IMG_SIZE)
         else:
-            img_size = 448
-        if img_size != 448 and kv_offload:
+            img_size = constants.INTERN_IMG_SIZE
+        if img_size != constants.INTERN_IMG_SIZE and kv_offload:
             raise NotImplementedError("Image Size other than 448 is not supported for Intern models yet.")
 
-        # Taken from the modeling files of OpenGVLab/InternVL2_5-1B
-        feature_size = int((((self.config.vision_config.hidden_size**0.5) * self.config.downsample_ratio) ** 2))
+        patch_size = getattr(self.config.vision_config, "patch_size", None)
+        downsample_ratio = getattr(self.config, "downsample_ratio", None)
+        if patch_size and downsample_ratio:
+            computed_feature_size = int(((img_size / patch_size) * downsample_ratio) ** 2)
+            if computed_feature_size != constants.INTERN_FEATURE_SIZE:
+                logger.warning(
+                    "Discrepancy detected between estimated and actual feature sizes. Could impact on functionality or accuracy"
+                )
 
         # Define shapes
         inputs_shapes = {}
         inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
         inputs_shapes["vit_embeds"] = (
-            num_patches,
-            feature_size,
+            constants.INTERN_NUM_PATCHES,
+            constants.INTERN_FEATURE_SIZE,
             self.language_model.config.hidden_size,
         )
         inputs_shapes["position_ids"] = (
             constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
             constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
-        inputs_shapes["pixel_values"] = (num_patches, C, img_size, img_size)
+        inputs_shapes["pixel_values"] = (
+            constants.INTERN_NUM_PATCHES,
+            constants.INTERN_NUM_CHANNELS,
+            img_size,
+            img_size,
+        )
 
         # Define inputs
         vision_inputs = {}
@@ -220,15 +226,12 @@ def get_dummy_inputs(self, kv_offload: bool = False):
         return inputs
 
     def forward(self, input_ids, pixel_values, position_ids, past_key_values):
-        # TODO: Check if Hardcoding this is okay, i.e. check if this value is common for all intern models
-        IMG_CONTEXT_TOKEN = 151667
-
         input_embeds = self.language_model.get_input_embeddings()(input_ids)
         vit_embeds = self.extract_feature(pixel_values)
         B, N, C = input_embeds.shape
         image_input_embeds = input_embeds.reshape(B * N, C)
         image_input_ids = input_ids.reshape(B * N)
-        selected = image_input_ids == IMG_CONTEXT_TOKEN
+        selected = image_input_ids == constants.INTERN_IMG_CONTEXT_TOKEN
         indices1 = selected.unsqueeze(0).to(torch.int64).cumsum(1) - 1
         indices0 = torch.arange(selected.unsqueeze(0).shape[0]).view(-1, 1)
         image_features_expanded = vit_embeds.reshape(-1, C).unsqueeze(0)[indices0, indices1]
diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py
@@ -48,6 +48,7 @@
     GraniteAttention,
     GraniteForCausalLM,
     GraniteModel,
+    GraniteRMSNorm,
 )
 from transformers.models.llama.modeling_llama import (
     LlamaAttention,
@@ -258,6 +259,7 @@ class CustomOpsTransform(ModuleMappingTransform):
         Phi3RMSNorm: CustomRMSNormAIC,
         Qwen2RMSNorm: CustomRMSNormAIC,
         MllamaTextRMSNorm: CustomRMSNormAIC,
+        GraniteRMSNorm: CustomRMSNormAIC,
     }
 
 
diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py
@@ -64,6 +64,16 @@ def get_models_dir():
 
 COMPILER = ["/opt/qti-aic/exec/qaic-exec", "-aic-hw", "-aic-hw-version=2.0"]
 
+# InternVL constants
+# Fixing the feature size with reference to OpenGVLab/InternVL2_5-1B, OpenGVLab/InternVL2_5-38B and OpenGVLab/InternVL2_5-78B
+INTERN_FEATURE_SIZE = 256
+INTERN_NUM_PATCHES = 13
+INTERN_IMG_SIZE = 448
+INTERN_CTX_LEN = 4096
+INTERN_PREFILL_SEQ_LEN = INTERN_CTX_LEN - 256  # 4096-256
+INTERN_NUM_CHANNELS = 3
+INTERN_IMG_CONTEXT_TOKEN = 151667
+
 
 class Constants:
     # Export Constants.

Original file line number	Diff line number	Diff line change
`@@ -48,6 +48,7 @@`
`48`	`48`	`GraniteAttention,`
`49`	`49`	`GraniteForCausalLM,`
`50`	`50`	`GraniteModel,`
	`51`	`+ GraniteRMSNorm,`
`51`	`52`	`)`
`52`	`53`	`from transformers.models.llama.modeling_llama import (`
`53`	`54`	`LlamaAttention,`
`@@ -258,6 +259,7 @@ class CustomOpsTransform(ModuleMappingTransform):`
`258`	`259`	`Phi3RMSNorm: CustomRMSNormAIC,`
`259`	`260`	`Qwen2RMSNorm: CustomRMSNormAIC,`
`260`	`261`	`MllamaTextRMSNorm: CustomRMSNormAIC,`
	`262`	`+ GraniteRMSNorm: CustomRMSNormAIC,`
`261`	`263`	`}`
`262`	`264`
`263`	`265`