
Commit 438cf23

config naming convention fix
Signed-off-by: yechank <[email protected]>
Parent commit: a6017f6

10 files changed: +106 additions, -105 deletions (lines)

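Across the touched files this commit converges the multimodal input processors on one constructor shape, with `config` as both the parameter and attribute name instead of the older `model_config` / `pretrained_config`. A minimal sketch of that shared shape (the class below is a hypothetical stand-in, not code from this commit):

from transformers import AutoTokenizer, PretrainedConfig


class ExampleInputProcessor:  # hypothetical stand-in for the processors touched here

    def __init__(self,
                 model_path: str,
                 config: PretrainedConfig,
                 tokenizer: AutoTokenizer,
                 trust_remote_code: bool = True):
        # `self.config` replaces the older `self.model_config` / `self.pretrained_config`.
        self.config = config
        self.tokenizer = tokenizer
        self.model_path = model_path
        self.trust_remote_code = trust_remote_code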

tensorrt_llm/_torch/models/modeling_gemma3vl.py

Lines changed: 11 additions & 5 deletions
@@ -4,7 +4,8 @@
 from typing import List, Optional, Tuple
 
 import torch
-from transformers import AutoProcessor, Gemma3Config, PreTrainedModel
+from transformers import (AutoProcessor, AutoTokenizer, Gemma3Config,
+                          PretrainedConfig, PreTrainedModel)
 
 from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import \
     BaseWeightMapper
@@ -35,13 +36,18 @@ def _is_disagg() -> bool:
 
 class Gemma3InputProcessor(InputProcessor):
 
-    def __init__(self, model_path, model_config, tokenizer, trust_remote_code):
+    def __init__(self,
+                 model_path: str,
+                 config: PretrainedConfig,
+                 tokenizer: AutoTokenizer,
+                 trust_remote_code: bool = True):
 
         self.tokenizer = tokenizer
         self.processor = AutoProcessor.from_pretrained(
             model_path, trust_remote_code=trust_remote_code, use_fast=True)
-        self.model_config = model_config
-        self.device = 'cuda'
+        self.config = config
+        self.device = 'cpu'
+        self.dtype = self.config.torch_dtype
 
     @nvtx_range("[Vision] preprocess")
     def _preprocess(self, inputs):
@@ -59,7 +65,7 @@ def _preprocess(self, inputs):
             images=images,
             do_rescale=do_rescale,
             return_tensors="pt",
-            device=self.device).to(dtype=torch.bfloat16)
+            device=self.device).to(dtype=self.dtype)
 
         input_ids = processor_output["input_ids"]
         pixel_values = processor_output.get("pixel_values")
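A hedged sketch of how a caller would construct the Gemma3 processor under the new keyword names; the checkpoint id below is a placeholder, not something this commit specifies:

from transformers import AutoConfig, AutoTokenizer

from tensorrt_llm._torch.models.modeling_gemma3vl import Gemma3InputProcessor

model_path = "google/gemma-3-4b-it"  # placeholder checkpoint id
config = AutoConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

processor = Gemma3InputProcessor(model_path=model_path,
                                 config=config,  # keyword was `model_config` before this commit
                                 tokenizer=tokenizer,
                                 trust_remote_code=True)
# Per the diff above, pixel values are now cast to config.torch_dtype rather than a
# hard-coded torch.bfloat16, and preprocessing defaults to the CPU device.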

tensorrt_llm/_torch/models/modeling_hyperclovax.py

Lines changed: 8 additions & 10 deletions
@@ -568,11 +568,11 @@ class HCXVisionInputProcessor(BaseMultimodalInputProcessor, InputProcessor):
 
     def __init__(self,
                  model_path: str,
-                 model_config: PretrainedConfig,
+                 config: PretrainedConfig,
                  tokenizer: AutoTokenizer,
                  trust_remote_code: bool = True):
 
-        self.pretrained_config = model_config
+        self.config = config
         self.tokenizer = tokenizer
         self.use_fast = True
         if self.tokenizer is None:
@@ -584,13 +584,13 @@ def __init__(self,
             model_path,
             trust_remote_code=trust_remote_code,
             use_fast=self.use_fast)
-        self.tllm_multimodal_token_id = self.pretrained_config.language_config[
+        self.tllm_multimodal_token_id = self.config.language_config[
             "vocab_size"] + 1
         self.vision_query_lengths = None
         self._vision_query_generator = None
 
     def get_vocab_size(self):
-        return self.pretrained_config.language_config["vocab_size"]
+        return self.config.language_config["vocab_size"]
 
     def get_num_tokens_per_image(
         self,
@@ -656,8 +656,7 @@ def _post_process(self,
         vision_query_lengths = preprocessed_image.get("vision_query_lengths",
                                                       None)
         non_vision_query_lengths = determine_non_vision_query_lengths(
-            input_ids, self.tokenizer.pad_token_id,
-            self.pretrained_config.img_start_id)
+            input_ids, self.tokenizer.pad_token_id, self.config.img_start_id)
         batch_size = input_ids.size(0)
 
         len_inputs_embeds = max([
@@ -666,19 +665,18 @@ def _post_process(self,
             non_vision_query_lengths, vision_query_lengths)
         ])
 
-        len_inputs_embeds = min(self.pretrained_config.decoder_max_length,
+        len_inputs_embeds = min(self.config.decoder_max_length,
                                 len_inputs_embeds)
 
-        image_cnts = (input_ids == self.pretrained_config.img_start_id).sum(
-            dim=1).tolist()
+        image_cnts = (input_ids == self.config.img_start_id).sum(dim=1).tolist()
 
         fused_input_ids = torch.zeros([batch_size, len_inputs_embeds],
                                       dtype=input_ids.dtype)
         for batch_idx, sample in enumerate(input_ids):
             non_vision_query_length = non_vision_query_lengths[batch_idx]
             sample = sample[:non_vision_query_length + image_cnts[batch_idx]]
 
-            mask = (sample == self.pretrained_config.img_start_id)
+            mask = (sample == self.config.img_start_id)
             img_start_ids = mask.nonzero()
             input_start, temp_start = 0, 0
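The `"vocab_size"] + 1` above places the TRT-LLM multimodal placeholder id just past the language model's vocabulary, presumably so it cannot collide with any real token id. A tiny hedged illustration with a made-up vocabulary size:

# Values are illustrative only; the real ones come from config.language_config.
language_config = {"vocab_size": 110592}
tllm_multimodal_token_id = language_config["vocab_size"] + 1  # 110593, outside the vocab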

tensorrt_llm/_torch/models/modeling_llama.py

Lines changed: 11 additions & 11 deletions
@@ -5,8 +5,8 @@
 import torch
 from PIL.Image import Image
 from torch import nn
-from transformers import (AutoProcessor, Llama4Config, Llama4VisionModel,
-                          LlamaConfig)
+from transformers import (AutoProcessor, AutoTokenizer, Llama4Config,
+                          Llama4VisionModel, LlamaConfig, PretrainedConfig)
 from transformers.modeling_utils import load_sharded_checkpoint
 from transformers.models.llama4.modeling_llama4 import Llama4MultiModalProjector
 
@@ -1045,23 +1045,23 @@ def forward(self, multimodal_params: List[MultimodalParams]):
 class Llama4InputProcessor(InputProcessor):
 
     def __init__(self,
-                 model_path,
-                 model_config,
-                 tokenizer,
+                 model_path: str,
+                 config: PretrainedConfig,
+                 tokenizer: AutoTokenizer,
                  trust_remote_code: bool = True):
         self.use_fast = True
         self.processor = AutoProcessor.from_pretrained(
             model_path,
             trust_remote_code=trust_remote_code,
             use_fast=self.use_fast)
-        self.model_config = model_config
+        self.config = config
         self.tokenizer = tokenizer
-        self.vocab_size = model_config.text_config.vocab_size
-        self.image_token_index = model_config.image_token_index
+        self.vocab_size = self.config.text_config.vocab_size
+        self.image_token_index = self.config.image_token_index
         self.fake_image_token = self.processor.fake_image_token
         self.image_token = self.processor.img_patch_token
-        self.image_token_start_index = self.model_config.boi_token_index
-        self.image_token_end_index = self.model_config.eoi_token_index
+        self.image_token_start_index = self.config.boi_token_index
+        self.image_token_end_index = self.config.eoi_token_index
 
     def attach_multimodal_embeddings(
         self, inputs: TextPrompt, multimodal_embedding: Dict[str,
@@ -1121,7 +1121,7 @@ def attach_multimodal_embeddings(
                 f"Missing required key in multimodal embedding: {e}")
 
         # Validate embedding dimensions
-        model_hidden_size = self.model_config.text_config.hidden_size
+        model_hidden_size = self.config.text_config.hidden_size
         for i, embedding in enumerate(mm_embeddings):
             if embedding.shape[-1] != model_hidden_size:
                 raise ValueError(
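The hunk at line 1121 validates user-supplied multimodal embeddings against the language model's hidden size, now read through `self.config.text_config`. A hedged, self-contained sketch of that check (all sizes are made up):

import torch

model_hidden_size = 5120  # assumed stand-in for config.text_config.hidden_size
mm_embeddings = [torch.zeros(16, 5120), torch.zeros(8, 5120)]  # toy embeddings, both valid

for i, embedding in enumerate(mm_embeddings):
    # An embedding whose trailing dimension differs from the model hidden size would
    # trip this check, mirroring the validation in the diff above.
    if embedding.shape[-1] != model_hidden_size:
        raise ValueError(
            f"Embedding {i} has hidden size {embedding.shape[-1]}, "
            f"expected {model_hidden_size}")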

tensorrt_llm/_torch/models/modeling_llava_next.py

Lines changed: 9 additions & 10 deletions
@@ -38,7 +38,7 @@ class LlavaNextInputProcessor(BaseMultimodalInputProcessor, InputProcessor):
 
     def __init__(self,
                  model_path: str,
-                 model_config: PretrainedConfig,
+                 config: PretrainedConfig,
                  tokenizer: AutoTokenizer,
                  trust_remote_code: bool = True):
         self.tokenizer = tokenizer
@@ -52,20 +52,19 @@ def __init__(self,
             model_path,
             trust_remote_code=trust_remote_code,
             use_fast=self.use_fast)
-        self.model_config = model_config
+        self.config = config
 
-        self.image_token_index = model_config.image_token_index
-        self.vocab_size = model_config.vocab_size
-        self.config = model_config.vision_config
+        self.image_token_index = config.image_token_index
+        self.vocab_size = config.vocab_size
 
     def _postprocess(
         self, input_ids: torch.Tensor, mm_features: Union[torch.Tensor,
                                                            List[torch.Tensor]]
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         # Define model specific variables here before shared logic
-        mm_tokens = torch.tensor([self.model_config.image_token_index
+        mm_tokens = torch.tensor([self.config.image_token_index
                                   ]).to(input_ids.device)
-        model_hidden_size = self.model_config.text_config.hidden_size
+        model_hidden_size = self.config.text_config.hidden_size
         start_len = end_len = 0  # for llava, need not append start/end token around each image token
         # End model specific variables
 
@@ -170,12 +169,12 @@ def get_prompt_token_ids(
             raise NotImplementedError(
                 "Only one mm_handle is supported for LlavaNext for now")
         hidden_size = mm_handles[0]['tensor_size'][1]
-        assert hidden_size == self.model_config.text_config.hidden_size, "Multimodal embedding hidden size must match model hidden size"
+        assert hidden_size == self.config.text_config.hidden_size, "Multimodal embedding hidden size must match model hidden size"
         input_ids = self.tokenizer(text_prompt,
                                    return_tensors="pt").input_ids[0]
 
-        vocab_size = self.model_config.text_config.vocab_size
-        image_token_index = self.model_config.image_token_index
+        vocab_size = self.config.text_config.vocab_size
+        image_token_index = self.config.image_token_index
 
         image_mask = input_ids == image_token_index
         image_positions = torch.where(image_mask)[0]
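One nuance in this file: before the change, `self.config` on LlavaNextInputProcessor was bound to `model_config.vision_config`, whereas afterwards it holds the full pretrained config and that alias is dropped, so nested sections are reached explicitly. A hedged sketch of the resulting access pattern using a toy composite config (all values made up):

from transformers import PretrainedConfig

config = PretrainedConfig(image_token_index=32000, vocab_size=32064)
config.vision_config = PretrainedConfig(image_size=336, patch_size=14)
config.text_config = PretrainedConfig(hidden_size=4096, vocab_size=32064)

# After the rename, nested fields go through the full config object:
patch_size = config.vision_config.patch_size  # vision tower field
hidden_size = config.text_config.hidden_size  # language model field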

tensorrt_llm/_torch/models/modeling_mistral.py

Lines changed: 12 additions & 12 deletions
@@ -219,7 +219,7 @@ class Mistral3InputProcessor(BaseMultimodalInputProcessor, InputProcessor):
     def __init__(
         self,
         model_path: str,
-        model_config: PretrainedConfig,
+        config: PretrainedConfig,
         tokenizer: Optional[AutoTokenizer],
         trust_remote_code: bool = False,
     ):
@@ -229,24 +229,24 @@ def __init__(
 
         # To abide by the `InputProcessor` interface.
         self.model_path = model_path
-        self.model_config = model_config
+        self.config = config
         self.tokenizer = tokenizer
 
-        self._processor = AutoProcessor.from_pretrained(model_path,
-                                                         use_fast=False)
+        self.processor = AutoProcessor.from_pretrained(model_path,
+                                                        use_fast=False)
 
     @torch.inference_mode()
     def __call__(
         self, inputs: TextPrompt, sampling_params: SamplingParams
     ) -> Tuple[List[int], Optional[ExtraProcessedInputs]]:
         images = inputs.get("multi_modal_data", {}).get("image")
-        do_rescale = self._processor.image_processor.do_rescale
+        do_rescale = self.processor.image_processor.do_rescale
        if images is not None and isinstance(images[0], torch.Tensor):
             # The default multimodal input loader will normalize images to [0, 1] when the requested
             # format is "pt" (pytorch tensors), but not for "pil" (PIL images).
             do_rescale = False
 
-        processed = self._processor(
+        processed = self.processor(
             text=inputs["prompt"],
             images=images,
             do_rescale=do_rescale,
@@ -282,25 +282,25 @@ def get_vocab_size(self) -> int:
         """Return the vocab size of the model."""
         # Unlike some other VLMs, mistral3's vocab size is stored in its `text_config`, not the top-level
         # config.
-        return self.model_config.text_config.vocab_size
+        return self.config.text_config.vocab_size
 
     def get_mm_token_ids(self) -> torch.Tensor:
         """Get the IDs of all multimodal tokens (placeholders and special tokens alike)."""
         return torch.tensor([
             # This is the `[IMG]` token id inserted into the prompt that should be replaced with image
             # embeddings.
-            self._processor.image_token_id,
+            self.processor.image_token_id,
             # This is the `[IMG_BREAK]` token id at the end of every "row".
-            self._processor.image_break_token_id,
+            self.processor.image_break_token_id,
             # This is the `[IMG_END]` token id to signify the end of an image.
-            self._processor.image_end_token_id,
+            self.processor.image_end_token_id,
         ])
 
     def get_mm_special_token_ids(self) -> torch.Tensor:
         """Get the IDs of special multimodal tokens (placeholders not included)."""
         return torch.tensor([
-            self._processor.image_break_token_id,
-            self._processor.image_end_token_id,
+            self.processor.image_break_token_id,
+            self.processor.image_end_token_id,
         ])
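Besides the config rename, this file also drops the leading underscore so the Hugging Face processor is a public `self.processor`, matching the other input processors. The ids collected by `get_mm_token_ids()` are plain attributes of that processor; a hedged sketch (the checkpoint id below is a placeholder):

from transformers import AutoProcessor

model_path = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"  # placeholder checkpoint id
processor = AutoProcessor.from_pretrained(model_path, use_fast=False)

# [IMG], [IMG_BREAK] and [IMG_END] ids, exactly the attributes read in the diff above.
print(processor.image_token_id,
      processor.image_break_token_id,
      processor.image_end_token_id)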

tensorrt_llm/_torch/models/modeling_nanov2vlm.py

Lines changed: 12 additions & 12 deletions
@@ -258,18 +258,18 @@ class NanoV2VLInputProcessor(BaseMultimodalInputProcessor, InputProcessor):
 
     def __init__(self,
                  model_path: str,
-                 model_config: transformers.PretrainedConfig,
+                 config: transformers.PretrainedConfig,
                  tokenizer: transformers.AutoTokenizer,
                  trust_remote_code: bool = True):
         if not trust_remote_code:
             raise ValueError("trust_remote_code must be True for NanoV2VL")
 
-        self.model_config = model_config
-        self.image_size = model_config.force_image_size
-        self.patch_size = model_config.patch_size
-        self.downsample_ratio = model_config.downsample_ratio
+        self.config = config
+        self.image_size = self.config.force_image_size
+        self.patch_size = self.config.patch_size
+        self.downsample_ratio = self.config.downsample_ratio
         self.spatial_merge_size = int(self.patch_size / self.downsample_ratio)
-        self.img_context_token_id = model_config.img_context_token_id
+        self.img_context_token_id = self.config.img_context_token_id
         self.num_image_token = int((self.image_size // self.patch_size)**2 *
                                    (self.downsample_ratio**2))
         self.video_pruning_ratio = VIDEO_PRUNING_RATIO
@@ -285,18 +285,18 @@ def __init__(self,
         self.processor = transformers.AutoImageProcessor.from_pretrained(
             model_path, trust_remote_code=True, use_fast=self.use_fast)
 
-        self.img_context_token = model_config.img_context_token
-        self.video_context_token = model_config.video_context_token
-        self.img_start_token = model_config.img_start_token
-        self.img_end_token = model_config.img_end_token
-        self.dtype = model_config.torch_dtype
+        self.img_context_token = self.config.img_context_token
+        self.video_context_token = self.config.video_context_token
+        self.img_start_token = self.config.img_start_token
+        self.img_end_token = self.config.img_end_token
+        self.dtype = self.config.torch_dtype
         self.image_start_token_id = self.tokenizer.encode(
             self.img_start_token, add_special_tokens=False)[0]
         self.image_end_token_id = self.tokenizer.encode(
             self.img_end_token, add_special_tokens=False)[0]
 
     def get_vocab_size(self):
-        return self.model_config.llm_config.vocab_size
+        return self.config.llm_config.vocab_size
 
     def get_mm_special_token_ids(self) -> torch.Tensor:
         " Return multimodal special token ids for NanoV2VL. "

tensorrt_llm/_torch/models/modeling_phi4mm.py

Lines changed: 3 additions & 3 deletions
@@ -759,13 +759,13 @@ class Phi4MMInputProcessor(BaseMultimodalInputProcessor, InputProcessor):
 
     def __init__(self,
                  model_path: str,
-                 model_config: transformers.PretrainedConfig,
+                 config: transformers.PretrainedConfig,
                  tokenizer: transformers.AutoTokenizer,
                  trust_remote_code: bool = True):
         if not trust_remote_code:
             raise ValueError("trust_remote_code must be True for Phi4MM")
 
-        self.model_config = model_config
+        self.config = config
         self.device = 'cpu'
 
         self.tokenizer = tokenizer
@@ -790,7 +790,7 @@ def __init__(self,
             self.processor.image_processor,
         )
 
-        self.dtype = model_config.torch_dtype
+        self.dtype = self.config.torch_dtype
 
     def get_mm_token_ids(self) -> Optional[torch.Tensor]:
         return torch.tensor([_IMAGE_SPECIAL_TOKEN_ID, _AUDIO_SPECIAL_TOKEN_ID],
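For Phi4MM the multimodal placeholder ids are module-level constants rather than config fields, so only the config attribute itself is renamed here. A hedged sketch of what the helper returns (the concrete id values are assumptions for illustration, not taken from this diff):

import torch

_IMAGE_SPECIAL_TOKEN_ID = 200010  # assumed value, for illustration only
_AUDIO_SPECIAL_TOKEN_ID = 200011  # assumed value, for illustration only

mm_token_ids = torch.tensor([_IMAGE_SPECIAL_TOKEN_ID, _AUDIO_SPECIAL_TOKEN_ID],
                            dtype=torch.int32)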
