
Commit 438cf23

config naming convention fix
Signed-off-by: yechank <[email protected]>
Parent commit: a6017f6

10 files changed: +106 additions, -105 deletions (lines)

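Across the touched files this commit converges the multimodal input processors on one constructor shape, with `config` as both the parameter and attribute name instead of the older `model_config` / `pretrained_config`. A minimal sketch of that shared shape (the class below is a hypothetical stand-in, not code from this commit):

from transformers import AutoTokenizer, PretrainedConfig


class ExampleInputProcessor:  # hypothetical stand-in for the processors touched here

    def __init__(self,
                 model_path: str,
                 config: PretrainedConfig,
                 tokenizer: AutoTokenizer,
                 trust_remote_code: bool = True):
        # `self.config` replaces the older `self.model_config` / `self.pretrained_config`.
        self.config = config
        self.tokenizer = tokenizer
        self.model_path = model_path
        self.trust_remote_code = trust_remote_code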

tensorrt_llm/_torch/models/modeling_gemma3vl.py

Lines changed: 11 additions & 5 deletions
@@ -4,7 +4,8 @@
 from typing import List, Optional, Tuple
 
 import torch
-from transformers import AutoProcessor, Gemma3Config, PreTrainedModel
+from transformers import (AutoProcessor, AutoTokenizer, Gemma3Config,
+                          PretrainedConfig, PreTrainedModel)
 
 from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import \
     BaseWeightMapper
@@ -35,13 +36,18 @@ def _is_disagg() -> bool:
 
 class Gemma3InputProcessor(InputProcessor):
 
-    def __init__(self, model_path, model_config, tokenizer, trust_remote_code):
+    def __init__(self,
+                 model_path: str,
+                 config: PretrainedConfig,
+                 tokenizer: AutoTokenizer,
+                 trust_remote_code: bool = True):
 
         self.tokenizer = tokenizer
         self.processor = AutoProcessor.from_pretrained(
             model_path, trust_remote_code=trust_remote_code, use_fast=True)
-        self.model_config = model_config
-        self.device = 'cuda'
+        self.config = config
+        self.device = 'cpu'
+        self.dtype = self.config.torch_dtype
 
     @nvtx_range("[Vision] preprocess")
     def _preprocess(self, inputs):
@@ -59,7 +65,7 @@ def _preprocess(self, inputs):
             images=images,
             do_rescale=do_rescale,
             return_tensors="pt",
-            device=self.device).to(dtype=torch.bfloat16)
+            device=self.device).to(dtype=self.dtype)
 
         input_ids = processor_output["input_ids"]
         pixel_values = processor_output.get("pixel_values")
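A hedged sketch of how a caller would construct the Gemma3 processor under the new keyword names; the checkpoint id below is a placeholder, not something this commit specifies:

from transformers import AutoConfig, AutoTokenizer

from tensorrt_llm._torch.models.modeling_gemma3vl import Gemma3InputProcessor

model_path = "google/gemma-3-4b-it"  # placeholder checkpoint id
config = AutoConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

processor = Gemma3InputProcessor(model_path=model_path,
                                 config=config,  # keyword was `model_config` before this commit
                                 tokenizer=tokenizer,
                                 trust_remote_code=True)
# Per the diff above, pixel values are now cast to config.torch_dtype rather than a
# hard-coded torch.bfloat16, and preprocessing defaults to the CPU device.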

tensorrt_llm/_torch/models/modeling_hyperclovax.py

Lines changed: 8 additions & 10 deletions
@@ -568,11 +568,11 @@ class HCXVisionInputProcessor(BaseMultimodalInputProcessor, InputProcessor):
 
     def __init__(self,
                  model_path: str,
-                 model_config: PretrainedConfig,
+                 config: PretrainedConfig,
                  tokenizer: AutoTokenizer,
                  trust_remote_code: bool = True):
 
-        self.pretrained_config = model_config
+        self.config = config
         self.tokenizer = tokenizer
         self.use_fast = True
         if self.tokenizer is None:
@@ -584,13 +584,13 @@ def __init__(self,
             model_path,
             trust_remote_code=trust_remote_code,
             use_fast=self.use_fast)
-        self.tllm_multimodal_token_id = self.pretrained_config.language_config[
+        self.tllm_multimodal_token_id = self.config.language_config[
             "vocab_size"] + 1
         self.vision_query_lengths = None
         self._vision_query_generator = None
 
     def get_vocab_size(self):
-        return self.pretrained_config.language_config["vocab_size"]
+        return self.config.language_config["vocab_size"]
 
     def get_num_tokens_per_image(
         self,
@@ -656,8 +656,7 @@ def _post_process(self,
         vision_query_lengths = preprocessed_image.get("vision_query_lengths",
                                                       None)
         non_vision_query_lengths = determine_non_vision_query_lengths(
-            input_ids, self.tokenizer.pad_token_id,
-            self.pretrained_config.img_start_id)
+            input_ids, self.tokenizer.pad_token_id, self.config.img_start_id)
         batch_size = input_ids.size(0)
 
         len_inputs_embeds = max([
@@ -666,19 +665,18 @@ def _post_process(self,
             non_vision_query_lengths, vision_query_lengths)
         ])
 
-        len_inputs_embeds = min(self.pretrained_config.decoder_max_length,
+        len_inputs_embeds = min(self.config.decoder_max_length,
                                 len_inputs_embeds)
 
-        image_cnts = (input_ids == self.pretrained_config.img_start_id).sum(
-            dim=1).tolist()
+        image_cnts = (input_ids == self.config.img_start_id).sum(dim=1).tolist()
 
         fused_input_ids = torch.zeros([batch_size, len_inputs_embeds],
                                       dtype=input_ids.dtype)
         for batch_idx, sample in enumerate(input_ids):
             non_vision_query_length = non_vision_query_lengths[batch_idx]
             sample = sample[:non_vision_query_length + image_cnts[batch_idx]]
 
-            mask = (sample == self.pretrained_config.img_start_id)
+            mask = (sample == self.config.img_start_id)
             img_start_ids = mask.nonzero()
             input_start, temp_start = 0, 0
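The `"vocab_size"] + 1` above places the TRT-LLM multimodal placeholder id just past the language model's vocabulary, presumably so it cannot collide with any real token id. A tiny hedged illustration with a made-up vocabulary size:

# Values are illustrative only; the real ones come from config.language_config.
language_config = {"vocab_size": 110592}
tllm_multimodal_token_id = language_config["vocab_size"] + 1  # 110593, outside the vocab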

tensorrt_llm/_torch/models/modeling_llama.py

Lines changed: 11 additions & 11 deletions
@@ -5,8 +5,8 @@
 import torch
 from PIL.Image import Image
 from torch import nn
-from transformers import (AutoProcessor, Llama4Config, Llama4VisionModel,
-                          LlamaConfig)
+from transformers import (AutoProcessor, AutoTokenizer, Llama4Config,
+                          Llama4VisionModel, LlamaConfig, PretrainedConfig)
 from transformers.modeling_utils import load_sharded_checkpoint
 from transformers.models.llama4.modeling_llama4 import Llama4MultiModalProjector
 
@@ -1045,23 +1045,23 @@ def forward(self, multimodal_params: List[MultimodalParams]):
 class Llama4InputProcessor(InputProcessor):
 
     def __init__(self,
-                 model_path,
-                 model_config,
-                 tokenizer,
+                 model_path: str,
+                 config: PretrainedConfig,
+                 tokenizer: AutoTokenizer,
                  trust_remote_code: bool = True):
         self.use_fast = True
         self.processor = AutoProcessor.from_pretrained(
             model_path,
             trust_remote_code=trust_remote_code,
             use_fast=self.use_fast)
-        self.model_config = model_config
+        self.config = config
         self.tokenizer = tokenizer
-        self.vocab_size = model_config.text_config.vocab_size
-        self.image_token_index = model_config.image_token_index
+        self.vocab_size = self.config.text_config.vocab_size
+        self.image_token_index = self.config.image_token_index
         self.fake_image_token = self.processor.fake_image_token
         self.image_token = self.processor.img_patch_token
-        self.image_token_start_index = self.model_config.boi_token_index
-        self.image_token_end_index = self.model_config.eoi_token_index
+        self.image_token_start_index = self.config.boi_token_index
+        self.image_token_end_index = self.config.eoi_token_index
 
     def attach_multimodal_embeddings(
         self, inputs: TextPrompt, multimodal_embedding: Dict[str,
@@ -1121,7 +1121,7 @@ def attach_multimodal_embeddings(
                 f"Missing required key in multimodal embedding: {e}")
 
         # Validate embedding dimensions
-        model_hidden_size = self.model_config.text_config.hidden_size
+        model_hidden_size = self.config.text_config.hidden_size
         for i, embedding in enumerate(mm_embeddings):
             if embedding.shape[-1] != model_hidden_size:
                 raise ValueError(
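The hunk at line 1121 validates user-supplied multimodal embeddings against the language model's hidden size, now read through `self.config.text_config`. A hedged, self-contained sketch of that check (all sizes are made up):

import torch

model_hidden_size = 5120  # assumed stand-in for config.text_config.hidden_size
mm_embeddings = [torch.zeros(16, 5120), torch.zeros(8, 5120)]  # toy embeddings, both valid

for i, embedding in enumerate(mm_embeddings):
    # An embedding whose trailing dimension differs from the model hidden size would
    # trip this check, mirroring the validation in the diff above.
    if embedding.shape[-1] != model_hidden_size:
        raise ValueError(
            f"Embedding {i} has hidden size {embedding.shape[-1]}, "
            f"expected {model_hidden_size}")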

tensorrt_llm/_torch/models/modeling_llava_next.py

Lines changed: 9 additions & 10 deletions
@@ -38,7 +38,7 @@ class LlavaNextInputProcessor(BaseMultimodalInputProcessor, InputProcessor):
 
     def __init__(self,
                  model_path: str,
-                 model_config: PretrainedConfig,
+                 config: PretrainedConfig,
                  tokenizer: AutoTokenizer,
                  trust_remote_code: bool = True):
         self.tokenizer = tokenizer
@@ -52,20 +52,19 @@ def __init__(self,
             model_path,
             trust_remote_code=trust_remote_code,
             use_fast=self.use_fast)
-        self.model_config = model_config
+        self.config = config
 
-        self.image_token_index = model_config.image_token_index
-        self.vocab_size = model_config.vocab_size
-        self.config = model_config.vision_config
+        self.image_token_index = config.image_token_index
+        self.vocab_size = config.vocab_size
 
     def _postprocess(
         self, input_ids: torch.Tensor, mm_features: Union[torch.Tensor,
                                                            List[torch.Tensor]]
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         # Define model specific variables here before shared logic
-        mm_tokens = torch.tensor([self.model_config.image_token_index
+        mm_tokens = torch.tensor([self.config.image_token_index
                                   ]).to(input_ids.device)
-        model_hidden_size = self.model_config.text_config.hidden_size
+        model_hidden_size = self.config.text_config.hidden_size
         start_len = end_len = 0  # for llava, need not append start/end token around each image token
         # End model specific variables
 
@@ -170,12 +169,12 @@ def get_prompt_token_ids(
             raise NotImplementedError(
                 "Only one mm_handle is supported for LlavaNext for now")
         hidden_size = mm_handles[0]['tensor_size'][1]
-        assert hidden_size == self.model_config.text_config.hidden_size, "Multimodal embedding hidden size must match model hidden size"
+        assert hidden_size == self.config.text_config.hidden_size, "Multimodal embedding hidden size must match model hidden size"
         input_ids = self.tokenizer(text_prompt,
                                    return_tensors="pt").input_ids[0]
 
-        vocab_size = self.model_config.text_config.vocab_size
-        image_token_index = self.model_config.image_token_index
+        vocab_size = self.config.text_config.vocab_size
+        image_token_index = self.config.image_token_index
 
         image_mask = input_ids == image_token_index
         image_positions = torch.where(image_mask)[0]
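One nuance in this file: before the change, `self.config` on LlavaNextInputProcessor was bound to `model_config.vision_config`, whereas afterwards it holds the full pretrained config and that alias is dropped, so nested sections are reached explicitly. A hedged sketch of the resulting access pattern using a toy composite config (all values made up):

from transformers import PretrainedConfig

config = PretrainedConfig(image_token_index=32000, vocab_size=32064)
config.vision_config = PretrainedConfig(image_size=336, patch_size=14)
config.text_config = PretrainedConfig(hidden_size=4096, vocab_size=32064)

# After the rename, nested fields go through the full config object:
patch_size = config.vision_config.patch_size  # vision tower field
hidden_size = config.text_config.hidden_size  # language model field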

tensorrt_llm/_torch/models/modeling_mistral.py

Lines changed: 12 additions & 12 deletions
@@ -219,7 +219,7 @@ class Mistral3InputProcessor(BaseMultimodalInputProcessor, InputProcessor):
     def __init__(
         self,
         model_path: str,
-        model_config: PretrainedConfig,
+        config: PretrainedConfig,
         tokenizer: Optional[AutoTokenizer],
         trust_remote_code: bool = False,
     ):
@@ -229,24 +229,24 @@ def __init__(
 
         # To abide by the `InputProcessor` interface.
         self.model_path = model_path
-        self.model_config = model_config
+        self.config = config
         self.tokenizer = tokenizer
 
-        self._processor = AutoProcessor.from_pretrained(model_path,
-                                                         use_fast=False)
+        self.processor = AutoProcessor.from_pretrained(model_path,
+                                                        use_fast=False)
 
     @torch.inference_mode()
     def __call__(
         self, inputs: TextPrompt, sampling_params: SamplingParams
     ) -> Tuple[List[int], Optional[ExtraProcessedInputs]]:
         images = inputs.get("multi_modal_data", {}).get("image")
-        do_rescale = self._processor.image_processor.do_rescale
+        do_rescale = self.processor.image_processor.do_rescale
        if images is not None and isinstance(images[0], torch.Tensor):
             # The default multimodal input loader will normalize images to [0, 1] when the requested
             # format is "pt" (pytorch tensors), but not for "pil" (PIL images).
             do_rescale = False
 
-        processed = self._processor(
+        processed = self.processor(
             text=inputs["prompt"],
             images=images,
             do_rescale=do_rescale,
@@ -282,25 +282,25 @@ def get_vocab_size(self) -> int:
         """Return the vocab size of the model."""
         # Unlike some other VLMs, mistral3's vocab size is stored in its `text_config`, not the top-level
         # config.
-        return self.model_config.text_config.vocab_size
+        return self.config.text_config.vocab_size
 
     def get_mm_token_ids(self) -> torch.Tensor:
         """Get the IDs of all multimodal tokens (placeholders and special tokens alike)."""
         return torch.tensor([
             # This is the `[IMG]` token id inserted into the prompt that should be replaced with image
             # embeddings.
-            self._processor.image_token_id,
+            self.processor.image_token_id,
             # This is the `[IMG_BREAK]` token id at the end of every "row".
-            self._processor.image_break_token_id,
+            self.processor.image_break_token_id,
             # This is the `[IMG_END]` token id to signify the end of an image.
-            self._processor.image_end_token_id,
+            self.processor.image_end_token_id,
         ])
 
     def get_mm_special_token_ids(self) -> torch.Tensor:
         """Get the IDs of special multimodal tokens (placeholders not included)."""
         return torch.tensor([
-            self._processor.image_break_token_id,
-            self._processor.image_end_token_id,
+            self.processor.image_break_token_id,
+            self.processor.image_end_token_id,
         ])
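Besides the config rename, this file also drops the leading underscore so the Hugging Face processor is a public `self.processor`, matching the other input processors. The ids collected by `get_mm_token_ids()` are plain attributes of that processor; a hedged sketch (the checkpoint id below is a placeholder):

from transformers import AutoProcessor

model_path = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"  # placeholder checkpoint id
processor = AutoProcessor.from_pretrained(model_path, use_fast=False)

# [IMG], [IMG_BREAK] and [IMG_END] ids, exactly the attributes read in the diff above.
print(processor.image_token_id,
      processor.image_break_token_id,
      processor.image_end_token_id)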

tensorrt_llm/_torch/models/modeling_nanov2vlm.py

Lines changed: 12 additions & 12 deletions
@@ -258,18 +258,18 @@ class NanoV2VLInputProcessor(BaseMultimodalInputProcessor, InputProcessor):
 
     def __init__(self,
                  model_path: str,
-                 model_config: transformers.PretrainedConfig,
+                 config: transformers.PretrainedConfig,
                  tokenizer: transformers.AutoTokenizer,
                  trust_remote_code: bool = True):
         if not trust_remote_code:
             raise ValueError("trust_remote_code must be True for NanoV2VL")
 
-        self.model_config = model_config
-        self.image_size = model_config.force_image_size
-        self.patch_size = model_config.patch_size
-        self.downsample_ratio = model_config.downsample_ratio
+        self.config = config
+        self.image_size = self.config.force_image_size
+        self.patch_size = self.config.patch_size
+        self.downsample_ratio = self.config.downsample_ratio
         self.spatial_merge_size = int(self.patch_size / self.downsample_ratio)
-        self.img_context_token_id = model_config.img_context_token_id
+        self.img_context_token_id = self.config.img_context_token_id
         self.num_image_token = int((self.image_size // self.patch_size)**2 *
                                    (self.downsample_ratio**2))
         self.video_pruning_ratio = VIDEO_PRUNING_RATIO
@@ -285,18 +285,18 @@ def __init__(self,
         self.processor = transformers.AutoImageProcessor.from_pretrained(
             model_path, trust_remote_code=True, use_fast=self.use_fast)
 
-        self.img_context_token = model_config.img_context_token
-        self.video_context_token = model_config.video_context_token
-        self.img_start_token = model_config.img_start_token
-        self.img_end_token = model_config.img_end_token
-        self.dtype = model_config.torch_dtype
+        self.img_context_token = self.config.img_context_token
+        self.video_context_token = self.config.video_context_token
+        self.img_start_token = self.config.img_start_token
+        self.img_end_token = self.config.img_end_token
+        self.dtype = self.config.torch_dtype
         self.image_start_token_id = self.tokenizer.encode(
             self.img_start_token, add_special_tokens=False)[0]
         self.image_end_token_id = self.tokenizer.encode(
             self.img_end_token, add_special_tokens=False)[0]
 
     def get_vocab_size(self):
-        return self.model_config.llm_config.vocab_size
+        return self.config.llm_config.vocab_size
 
     def get_mm_special_token_ids(self) -> torch.Tensor:
         " Return multimodal special token ids for NanoV2VL. "

tensorrt_llm/_torch/models/modeling_phi4mm.py

Lines changed: 3 additions & 3 deletions
@@ -759,13 +759,13 @@ class Phi4MMInputProcessor(BaseMultimodalInputProcessor, InputProcessor):
 
     def __init__(self,
                  model_path: str,
-                 model_config: transformers.PretrainedConfig,
+                 config: transformers.PretrainedConfig,
                  tokenizer: transformers.AutoTokenizer,
                  trust_remote_code: bool = True):
         if not trust_remote_code:
             raise ValueError("trust_remote_code must be True for Phi4MM")
 
-        self.model_config = model_config
+        self.config = config
         self.device = 'cpu'
 
         self.tokenizer = tokenizer
@@ -790,7 +790,7 @@ def __init__(self,
             self.processor.image_processor,
         )
 
-        self.dtype = model_config.torch_dtype
+        self.dtype = self.config.torch_dtype
 
     def get_mm_token_ids(self) -> Optional[torch.Tensor]:
         return torch.tensor([_IMAGE_SPECIAL_TOKEN_ID, _AUDIO_SPECIAL_TOKEN_ID],
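For Phi4MM the multimodal placeholder ids are module-level constants rather than config fields, so only the config attribute itself is renamed here. A hedged sketch of what the helper returns (the concrete id values are assumptions for illustration, not taken from this diff):

import torch

_IMAGE_SPECIAL_TOKEN_ID = 200010  # assumed value, for illustration only
_AUDIO_SPECIAL_TOKEN_ID = 200011  # assumed value, for illustration only

mm_token_ids = torch.tensor([_IMAGE_SPECIAL_TOKEN_ID, _AUDIO_SPECIAL_TOKEN_ID],
                            dtype=torch.int32)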
