@@ -219,7 +219,7 @@ class Mistral3InputProcessor(BaseMultimodalInputProcessor, InputProcessor):
     def __init__(
         self,
         model_path: str,
-        model_config: PretrainedConfig,
+        config: PretrainedConfig,
         tokenizer: Optional[AutoTokenizer],
         trust_remote_code: bool = False,
     ):
@@ -229,24 +229,24 @@ def __init__(

         # To abide by the `InputProcessor` interface.
         self.model_path = model_path
-        self.model_config = model_config
+        self.config = config
         self.tokenizer = tokenizer

-        self._processor = AutoProcessor.from_pretrained(model_path,
-                                                        use_fast=False)
+        self.processor = AutoProcessor.from_pretrained(model_path,
+                                                       use_fast=False)

     @torch.inference_mode()
     def __call__(
         self, inputs: TextPrompt, sampling_params: SamplingParams
     ) -> Tuple[List[int], Optional[ExtraProcessedInputs]]:
         images = inputs.get("multi_modal_data", {}).get("image")
-        do_rescale = self._processor.image_processor.do_rescale
+        do_rescale = self.processor.image_processor.do_rescale
         if images is not None and isinstance(images[0], torch.Tensor):
             # The default multimodal input loader will normalize images to [0, 1] when the requested
             # format is "pt" (pytorch tensors), but not for "pil" (PIL images).
             do_rescale = False

-        processed = self._processor(
+        processed = self.processor(
             text=inputs["prompt"],
             images=images,
             do_rescale=do_rescale,
@@ -282,25 +282,25 @@ def get_vocab_size(self) -> int:
         """Return the vocab size of the model."""
         # Unlike some other VLMs, mistral3's vocab size is stored in its `text_config`, not the top-level
         # config.
-        return self.model_config.text_config.vocab_size
+        return self.config.text_config.vocab_size

     def get_mm_token_ids(self) -> torch.Tensor:
         """Get the IDs of all multimodal tokens (placeholders and special tokens alike)."""
         return torch.tensor([
             # This is the `[IMG]` token id inserted into the prompt that should be replaced with image
             # embeddings.
-            self._processor.image_token_id,
+            self.processor.image_token_id,
             # This is the `[IMG_BREAK]` token id at the end of every "row".
-            self._processor.image_break_token_id,
+            self.processor.image_break_token_id,
             # This is the `[IMG_END]` token id to signify the end of an image.
-            self._processor.image_end_token_id,
+            self.processor.image_end_token_id,
         ])

     def get_mm_special_token_ids(self) -> torch.Tensor:
         """Get the IDs of special multimodal tokens (placeholders not included)."""
         return torch.tensor([
-            self._processor.image_break_token_id,
-            self._processor.image_end_token_id,
+            self.processor.image_break_token_id,
+            self.processor.image_end_token_id,
         ])

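A minimal usage sketch of the renamed interface (not part of this diff): only the `config` constructor argument and the `config`/`processor` attribute names come from the change above; the import path, checkpoint name, and surrounding setup are assumptions for illustration.

# Hypothetical usage sketch -- module path and checkpoint name are assumed, not taken from this PR.
from transformers import AutoConfig, AutoTokenizer

from tensorrt_llm._torch.models.modeling_mistral import Mistral3InputProcessor  # assumed import path

model_path = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"  # assumed checkpoint
config = AutoConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# The constructor now takes `config` (previously `model_config`).
input_processor = Mistral3InputProcessor(
    model_path=model_path,
    config=config,
    tokenizer=tokenizer,
)

# Attributes exposed under their new names after this change:
print(input_processor.config.text_config.vocab_size)  # backed by `self.config`
print(input_processor.processor.image_token_id)       # backed by `self.processor`
print(input_processor.get_mm_token_ids())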