From acbbd10f2a298dbff135c6f3d3e690d426472ec7 Mon Sep 17 00:00:00 2001 From: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> Date: Wed, 3 Sep 2025 15:21:13 +0000 Subject: [PATCH 1/4] phi4 fp4 Signed-off-by: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> update tokenizer and processor Signed-off-by: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> revert WAR for shapes Signed-off-by: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> fix fp8 scale Signed-off-by: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> fix image_audio Signed-off-by: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> --- tensorrt_llm/_torch/models/modeling_phi3.py | 81 ++++++++++++++----- tensorrt_llm/_torch/models/modeling_phi4mm.py | 19 +++-- tensorrt_llm/inputs/utils.py | 6 +- 3 files changed, 73 insertions(+), 33 deletions(-) diff --git a/tensorrt_llm/_torch/models/modeling_phi3.py b/tensorrt_llm/_torch/models/modeling_phi3.py index 272cd41e5b2..7f8d165956b 100644 --- a/tensorrt_llm/_torch/models/modeling_phi3.py +++ b/tensorrt_llm/_torch/models/modeling_phi3.py @@ -217,26 +217,46 @@ def filter_weights(prefix: str, weights: dict): if "self_attn.qkv_proj" in name: # The weights need to be split correctly before sharding to support tp_size >1. 
qkv_weight = module_weights['weight'][:] - q_weight = qkv_weight[:hidden_size, :] - k_weight = qkv_weight[hidden_size:hidden_size + - num_kv_heads * head_dim, :] - v_weight = qkv_weight[hidden_size + - num_kv_heads * head_dim:, :] + qk_split_index = hidden_size + kv_split_index = hidden_size + num_kv_heads * head_dim + + q_dict = {'weight': qkv_weight[:qk_split_index, :]} + k_dict = { + 'weight': + qkv_weight[qk_split_index:kv_split_index, :] + } + v_dict = {'weight': qkv_weight[kv_split_index:, :]} # Get the scale factor for the fused QKV projection qkv_scale = module_weights.get('weight_scale', None) - q_dict = {'weight': q_weight} - if qkv_scale is not None: - q_dict['weight_scale'] = qkv_scale - - k_dict = {'weight': k_weight} if qkv_scale is not None: - k_dict['weight_scale'] = qkv_scale # Use same scale - - v_dict = {'weight': v_weight} - if qkv_scale is not None: - v_dict['weight_scale'] = qkv_scale # Use same scale + if qkv_scale.shape and qkv_scale.shape[ + 0] == qkv_weight.shape[0]: + q_dict[ + 'weight_scale'] = qkv_scale[: + qk_split_index, :] + k_dict['weight_scale'] = qkv_scale[ + qk_split_index:kv_split_index, :] + v_dict['weight_scale'] = qkv_scale[ + kv_split_index:, :] + else: # use same scale + q_dict['weight_scale'] = qkv_scale + k_dict['weight_scale'] = qkv_scale + v_dict['weight_scale'] = qkv_scale + + input_scale = module_weights.get('input_scale', None) + if input_scale is not None: + q_dict['input_scale'] = input_scale + k_dict['input_scale'] = input_scale + v_dict['input_scale'] = input_scale + + weight_scale_2 = module_weights.get( + 'weight_scale_2', None) + if weight_scale_2 is not None: + q_dict['weight_scale_2'] = weight_scale_2 + k_dict['weight_scale_2'] = weight_scale_2 + v_dict['weight_scale_2'] = weight_scale_2 module.load_weights(weights=[q_dict, k_dict, v_dict]) elif "mlp.gate_up_proj" in name: @@ -246,16 +266,33 @@ def filter_weights(prefix: str, weights: dict): gate_weight = gate_up_weight[:intermediate_size, :] up_weight = 
gate_up_weight[intermediate_size:, :] - # Get the scale factors if they exist - gate_up_scale = module_weights.get('weight_scale', None) - gate_dict = {'weight': gate_weight} - if gate_up_scale is not None: - gate_dict['weight_scale'] = gate_up_scale - up_dict = {'weight': up_weight} + gate_dict = {'weight': gate_weight} + up_dict = {'weight': up_weight} + gate_up_scale = module_weights.get('weight_scale', None) if gate_up_scale is not None: - up_dict['weight_scale'] = gate_up_scale + if gate_up_scale.shape and gate_up_scale.shape[ + 0] == gate_up_weight.shape[0]: + gate_dict[ + 'weight_scale'] = gate_up_scale[: + intermediate_size, :] + up_dict['weight_scale'] = gate_up_scale[ + intermediate_size:, :] + else: # use same scale + gate_dict['weight_scale'] = gate_up_scale + up_dict['weight_scale'] = gate_up_scale + + input_scale = module_weights.get('input_scale', None) + if input_scale is not None: + gate_dict['input_scale'] = input_scale + up_dict['input_scale'] = input_scale + + weight_scale_2 = module_weights.get( + 'weight_scale_2', None) + if weight_scale_2 is not None: + gate_dict['weight_scale_2'] = weight_scale_2 + up_dict['weight_scale_2'] = weight_scale_2 module.load_weights(weights=[gate_dict, up_dict]) else: diff --git a/tensorrt_llm/_torch/models/modeling_phi4mm.py b/tensorrt_llm/_torch/models/modeling_phi4mm.py index 129c43bc633..b9736afd829 100644 --- a/tensorrt_llm/_torch/models/modeling_phi4mm.py +++ b/tensorrt_llm/_torch/models/modeling_phi4mm.py @@ -88,10 +88,10 @@ def _load_phi4mm_classes(local_path): # Add parent folder to sys.path to enable relative import. original_sys_path = sys.path.copy() package_folder = Path(local_path) + package_name = package_folder.name parent_folder = str(package_folder.parent) if parent_folder not in sys.path: sys.path.insert(0, parent_folder) - try: # Import Phi4MMConfig from configuration_phi4mm.py.
config_path = os.path.join(local_path, 'configuration_phi4mm.py') @@ -111,8 +111,7 @@ def _load_phi4mm_classes(local_path): # `Phi-4-multimodal-instruct` as the package name to avoid relative import errors. # `hf_modeling_phi4mm` as the module name to avoid name conflicts. spec = importlib.util.spec_from_file_location( - "Phi-4-multimodal-instruct.hf_modeling_phi4mm", - modeling_phi4mm_path) + f"{package_name}.hf_modeling_phi4mm", modeling_phi4mm_path) hf_modeling_phi4mm = importlib.util.module_from_spec(spec) spec.loader.exec_module(hf_modeling_phi4mm) Phi4MMAudioEmbedding = hf_modeling_phi4mm.Phi4MMAudioEmbedding @@ -989,12 +988,16 @@ def load_weights(self, weights): weights = {k: v for k, v in weights.items() if '.lora_' not in k} # Rename base layer weights. updated_weights = {} + base_layers = [ + 'weight', 'input_scale', 'weight_scale', 'weight_scale_2' + ] for k in weights.keys(): - if 'base_layer.weight' in k: - new_k = k.replace('base_layer.weight', 'weight') - updated_weights[new_k] = weights[k] - else: - updated_weights[k] = weights[k] + new_k = k + for layer in base_layers: + if f'base_layer.{layer}' in k: + new_k = k.replace(f'base_layer.{layer}', layer) + break + updated_weights[new_k] = weights[k] weights = updated_weights self.llm.load_weights(weights) diff --git a/tensorrt_llm/inputs/utils.py b/tensorrt_llm/inputs/utils.py index f935d2ffe05..33780cffc3d 100644 --- a/tensorrt_llm/inputs/utils.py +++ b/tensorrt_llm/inputs/utils.py @@ -580,10 +580,10 @@ def convert_to_conversation_message( # Check if mdata is a MultimodalData if isinstance(mdata, dict) and "modality" in mdata and "data" in mdata: - modality = mdata["modality"] + mdata_modality = mdata["modality"] - if modality == "multiple_image": + if mdata_modality == "multiple_image": - modality = "image" - mm_data_tracker.add_data(modality, mdata["data"]) + mdata_modality = "image" + mm_data_tracker.add_data(mdata_modality, mdata["data"]) else: # Add embeddings to the tracker for placeholder handling
mm_data_tracker.add_data(mdata["modality"], From 8744d3f2d565a34a8e64e56ba5a91b365d14c2f9 Mon Sep 17 00:00:00 2001 From: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> Date: Tue, 14 Oct 2025 06:29:49 +0000 Subject: [PATCH 2/4] add tests Signed-off-by: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> --- .../defs/accuracy/references/gsm8k.yaml | 4 + .../defs/accuracy/references/mmlu.yaml | 4 + .../defs/accuracy/test_llm_api_pytorch.py | 18 +++ tests/integration/defs/perf/test_perf.py | 16 ++ tests/integration/defs/test_e2e.py | 147 +++++++++++++++--- .../test_lists/qa/llm_function_core.txt | 22 ++- .../test_lists/qa/llm_function_l20.txt | 8 +- .../test_lists/qa/llm_function_rtx6k.txt | 13 ++ .../test_lists/test-db/l0_l40s.yml | 6 +- .../test_lists/test-db/l0_rtx_pro_6000.yml | 7 + tests/integration/test_lists/waives.txt | 4 +- 11 files changed, 215 insertions(+), 34 deletions(-) diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml index 068977b3de2..b5be618f80f 100644 --- a/tests/integration/defs/accuracy/references/gsm8k.yaml +++ b/tests/integration/defs/accuracy/references/gsm8k.yaml @@ -187,6 +187,10 @@ mistralai/Mistral-Small-3.1-24B-Instruct-2503: accuracy: 89.23 microsoft/Phi-4-multimodal-instruct: - accuracy: 81.19 + - quant_algo: FP8 + accuracy: 80.82 + - quant_algo: NVFP4 + accuracy: 69.33 microsoft/Phi-4-multimodal-instruct-long-rope: - accuracy: 75.85 microsoft/Phi-4-mini-instruct: diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml index f70baa59313..cfe0a0de894 100644 --- a/tests/integration/defs/accuracy/references/mmlu.yaml +++ b/tests/integration/defs/accuracy/references/mmlu.yaml @@ -294,6 +294,10 @@ mistralai/Ministral-8B-Instruct-2410: accuracy: 65.96 microsoft/Phi-4-multimodal-instruct: - accuracy: 69.69 + - quant_algo: FP8 + accuracy: 68.86 + - quant_algo: NVFP4 + accuracy: 64.04 
microsoft/Phi-4-multimodal-instruct-long-rope: - accuracy: 65.98 microsoft/phi-4: diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 3174e86cc9c..bbd0594df24 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -3314,6 +3314,24 @@ def test_auto_dtype_long_rope(self): task = GSM8K(model_name) task.evaluate(llm) + @skip_pre_blackwell + def test_fp4(self): + model_path = f"{self.MODEL_PATH}-FP4" + with LLM(model_path, max_seq_len=4096) as llm: + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) + + @skip_pre_hopper + def test_fp8(self): + model_path = f"{self.MODEL_PATH}-FP8" + with LLM(model_path, max_seq_len=4096) as llm: + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) + @skip_pre_hopper @pytest.mark.skip_less_device_memory(80000) diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index 2757434818d..4df17c3fe6c 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -127,6 +127,14 @@ "phi_4_multimodal_instruct": "multimodals/Phi-4-multimodal-instruct", "phi_4_multimodal_instruct_image": "multimodals/Phi-4-multimodal-instruct", "phi_4_multimodal_instruct_audio": "multimodals/Phi-4-multimodal-instruct", + "phi_4_multimodal_instruct_fp4_image": + "multimodals/Phi-4-multimodal-instruct-FP4", + "phi_4_multimodal_instruct_fp4_audio": + "multimodals/Phi-4-multimodal-instruct-FP4", + "phi_4_multimodal_instruct_fp8_image": + "multimodals/Phi-4-multimodal-instruct-FP8", + "phi_4_multimodal_instruct_fp8_audio": + "multimodals/Phi-4-multimodal-instruct-FP8", "bielik_11b_v2.2_instruct": "Bielik-11B-v2.2-Instruct", "bielik_11b_v2.2_instruct_fp8": "Bielik-11B-v2.2-Instruct-FP8", "mistral_small_v3.1_24b": 
"Mistral-Small-3.1-24B-Instruct-2503", @@ -177,6 +185,14 @@ "multimodals/Phi-4-multimodal-instruct/vision-lora", "phi_4_multimodal_instruct_audio": "multimodals/Phi-4-multimodal-instruct/speech-lora", + "phi_4_multimodal_instruct_fp4_image": + "multimodals/Phi-4-multimodal-instruct-FP4/vision-lora", + "phi_4_multimodal_instruct_fp4_audio": + "multimodals/Phi-4-multimodal-instruct-FP4/speech-lora", + "phi_4_multimodal_instruct_fp8_image": + "multimodals/Phi-4-multimodal-instruct-FP8/vision-lora", + "phi_4_multimodal_instruct_fp8_audio": + "multimodals/Phi-4-multimodal-instruct-FP8/speech-lora", } TIMING_CACHE_DIR = os.environ.get("TIMING_CACHE_DIR", "") diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index d5a669b01bd..6dbd1613ee7 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -2667,6 +2667,14 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path, ("qwen2.5-vl-7b-instruct", "Qwen2.5-VL-7B-Instruct", 0.8), ("phi4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct", 0.8), + pytest.param("phi4-multimodal-instruct-fp4", + "multimodals/Phi-4-multimodal-instruct-FP4", + 0.8, + marks=skip_pre_blackwell), + pytest.param("phi4-multimodal-instruct-fp8", + "multimodals/Phi-4-multimodal-instruct-FP8", + 0.8, + marks=skip_pre_hopper), pytest.param( "mistral-small-3.1-24b-instruct", "Mistral-Small-3.1-24B-Instruct-2503", @@ -2686,7 +2694,8 @@ def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv, print(f"Accuracy test {model_name} {modality} mode with example inputs.") if modality == "video" and model_name in { "llava-v1.6-mistral-7b", "mistral-small-3.1-24b-instruct", - "phi4-multimodal-instruct" + "phi4-multimodal-instruct", "phi4-multimodal-instruct-fp4", + "phi4-multimodal-instruct-fp8" }: pytest.skip(f"Skipping video modality test for {model_name}") @@ -2740,6 +2749,22 @@ def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv, ], 
] * num_same_requests, }, + "phi4-multimodal-instruct-fp4": { + "image": [ + [ + "image", "depicts", "natural", "environment", "ocean", + "water", "waves", "sky" + ], + ] * num_same_requests, + }, + "phi4-multimodal-instruct-fp8": { + "image": [ + [ + "image", "depicts", "natural", "environment", "ocean", + "water", "waves", "sky" + ], + ] * num_same_requests, + }, } cmd = [ @@ -2760,7 +2785,7 @@ def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv, ] and modality == "video": cmd.append("--max_num_tokens=16384") - if model_name == "phi4-multimodal-instruct": + if model_name.startswith("phi4-multimodal-instruct"): cmd.append("--max_seq_len=4096") cmd.append("--load_lora") cmd.append("--auto_model_name") @@ -2792,6 +2817,14 @@ def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv, ("qwen2.5-vl-7b-instruct", "Qwen2.5-VL-7B-Instruct", 0.8), ("phi4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct", 0.8), + pytest.param("phi4-multimodal-instruct-fp4", + "multimodals/Phi-4-multimodal-instruct-FP4", + 0.8, + marks=skip_pre_blackwell), + pytest.param("phi4-multimodal-instruct-fp8", + "multimodals/Phi-4-multimodal-instruct-FP8", + 0.8, + marks=skip_pre_hopper), pytest.param( "mistral-small-3.1-24b-instruct", "Mistral-Small-3.1-24B-Instruct-2503", @@ -2811,7 +2844,8 @@ def test_ptp_quickstart_multimodal_chunked_prefill(llm_root, llm_venv, print(f"Accuracy test {model_name} {modality} mode with example inputs.") if modality == "video" and model_name in { "llava-v1.6-mistral-7b", "mistral-small-3.1-24b-instruct", - "phi4-multimodal-instruct" + "phi4-multimodal-instruct", "phi4-multimodal-instruct-fp4", + "phi4-multimodal-instruct-fp8" }: pytest.skip(f"Skipping video modality test for {model_name}") accuracy_inputs = { @@ -2890,6 +2924,35 @@ def test_ptp_quickstart_multimodal_chunked_prefill(llm_root, llm_venv, ], ], }, + "phi4-multimodal-instruct-fp8": { + "image": [ + [ + "image", "depicts", "natural", "environment", "ocean", + 
"water", "waves", "sky" + ], + [ + "object", "mountain", "weather", "condition", "clear", + "visible" + ], + [ + "traffic", "condition", "road", "moderate", "vehicles", + "lanes", "cars", "bus" + ], + ], + }, + "phi4-multimodal-instruct-fp4": { + "image": [ + [ + "image", "depicts", "natural", "environment", "ocean", + "water", "waves", "sky" + ], + ["rock", "formation", "sunny", "sky", "clouds"], + [ + "traffic", "condition", "road", "moderate", "vehicles", + "lane", "flow", "traffic" + ], + ], + }, } cmd = [ @@ -2905,7 +2968,7 @@ def test_ptp_quickstart_multimodal_chunked_prefill(llm_root, llm_venv, "--enable_chunked_prefill", "--max_num_tokens=256", ] - if model_name == "phi4-multimodal-instruct": + if model_name.startswith("phi4-multimodal-instruct"): cmd.append("--max_seq_len=4096") cmd.append("--load_lora") cmd.append("--auto_model_name") @@ -2926,10 +2989,17 @@ def test_ptp_quickstart_multimodal_chunked_prefill(llm_root, llm_venv, @pytest.mark.parametrize("modality", ["image", "audio", "image_audio"]) -def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality): - model_name = "Phi-4-multimodal-instruct" - model_path = "multimodals/Phi-4-multimodal-instruct" - +@pytest.mark.parametrize("model_name,model_path", [ + ("phi4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct"), + pytest.param("phi4-multimodal-instruct-fp4", + "multimodals/Phi-4-multimodal-instruct-FP4", + marks=skip_pre_blackwell), + pytest.param("phi4-multimodal-instruct-fp8", + "multimodals/Phi-4-multimodal-instruct-FP8", + marks=skip_pre_hopper), +]) +def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, model_name, + model_path, modality): example_root = Path(os.path.join(llm_root, "examples", "llm-api")) test_data_root = Path( os.path.join(llm_models_root(), "multimodals", "test_data")) @@ -2983,6 +3053,11 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality): ], } + if model_name == "phi4-multimodal-instruct-fp4": + 
expected_keywords["image_audio"] = [ + ["image", "shows", "mountain", "El", "Capitan", "road", "trees"], + ] + cmd = [ str(example_root / "quickstart_multimodal.py"), "--model_dir", @@ -2998,8 +3073,6 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality): "--load_lora", "--auto_model_name", "Phi4MMForCausalLM", - # TODO: remove this once kv cache reuse is supported for Phi-4-multimodal - "--disable_kv_cache_reuse", ] output = llm_venv.run_cmd(cmd, caller=check_output) @@ -3022,7 +3095,13 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality): pytest.param( "gemma-3-27b-it", "gemma/gemma-3-27b-it", marks=skip_post_blackwell), ("mistral-small-3.1-24b-instruct", "Mistral-Small-3.1-24B-Instruct-2503"), - ("Phi-4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct"), + ("phi4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct"), + pytest.param("phi4-multimodal-instruct-fp4", + "multimodals/Phi-4-multimodal-instruct-FP4", + marks=skip_pre_blackwell), + pytest.param("phi4-multimodal-instruct-fp8", + "multimodals/Phi-4-multimodal-instruct-FP8", + marks=skip_pre_hopper), ]) def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name, model_path): @@ -3063,7 +3142,19 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name, ], ], }, - "Phi-4-multimodal-instruct": { + "phi4-multimodal-instruct": { + "image": [ + ["object", "mountain", "weather", "clear", "clouds"], + ["traffic", "road", "vehicles", "cars", "bus"], + ], + }, + "phi4-multimodal-instruct-fp4": { + "image": [ + ["object", "mountain", "weather", "clear", "clouds"], + ["traffic", "road", "vehicles", "cars", "bus"], + ], + }, + "phi4-multimodal-instruct-fp8": { "image": [ ["object", "mountain", "weather", "clear", "clouds"], ["traffic", "road", "vehicles", "cars", "bus"], @@ -3096,14 +3187,12 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name, cmd.append("--disable_kv_cache_reuse") 
cmd.append("--kv_cache_fraction=0.5") cmd.append("--max_seq_len=1024") - elif model_name == "Phi-4-multimodal-instruct": + elif model_name.startswith("phi4-multimodal-instruct"): # Set max_seq_len to 4096 to use short rope factor. cmd.append("--max_seq_len=4096") cmd.append("--load_lora") cmd.append("--auto_model_name") cmd.append("Phi4MMForCausalLM") - # TODO: remove this once kv cache reuse is supported for Phi-4-multimodal - cmd.append("--disable_kv_cache_reuse") elif model_name == "mistral-small-3.1-24b-instruct": # TODO: remove this once kv cache reuse is supported for Mistral cmd.append("--disable_kv_cache_reuse") @@ -3112,7 +3201,7 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name, # Set match ratio based on model match_ratio = 4.0 / 5 - if model_name == "Phi-4-multimodal-instruct": + if model_name.startswith("phi4-multimodal-instruct"): match_ratio = 0.6 # Check output accuracy @@ -3131,7 +3220,13 @@ @pytest.mark.skip_less_device_memory(80000) @pytest.mark.parametrize("model_name,model_path", [ ("mistral-small-3.1-24b-instruct", "Mistral-Small-3.1-24B-Instruct-2503"), - ("Phi-4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct"), + ("phi4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct"), + pytest.param("phi4-multimodal-instruct-fp4", + "multimodals/Phi-4-multimodal-instruct-FP4", + marks=skip_pre_blackwell), + pytest.param("phi4-multimodal-instruct-fp8", + "multimodals/Phi-4-multimodal-instruct-FP8", + marks=skip_pre_hopper), pytest.param( "gemma-3-27b-it", "gemma/gemma-3-27b-it", marks=skip_post_blackwell), ]) @@ -3179,6 +3274,18 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name, ["atmosphere", "serene", "sense", "scene", "majestic"], ], }, + "phi4-multimodal-instruct-fp4": { + "image": [ + ["depicts", "landscape", "mountain", "half", "dome"], + ["atmosphere", "serene", "sense", "scene", "majestic"], + 
}, + "phi4-multimodal-instruct-fp8": { + "image": [ + ["depicts", "landscape", "mountain", "half", "dome"], + ["atmosphere", "serene", "sense", "scene", "majestic"], + ], + }, } # Build command for image modality cmd = [ @@ -3205,14 +3312,12 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name, cmd.append("--kv_cache_fraction=0.5") cmd.append("--max_seq_len=1024") - elif model_name == "Phi-4-multimodal-instruct": + elif model_name.startswith("phi4-multimodal-instruct"): # Set max_seq_len to 4096 to use short rope factor. cmd.append("--max_seq_len=4096") cmd.append("--load_lora") cmd.append("--auto_model_name") cmd.append("Phi4MMForCausalLM") - # TODO: remove this once kv cache reuse is supported for Phi-4 - cmd.append("--disable_kv_cache_reuse") elif model_name == "mistral-small-3.1-24b-instruct": # TODO: remove this once kv cache reuse is supported for Mistral @@ -3222,7 +3327,7 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name, print("output:", output) # Set match ratio based on model match_ratio = 4.0 / 5 - if model_name == "Phi-4-multimodal-instruct": + if model_name.startswith("phi4-multimodal-instruct"): match_ratio = 0.6 # Check output accuracy diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index 779094c6ee6..ef4fef76c09 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -599,6 +599,8 @@ accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8 accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope +accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp4 +accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8 accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8 @@ -664,22 +666,32 @@ test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it- test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image] test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-0.8-image] test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-0.8-image] +test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-0.8-image] +test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-0.8-image] test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-0.8-image] test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-0.8-video] test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image] test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-0.8-image] test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-0.8-image] +test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-0.8-image] +test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-0.8-image] test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-0.8-video] 
test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-0.8-image] -test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[audio] -test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[image] -test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[image_audio] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image_audio] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-image_audio] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-image_audio] test_e2e.py::test_ptp_quickstart_multimodal_2gpu[gemma-3-27b-it-gemma/gemma-3-27b-it] test_e2e.py::test_ptp_quickstart_multimodal_2gpu[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503] -test_e2e.py::test_ptp_quickstart_multimodal_2gpu[Phi-4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct] +test_e2e.py::test_ptp_quickstart_multimodal_2gpu[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct] +test_e2e.py::test_ptp_quickstart_multimodal_2gpu[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8] +test_e2e.py::test_ptp_quickstart_multimodal_2gpu[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4] test_e2e.py::test_ptp_quickstart_multimodal_multiturn[gemma-3-27b-it-gemma/gemma-3-27b-it] test_e2e.py::test_ptp_quickstart_multimodal_multiturn[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503] -test_e2e.py::test_ptp_quickstart_multimodal_multiturn[Phi-4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct] 
+test_e2e.py::test_ptp_quickstart_multimodal_multiturn[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct] +test_e2e.py::test_ptp_quickstart_multimodal_multiturn[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8] +test_e2e.py::test_ptp_quickstart_multimodal_multiturn[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4] test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] diff --git a/tests/integration/test_lists/qa/llm_function_l20.txt b/tests/integration/test_lists/qa/llm_function_l20.txt index c95aa0ab7d2..3f046845d25 100644 --- a/tests/integration/test_lists/qa/llm_function_l20.txt +++ b/tests/integration/test_lists/qa/llm_function_l20.txt @@ -41,6 +41,8 @@ accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8 accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope +accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp4 +accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8 accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype @@ -53,9 +55,9 @@ test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-image-True] test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-video-False] test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-video-True] -test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[audio] 
-test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[image] -test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[image_audio] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image_audio] test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] diff --git a/tests/integration/test_lists/qa/llm_function_rtx6k.txt b/tests/integration/test_lists/qa/llm_function_rtx6k.txt index 9f0746697a2..ed257078678 100644 --- a/tests/integration/test_lists/qa/llm_function_rtx6k.txt +++ b/tests/integration/test_lists/qa/llm_function_rtx6k.txt @@ -19,6 +19,10 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUT accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4 +accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope +accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp4 +accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] 
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False] @@ -41,3 +45,12 @@ test_e2e.py::test_ptp_quickstart_advanced_2gpus_sm120[Llama3.1-70B-BF16-llama-3. test_e2e.py::test_ptp_quickstart_advanced_2gpus_sm120[Nemotron-Super-49B-v1-BF16-nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1] test_e2e.py::test_ptp_quickstart_advanced_2gpus_sm120[Mixtral-8x7B-BF16-Mixtral-8x7B-Instruct-v0.1] test_e2e.py::test_ptp_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-0528-FP4-DeepSeek-R1/DeepSeek-R1-0528-FP4] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image_audio] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-image] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-audio] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-image_audio] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-image] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-audio] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-image_audio] diff --git a/tests/integration/test_lists/test-db/l0_l40s.yml b/tests/integration/test_lists/test-db/l0_l40s.yml index 5c31cef019f..847bb1d63be 100644 --- 
a/tests/integration/test_lists/test-db/l0_l40s.yml +++ b/tests/integration/test_lists/test-db/l0_l40s.yml @@ -22,9 +22,9 @@ l0_l40s: - test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] - test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-video-False] - test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] - - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[audio] - - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[image] - - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[image_audio] + - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio] + - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image] + - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image_audio] - accuracy/test_llm_api_pytorch.py::TestQwen2_VL_7B::test_auto_dtype - condition: ranges: diff --git a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml index 1c65ff76021..a27520d2d93 100644 --- a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml +++ b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml @@ -35,9 +35,14 @@ l0_rtx_pro_6000: - test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B_nvfp4_hf-Qwen3/saved_models_Qwen3-30B-A3B_nvfp4_hf] # 2mins - test_e2e.py::test_ptp_quickstart_advanced[GPT-OSS-20B-gpt_oss/gpt-oss-20b] - test_e2e.py::test_ptp_quickstart_advanced[GPT-OSS-120B-gpt_oss/gpt-oss-120b] + - test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-0.8-image] + - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-image_audio] + - 
test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-image_audio] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] # 8mins - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True] # 8 mins - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto] + - accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp4 + - accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8 - condition: ranges: @@ -102,3 +107,5 @@ l0_rtx_pro_6000: # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False] # failed - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=True] + - test_e2e.py::test_ptp_quickstart_multimodal_2gpu[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8] + - test_e2e.py::test_ptp_quickstart_multimodal_2gpu[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4] diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 774fea69920..0316a18f719 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -339,8 +339,8 @@ examples/test_ngram.py::test_llm_ngram_1gpu[streaming-gpt2-use_cpp_session-use_t examples/test_ngram.py::test_llm_ngram_1gpu[streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs2] SKIP (https://nvbugs/5546507) accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=True] SKIP (https://nvbugs/5546510) examples/serve/test_serve.py::test_extra_llm_api_options SKIP (https://nvbugs/5546510) 
-test_e2e.py::test_ptp_quickstart_multimodal_multiturn[Phi-4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct] SKIP (https://nvbugs/5547437) -test_e2e.py::test_ptp_quickstart_multimodal_2gpu[Phi-4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct] SKIP (https://nvbugs/5547435) +test_e2e.py::test_ptp_quickstart_multimodal_multiturn[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct] SKIP (https://nvbugs/5547437) +test_e2e.py::test_ptp_quickstart_multimodal_2gpu[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct] SKIP (https://nvbugs/5547435) test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-0.8-image] SKIP (https://nvbugs/5547434) test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-0.8-video] SKIP (https://nvbugs/5547434) cpp/test_e2e.py::test_benchmarks[gpt-80] SKIP (https://nvbugs/5550689) From 47242c0d07f863078d7254a0d475d5f70929c703 Mon Sep 17 00:00:00 2001 From: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> Date: Wed, 15 Oct 2025 08:57:57 +0000 Subject: [PATCH 3/4] fix Signed-off-by: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> --- tests/integration/defs/test_e2e.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index 6dbd1613ee7..8e7b5c690ce 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -3268,19 +3268,19 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name, ["atmosphere", "serene", "majestic", "clear", "sky", "trees"], ], }, - "Phi-4-multimodal-instruct": { + "phi4-multimodal-instruct": { "image": [ ["depicts", "landscape", "mountain", "half", "dome"], ["atmosphere", "serene", "sense", "scene", "majestic"], ], }, - "Phi-4-multimodal-instruct-fp4": { + "phi4-multimodal-instruct-fp4": { "image": [ ["depicts", 
"landscape", "mountain", "half", "dome"], ["atmosphere", "serene", "sense", "scene", "majestic"], ], }, - "Phi-4-multimodal-instruct-fp8": { + "phi4-multimodal-instruct-fp8": { "image": [ ["depicts", "landscape", "mountain", "half", "dome"], ["atmosphere", "serene", "sense", "scene", "majestic"], @@ -3312,7 +3312,7 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name, cmd.append("--kv_cache_fraction=0.5") cmd.append("--max_seq_len=1024") - elif model_name.startswith("Phi-4-multimodal-instruct"): + elif model_name.startswith("phi4-multimodal-instruct"): # Set max_seq_len to 4096 to use short rope factor. cmd.append("--max_seq_len=4096") cmd.append("--load_lora") From 872c7b22bc491a954cc8aa5b0a592515ded08ffc Mon Sep 17 00:00:00 2001 From: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> Date: Mon, 20 Oct 2025 01:53:17 +0000 Subject: [PATCH 4/4] update var name Signed-off-by: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> --- tensorrt_llm/_torch/models/modeling_phi4mm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorrt_llm/_torch/models/modeling_phi4mm.py b/tensorrt_llm/_torch/models/modeling_phi4mm.py index b9736afd829..f80d09da078 100644 --- a/tensorrt_llm/_torch/models/modeling_phi4mm.py +++ b/tensorrt_llm/_torch/models/modeling_phi4mm.py @@ -988,14 +988,14 @@ def load_weights(self, weights): weights = {k: v for k, v in weights.items() if '.lora_' not in k} # Rename base layer weights. updated_weights = {} - base_layers = [ + base_layer_weight_names = [ 'weight', 'input_scale', 'weight_scale', 'weight_scale_2' ] for k in weights.keys(): new_k = k - for layer in base_layers: - if f'base_layer.{layer}' in k: - new_k = k.replace(f'base_layer.{layer}', layer) + for weight_name in base_layer_weight_names: + if f'base_layer.{weight_name}' in k: + new_k = k.replace(f'base_layer.{weight_name}', weight_name) break updated_weights[new_k] = weights[k] weights = updated_weights