From acbbd10f2a298dbff135c6f3d3e690d426472ec7 Mon Sep 17 00:00:00 2001 From: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> Date: Wed, 3 Sep 2025 15:21:13 +0000 Subject: [PATCH 1/4] phi4 fp4 Signed-off-by: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> update tokenizer and processor Signed-off-by: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> revert WAR for shapes Signed-off-by: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> fix fp8 scale Signed-off-by: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> fix image_audio Signed-off-by: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> --- tensorrt_llm/_torch/models/modeling_phi3.py | 81 ++++++++++++++----- tensorrt_llm/_torch/models/modeling_phi4mm.py | 19 +++-- tensorrt_llm/inputs/utils.py | 6 +- 3 files changed, 73 insertions(+), 33 deletions(-) diff --git a/tensorrt_llm/_torch/models/modeling_phi3.py b/tensorrt_llm/_torch/models/modeling_phi3.py index 272cd41e5b2..7f8d165956b 100644 --- a/tensorrt_llm/_torch/models/modeling_phi3.py +++ b/tensorrt_llm/_torch/models/modeling_phi3.py @@ -217,26 +217,46 @@ def filter_weights(prefix: str, weights: dict): if "self_attn.qkv_proj" in name: # The weights need to be split correctly before sharding to support tp_size >1. 
qkv_weight = module_weights['weight'][:] - q_weight = qkv_weight[:hidden_size, :] - k_weight = qkv_weight[hidden_size:hidden_size + - num_kv_heads * head_dim, :] - v_weight = qkv_weight[hidden_size + - num_kv_heads * head_dim:, :] + qk_split_index = hidden_size + kv_split_index = hidden_size + num_kv_heads * head_dim + + q_dict = {'weight': qkv_weight[:qk_split_index, :]} + k_dict = { + 'weight': + qkv_weight[qk_split_index:kv_split_index, :] + } + v_dict = {'weight': qkv_weight[kv_split_index:, :]} # Get the scale factor for the fused QKV projection qkv_scale = module_weights.get('weight_scale', None) - q_dict = {'weight': q_weight} - if qkv_scale is not None: - q_dict['weight_scale'] = qkv_scale - - k_dict = {'weight': k_weight} if qkv_scale is not None: - k_dict['weight_scale'] = qkv_scale # Use same scale - - v_dict = {'weight': v_weight} - if qkv_scale is not None: - v_dict['weight_scale'] = qkv_scale # Use same scale + if qkv_scale.shape and qkv_scale.shape[ + 0] == qkv_weight.shape[0]: + q_dict[ + 'weight_scale'] = qkv_scale[: + qk_split_index, :] + k_dict['weight_scale'] = qkv_scale[ + qk_split_index:kv_split_index, :] + v_dict['weight_scale'] = qkv_scale[ + kv_split_index:, :] + else: # use same scale + q_dict['weight_scale'] = qkv_scale + k_dict['weight_scale'] = qkv_scale + v_dict['weight_scale'] = qkv_scale + + input_scale = module_weights.get('input_scale', None) + if input_scale is not None: + q_dict['input_scale'] = input_scale + k_dict['input_scale'] = input_scale + v_dict['input_scale'] = input_scale + + weight_scale_2 = module_weights.get( + 'weight_scale_2', None) + if weight_scale_2 is not None: + q_dict['weight_scale_2'] = weight_scale_2 + k_dict['weight_scale_2'] = weight_scale_2 + v_dict['weight_scale_2'] = weight_scale_2 module.load_weights(weights=[q_dict, k_dict, v_dict]) elif "mlp.gate_up_proj" in name: @@ -246,16 +266,33 @@ def filter_weights(prefix: str, weights: dict): gate_weight = gate_up_weight[:intermediate_size, :] up_weight = 
gate_up_weight[intermediate_size:, :] - # Get the scale factors if they exist - gate_up_scale = module_weights.get('weight_scale', None) - gate_dict = {'weight': gate_weight} - if gate_up_scale is not None: - gate_dict['weight_scale'] = gate_up_scale - up_dict = {'weight': up_weight} + gate_dict = {'weight': gate_weight} + up_dict = {'weight': up_weight} + gate_up_scale = module_weights.get('weight_scale', None) if gate_up_scale is not None: - up_dict['weight_scale'] = gate_up_scale + if gate_up_scale.shape and gate_up_scale.shape[ + 0] == gate_up_weight.shape[0]: + gate_dict[ + 'weight_scale'] = gate_up_scale[: + intermediate_size, :] + up_dict['weight_scale'] = gate_up_scale[ + intermediate_size:, :] + else: # use same scale + gate_dict['weight_scale'] = gate_up_scale + up_dict['weight_scale'] = gate_up_scale + + input_scale = module_weights.get('input_scale', None) + if input_scale is not None: + gate_dict['input_scale'] = input_scale + up_dict['input_scale'] = input_scale + + weight_scale_2 = module_weights.get( + 'weight_scale_2', None) + if weight_scale_2 is not None: + gate_dict['weight_scale_2'] = weight_scale_2 + up_dict['weight_scale_2'] = weight_scale_2 module.load_weights(weights=[gate_dict, up_dict]) else: diff --git a/tensorrt_llm/_torch/models/modeling_phi4mm.py b/tensorrt_llm/_torch/models/modeling_phi4mm.py index 129c43bc633..b9736afd829 100644 --- a/tensorrt_llm/_torch/models/modeling_phi4mm.py +++ b/tensorrt_llm/_torch/models/modeling_phi4mm.py @@ -88,10 +88,10 @@ def _load_phi4mm_classes(local_path): # Add parent folder to sys.path to enable relative import. original_sys_path = sys.path.copy() package_folder = Path(local_path) + package_name = package_folder.name parent_folder = str(package_folder.parent) if parent_folder not in sys.path: sys.path.insert(0, parent_folder) - try: # Import Phi4MMConfig from configuration_phi4mm.py.
config_path = os.path.join(local_path, 'configuration_phi4mm.py') @@ -111,8 +111,7 @@ def _load_phi4mm_classes(local_path): # `Phi-4-multimodal-instruct` as the package name to avoid relative import errors. # `hf_modeling_phi4mm` as the module name to avoid name conflicts. spec = importlib.util.spec_from_file_location( - "Phi-4-multimodal-instruct.hf_modeling_phi4mm", - modeling_phi4mm_path) + f"{package_name}.hf_modeling_phi4mm", modeling_phi4mm_path) hf_modeling_phi4mm = importlib.util.module_from_spec(spec) spec.loader.exec_module(hf_modeling_phi4mm) Phi4MMAudioEmbedding = hf_modeling_phi4mm.Phi4MMAudioEmbedding @@ -989,12 +988,16 @@ def load_weights(self, weights): weights = {k: v for k, v in weights.items() if '.lora_' not in k} # Rename base layer weights. updated_weights = {} + base_layers = [ + 'weight', 'input_scale', 'weight_scale', 'weight_scale_2' + ] for k in weights.keys(): - if 'base_layer.weight' in k: - new_k = k.replace('base_layer.weight', 'weight') - updated_weights[new_k] = weights[k] - else: - updated_weights[k] = weights[k] + new_k = k + for layer in base_layers: + if f'base_layer.{layer}' in k: + new_k = k.replace(f'base_layer.{layer}', layer) + break + updated_weights[new_k] = weights[k] weights = updated_weights self.llm.load_weights(weights) diff --git a/tensorrt_llm/inputs/utils.py b/tensorrt_llm/inputs/utils.py index f935d2ffe05..33780cffc3d 100644 --- a/tensorrt_llm/inputs/utils.py +++ b/tensorrt_llm/inputs/utils.py @@ -580,10 +580,10 @@ def convert_to_conversation_message( # Check if mdata is a MultimodalData if isinstance(mdata, dict) and "modality" in mdata and "data" in mdata: - modality = mdata["modality"] + mdata_modality = mdata["modality"] - if modality == "multiple_image": + if mdata_modality == "multiple_image": - modality = "image" - mm_data_tracker.add_data(modality, mdata["data"]) + mdata_modality = "image" + mm_data_tracker.add_data(mdata_modality, mdata["data"]) else: # Add embeddings to the tracker for placeholder handling
mm_data_tracker.add_data(mdata["modality"], From 8744d3f2d565a34a8e64e56ba5a91b365d14c2f9 Mon Sep 17 00:00:00 2001 From: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> Date: Tue, 14 Oct 2025 06:29:49 +0000 Subject: [PATCH 2/4] add tests Signed-off-by: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> --- .../defs/accuracy/references/gsm8k.yaml | 4 + .../defs/accuracy/references/mmlu.yaml | 4 + .../defs/accuracy/test_llm_api_pytorch.py | 18 +++ tests/integration/defs/perf/test_perf.py | 16 ++ tests/integration/defs/test_e2e.py | 147 +++++++++++++++--- .../test_lists/qa/llm_function_core.txt | 22 ++- .../test_lists/qa/llm_function_l20.txt | 8 +- .../test_lists/qa/llm_function_rtx6k.txt | 13 ++ .../test_lists/test-db/l0_l40s.yml | 6 +- .../test_lists/test-db/l0_rtx_pro_6000.yml | 7 + tests/integration/test_lists/waives.txt | 4 +- 11 files changed, 215 insertions(+), 34 deletions(-) diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml index 068977b3de2..b5be618f80f 100644 --- a/tests/integration/defs/accuracy/references/gsm8k.yaml +++ b/tests/integration/defs/accuracy/references/gsm8k.yaml @@ -187,6 +187,10 @@ mistralai/Mistral-Small-3.1-24B-Instruct-2503: accuracy: 89.23 microsoft/Phi-4-multimodal-instruct: - accuracy: 81.19 + - quant_algo: FP8 + accuracy: 80.82 + - quant_algo: NVFP4 + accuracy: 69.33 microsoft/Phi-4-multimodal-instruct-long-rope: - accuracy: 75.85 microsoft/Phi-4-mini-instruct: diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml index f70baa59313..cfe0a0de894 100644 --- a/tests/integration/defs/accuracy/references/mmlu.yaml +++ b/tests/integration/defs/accuracy/references/mmlu.yaml @@ -294,6 +294,10 @@ mistralai/Ministral-8B-Instruct-2410: accuracy: 65.96 microsoft/Phi-4-multimodal-instruct: - accuracy: 69.69 + - quant_algo: FP8 + accuracy: 68.86 + - quant_algo: NVFP4 + accuracy: 64.04 
microsoft/Phi-4-multimodal-instruct-long-rope: - accuracy: 65.98 microsoft/phi-4: diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 3174e86cc9c..bbd0594df24 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -3314,6 +3314,24 @@ def test_auto_dtype_long_rope(self): task = GSM8K(model_name) task.evaluate(llm) + @skip_pre_blackwell + def test_fp4(self): + model_path = f"{self.MODEL_PATH}-FP4" + with LLM(model_path, max_seq_len=4096) as llm: + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) + + @skip_pre_hopper + def test_fp8(self): + model_path = f"{self.MODEL_PATH}-FP8" + with LLM(model_path, max_seq_len=4096) as llm: + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) + @skip_pre_hopper @pytest.mark.skip_less_device_memory(80000) diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index 2757434818d..4df17c3fe6c 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -127,6 +127,14 @@ "phi_4_multimodal_instruct": "multimodals/Phi-4-multimodal-instruct", "phi_4_multimodal_instruct_image": "multimodals/Phi-4-multimodal-instruct", "phi_4_multimodal_instruct_audio": "multimodals/Phi-4-multimodal-instruct", + "phi_4_multimodal_instruct_fp4_image": + "multimodals/Phi-4-multimodal-instruct-FP4", + "phi_4_multimodal_instruct_fp4_audio": + "multimodals/Phi-4-multimodal-instruct-FP4", + "phi_4_multimodal_instruct_fp8_image": + "multimodals/Phi-4-multimodal-instruct-FP8", + "phi_4_multimodal_instruct_fp8_audio": + "multimodals/Phi-4-multimodal-instruct-FP8", "bielik_11b_v2.2_instruct": "Bielik-11B-v2.2-Instruct", "bielik_11b_v2.2_instruct_fp8": "Bielik-11B-v2.2-Instruct-FP8", "mistral_small_v3.1_24b": 
"Mistral-Small-3.1-24B-Instruct-2503", @@ -177,6 +185,14 @@ "multimodals/Phi-4-multimodal-instruct/vision-lora", "phi_4_multimodal_instruct_audio": "multimodals/Phi-4-multimodal-instruct/speech-lora", + "phi_4_multimodal_instruct_fp4_image": + "multimodals/Phi-4-multimodal-instruct-FP4/vision-lora", + "phi_4_multimodal_instruct_fp4_audio": + "multimodals/Phi-4-multimodal-instruct-FP4/speech-lora", + "phi_4_multimodal_instruct_fp8_image": + "multimodals/Phi-4-multimodal-instruct-FP8/vision-lora", + "phi_4_multimodal_instruct_fp8_audio": + "multimodals/Phi-4-multimodal-instruct-FP8/speech-lora", } TIMING_CACHE_DIR = os.environ.get("TIMING_CACHE_DIR", "") diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index d5a669b01bd..6dbd1613ee7 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -2667,6 +2667,14 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path, ("qwen2.5-vl-7b-instruct", "Qwen2.5-VL-7B-Instruct", 0.8), ("phi4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct", 0.8), + pytest.param("phi4-multimodal-instruct-fp4", + "multimodals/Phi-4-multimodal-instruct-FP4", + 0.8, + marks=skip_pre_blackwell), + pytest.param("phi4-multimodal-instruct-fp8", + "multimodals/Phi-4-multimodal-instruct-FP8", + 0.8, + marks=skip_pre_hopper), pytest.param( "mistral-small-3.1-24b-instruct", "Mistral-Small-3.1-24B-Instruct-2503", @@ -2686,7 +2694,8 @@ def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv, print(f"Accuracy test {model_name} {modality} mode with example inputs.") if modality == "video" and model_name in { "llava-v1.6-mistral-7b", "mistral-small-3.1-24b-instruct", - "phi4-multimodal-instruct" + "phi4-multimodal-instruct", "phi4-multimodal-instruct-fp4", + "phi4-multimodal-instruct-fp8" }: pytest.skip(f"Skipping video modality test for {model_name}") @@ -2740,6 +2749,22 @@ def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv, ], 
] * num_same_requests, }, + "phi4-multimodal-instruct-fp4": { + "image": [ + [ + "image", "depicts", "natural", "environment", "ocean", + "water", "waves", "sky" + ], + ] * num_same_requests, + }, + "phi4-multimodal-instruct-fp8": { + "image": [ + [ + "image", "depicts", "natural", "environment", "ocean", + "water", "waves", "sky" + ], + ] * num_same_requests, + }, } cmd = [ @@ -2760,7 +2785,7 @@ def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv, ] and modality == "video": cmd.append("--max_num_tokens=16384") - if model_name == "phi4-multimodal-instruct": + if model_name.startswith("phi4-multimodal-instruct"): cmd.append("--max_seq_len=4096") cmd.append("--load_lora") cmd.append("--auto_model_name") @@ -2792,6 +2817,14 @@ def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv, ("qwen2.5-vl-7b-instruct", "Qwen2.5-VL-7B-Instruct", 0.8), ("phi4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct", 0.8), + pytest.param("phi4-multimodal-instruct-fp4", + "multimodals/Phi-4-multimodal-instruct-FP4", + 0.8, + marks=skip_pre_blackwell), + pytest.param("phi4-multimodal-instruct-fp8", + "multimodals/Phi-4-multimodal-instruct-FP8", + 0.8, + marks=skip_pre_hopper), pytest.param( "mistral-small-3.1-24b-instruct", "Mistral-Small-3.1-24B-Instruct-2503", @@ -2811,7 +2844,8 @@ def test_ptp_quickstart_multimodal_chunked_prefill(llm_root, llm_venv, print(f"Accuracy test {model_name} {modality} mode with example inputs.") if modality == "video" and model_name in { "llava-v1.6-mistral-7b", "mistral-small-3.1-24b-instruct", - "phi4-multimodal-instruct" + "phi4-multimodal-instruct", "phi4-multimodal-instruct-fp4", + "phi4-multimodal-instruct-fp8" }: pytest.skip(f"Skipping video modality test for {model_name}") accuracy_inputs = { @@ -2890,6 +2924,35 @@ def test_ptp_quickstart_multimodal_chunked_prefill(llm_root, llm_venv, ], ], }, + "phi4-multimodal-instruct-fp8": { + "image": [ + [ + "image", "depicts", "natural", "environment", "ocean", + 
"water", "waves", "sky" + ], + [ + "object", "mountain", "weather", "condition", "clear", + "visible" + ], + [ + "traffic", "condition", "road", "moderate", "vehicles", + "lanes", "cars", "bus" + ], + ], + }, + "phi4-multimodal-instruct-fp4": { + "image": [ + [ + "image", "depicts", "natural", "environment", "ocean", + "water", "waves", "sky" + ], + ["rock", "formation", "sunny", "sky", "clouds"], + [ + "traffic", "condition", "road", "moderate", "vehicles", + "lane", "flow", "traffic" + ], + ], + }, } cmd = [ @@ -2905,7 +2968,7 @@ def test_ptp_quickstart_multimodal_chunked_prefill(llm_root, llm_venv, "--enable_chunked_prefill", "--max_num_tokens=256", ] - if model_name == "phi4-multimodal-instruct": + if model_name.startswith("phi4-multimodal-instruct"): cmd.append("--max_seq_len=4096") cmd.append("--load_lora") cmd.append("--auto_model_name") @@ -2926,10 +2989,17 @@ def test_ptp_quickstart_multimodal_chunked_prefill(llm_root, llm_venv, @pytest.mark.parametrize("modality", ["image", "audio", "image_audio"]) -def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality): - model_name = "Phi-4-multimodal-instruct" - model_path = "multimodals/Phi-4-multimodal-instruct" - +@pytest.mark.parametrize("model_name,model_path", [ + ("phi4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct"), + pytest.param("phi4-multimodal-instruct-fp4", + "multimodals/Phi-4-multimodal-instruct-FP4", + marks=skip_pre_blackwell), + pytest.param("phi4-multimodal-instruct-fp8", + "multimodals/Phi-4-multimodal-instruct-FP8", + marks=skip_pre_hopper), +]) +def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, model_name, + model_path, modality): example_root = Path(os.path.join(llm_root, "examples", "llm-api")) test_data_root = Path( os.path.join(llm_models_root(), "multimodals", "test_data")) @@ -2983,6 +3053,11 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality): ], } + if model_name == "phi4-multimodal-instruct-fp4": + 
expected_keywords["image_audio"] = [ + ["image", "shows", "mountain", "El", "Capitan", "road", "trees"], + ] + cmd = [ str(example_root / "quickstart_multimodal.py"), "--model_dir", @@ -2998,8 +3073,6 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality): "--load_lora", "--auto_model_name", "Phi4MMForCausalLM", - # TODO: remove this once kv cache reuse is supported for Phi-4-multimodal - "--disable_kv_cache_reuse", ] output = llm_venv.run_cmd(cmd, caller=check_output) @@ -3022,7 +3095,13 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality): pytest.param( "gemma-3-27b-it", "gemma/gemma-3-27b-it", marks=skip_post_blackwell), ("mistral-small-3.1-24b-instruct", "Mistral-Small-3.1-24B-Instruct-2503"), - ("Phi-4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct"), + ("phi4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct"), + pytest.param("phi4-multimodal-instruct-fp4", + "multimodals/Phi-4-multimodal-instruct-FP4", + marks=skip_pre_blackwell), + pytest.param("phi4-multimodal-instruct-fp8", + "multimodals/Phi-4-multimodal-instruct-FP8", + marks=skip_pre_hopper), ]) def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name, model_path): @@ -3063,7 +3142,19 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name, ], ], }, - "Phi-4-multimodal-instruct": { + "phi4-multimodal-instruct": { + "image": [ + ["object", "mountain", "weather", "clear", "clouds"], + ["traffic", "road", "vehicles", "cars", "bus"], + ], + }, + "phi4-multimodal-instruct-fp4": { + "image": [ + ["object", "mountain", "weather", "clear", "clouds"], + ["traffic", "road", "vehicles", "cars", "bus"], + ], + }, + "phi4-multimodal-instruct-fp8": { "image": [ ["object", "mountain", "weather", "clear", "clouds"], ["traffic", "road", "vehicles", "cars", "bus"], @@ -3096,14 +3187,12 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name, cmd.append("--disable_kv_cache_reuse") 
cmd.append("--kv_cache_fraction=0.5") cmd.append("--max_seq_len=1024") - elif model_name == "Phi-4-multimodal-instruct": + elif model_name.startswith("phi4-multimodal-instruct"): # Set max_seq_len to 4096 to use short rope factor. cmd.append("--max_seq_len=4096") cmd.append("--load_lora") cmd.append("--auto_model_name") cmd.append("Phi4MMForCausalLM") - # TODO: remove this once kv cache reuse is supported for Phi-4-multimodal - cmd.append("--disable_kv_cache_reuse") elif model_name == "mistral-small-3.1-24b-instruct": # TODO: remove this once kv cache reuse is supported for Mistral cmd.append("--disable_kv_cache_reuse") @@ -3112,7 +3201,7 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name, # Set match ratio based on model match_ratio = 4.0 / 5 - if model_name == "Phi-4-multimodal-instruct": + if model_name.startswith("phi4-multimodal-instruct"): match_ratio = 0.6 # Check output accuracy @@ -3131,7 +3220,13 @@ @pytest.mark.skip_less_device_memory(80000) @pytest.mark.parametrize("model_name,model_path", [ ("mistral-small-3.1-24b-instruct", "Mistral-Small-3.1-24B-Instruct-2503"), - ("Phi-4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct"), + ("phi4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct"), + pytest.param("phi4-multimodal-instruct-fp4", + "multimodals/Phi-4-multimodal-instruct-FP4", + marks=skip_pre_blackwell), + pytest.param("phi4-multimodal-instruct-fp8", + "multimodals/Phi-4-multimodal-instruct-FP8", + marks=skip_pre_hopper), pytest.param( "gemma-3-27b-it", "gemma/gemma-3-27b-it", marks=skip_post_blackwell), ]) @@ -3179,6 +3274,18 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name, ["atmosphere", "serene", "sense", "scene", "majestic"], ], }, + "phi4-multimodal-instruct-fp4": { + "image": [ + ["depicts", "landscape", "mountain", "half", "dome"], + ["atmosphere", "serene", "sense", "scene", "majestic"], + 
}, + "phi4-multimodal-instruct-fp8": { + "image": [ + ["depicts", "landscape", "mountain", "half", "dome"], + ["atmosphere", "serene", "sense", "scene", "majestic"], + ], + }, } # Build command for image modality cmd = [ @@ -3205,14 +3312,12 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name, cmd.append("--kv_cache_fraction=0.5") cmd.append("--max_seq_len=1024") - elif model_name == "Phi-4-multimodal-instruct": + elif model_name.startswith("phi4-multimodal-instruct"): # Set max_seq_len to 4096 to use short rope factor. cmd.append("--max_seq_len=4096") cmd.append("--load_lora") cmd.append("--auto_model_name") cmd.append("Phi4MMForCausalLM") - # TODO: remove this once kv cache reuse is supported for Phi-4 - cmd.append("--disable_kv_cache_reuse") elif model_name == "mistral-small-3.1-24b-instruct": # TODO: remove this once kv cache reuse is supported for Mistral @@ -3222,7 +3327,7 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name, print("output:", output) # Set match ratio based on model match_ratio = 4.0 / 5 - if model_name == "Phi-4-multimodal-instruct": + if model_name.startswith("phi4-multimodal-instruct"): match_ratio = 0.6 # Check output accuracy diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index 779094c6ee6..ef4fef76c09 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -599,6 +599,8 @@ accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8 accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope +accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp4 +accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8 accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8 @@ -664,22 +666,32 @@ test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it- test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image] test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-0.8-image] test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-0.8-image] +test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-0.8-image] +test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-0.8-image] test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-0.8-image] test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-0.8-video] test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image] test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-0.8-image] test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-0.8-image] +test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-0.8-image] +test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-0.8-image] test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-0.8-video] 
test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-0.8-image] -test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[audio] -test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[image] -test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[image_audio] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image_audio] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-image_audio] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-image_audio] test_e2e.py::test_ptp_quickstart_multimodal_2gpu[gemma-3-27b-it-gemma/gemma-3-27b-it] test_e2e.py::test_ptp_quickstart_multimodal_2gpu[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503] -test_e2e.py::test_ptp_quickstart_multimodal_2gpu[Phi-4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct] +test_e2e.py::test_ptp_quickstart_multimodal_2gpu[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct] +test_e2e.py::test_ptp_quickstart_multimodal_2gpu[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8] +test_e2e.py::test_ptp_quickstart_multimodal_2gpu[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4] test_e2e.py::test_ptp_quickstart_multimodal_multiturn[gemma-3-27b-it-gemma/gemma-3-27b-it] test_e2e.py::test_ptp_quickstart_multimodal_multiturn[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503] -test_e2e.py::test_ptp_quickstart_multimodal_multiturn[Phi-4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct] 
+test_e2e.py::test_ptp_quickstart_multimodal_multiturn[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct] +test_e2e.py::test_ptp_quickstart_multimodal_multiturn[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8] +test_e2e.py::test_ptp_quickstart_multimodal_multiturn[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4] test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] diff --git a/tests/integration/test_lists/qa/llm_function_l20.txt b/tests/integration/test_lists/qa/llm_function_l20.txt index c95aa0ab7d2..3f046845d25 100644 --- a/tests/integration/test_lists/qa/llm_function_l20.txt +++ b/tests/integration/test_lists/qa/llm_function_l20.txt @@ -41,6 +41,8 @@ accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8 accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope +accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp4 +accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8 accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype @@ -53,9 +55,9 @@ test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-image-True] test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-video-False] test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-video-True] -test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[audio] 
-test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[image] -test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[image_audio] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image_audio] test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] diff --git a/tests/integration/test_lists/qa/llm_function_rtx6k.txt b/tests/integration/test_lists/qa/llm_function_rtx6k.txt index 9f0746697a2..ed257078678 100644 --- a/tests/integration/test_lists/qa/llm_function_rtx6k.txt +++ b/tests/integration/test_lists/qa/llm_function_rtx6k.txt @@ -19,6 +19,10 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUT accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4 +accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope +accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp4 +accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] 
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False] @@ -41,3 +45,12 @@ test_e2e.py::test_ptp_quickstart_advanced_2gpus_sm120[Llama3.1-70B-BF16-llama-3. test_e2e.py::test_ptp_quickstart_advanced_2gpus_sm120[Nemotron-Super-49B-v1-BF16-nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1] test_e2e.py::test_ptp_quickstart_advanced_2gpus_sm120[Mixtral-8x7B-BF16-Mixtral-8x7B-Instruct-v0.1] test_e2e.py::test_ptp_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-0528-FP4-DeepSeek-R1/DeepSeek-R1-0528-FP4] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image_audio] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-image] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-audio] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-image_audio] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-image] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-audio] +test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-image_audio] diff --git a/tests/integration/test_lists/test-db/l0_l40s.yml b/tests/integration/test_lists/test-db/l0_l40s.yml index 5c31cef019f..847bb1d63be 100644 --- 
a/tests/integration/test_lists/test-db/l0_l40s.yml +++ b/tests/integration/test_lists/test-db/l0_l40s.yml @@ -22,9 +22,9 @@ l0_l40s: - test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] - test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-video-False] - test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] - - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[audio] - - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[image] - - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[image_audio] + - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio] + - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image] + - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image_audio] - accuracy/test_llm_api_pytorch.py::TestQwen2_VL_7B::test_auto_dtype - condition: ranges: diff --git a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml index 1c65ff76021..a27520d2d93 100644 --- a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml +++ b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml @@ -35,9 +35,14 @@ l0_rtx_pro_6000: - test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B_nvfp4_hf-Qwen3/saved_models_Qwen3-30B-A3B_nvfp4_hf] # 2mins - test_e2e.py::test_ptp_quickstart_advanced[GPT-OSS-20B-gpt_oss/gpt-oss-20b] - test_e2e.py::test_ptp_quickstart_advanced[GPT-OSS-120B-gpt_oss/gpt-oss-120b] + - test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-0.8-image] + - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-image_audio] + - 
test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-image_audio] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] # 8mins - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True] # 8 mins - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto] + - accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp4 + - accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8 - condition: ranges: @@ -102,3 +107,5 @@ l0_rtx_pro_6000: # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False] # failed - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=True] + - test_e2e.py::test_ptp_quickstart_multimodal_2gpu[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8] + - test_e2e.py::test_ptp_quickstart_multimodal_2gpu[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4] diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 774fea69920..0316a18f719 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -339,8 +339,8 @@ examples/test_ngram.py::test_llm_ngram_1gpu[streaming-gpt2-use_cpp_session-use_t examples/test_ngram.py::test_llm_ngram_1gpu[streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs2] SKIP (https://nvbugs/5546507) accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=True] SKIP (https://nvbugs/5546510) examples/serve/test_serve.py::test_extra_llm_api_options SKIP (https://nvbugs/5546510) 
-test_e2e.py::test_ptp_quickstart_multimodal_multiturn[Phi-4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct] SKIP (https://nvbugs/5547437) -test_e2e.py::test_ptp_quickstart_multimodal_2gpu[Phi-4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct] SKIP (https://nvbugs/5547435) +test_e2e.py::test_ptp_quickstart_multimodal_multiturn[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct] SKIP (https://nvbugs/5547437) +test_e2e.py::test_ptp_quickstart_multimodal_2gpu[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct] SKIP (https://nvbugs/5547435) test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-0.8-image] SKIP (https://nvbugs/5547434) test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-0.8-video] SKIP (https://nvbugs/5547434) cpp/test_e2e.py::test_benchmarks[gpt-80] SKIP (https://nvbugs/5550689) From 47242c0d07f863078d7254a0d475d5f70929c703 Mon Sep 17 00:00:00 2001 From: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> Date: Wed, 15 Oct 2025 08:57:57 +0000 Subject: [PATCH 3/4] fix Signed-off-by: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> --- tests/integration/defs/test_e2e.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index 6dbd1613ee7..8e7b5c690ce 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -3268,19 +3268,19 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name, ["atmosphere", "serene", "majestic", "clear", "sky", "trees"], ], }, - "Phi-4-multimodal-instruct": { + "phi4-multimodal-instruct": { "image": [ ["depicts", "landscape", "mountain", "half", "dome"], ["atmosphere", "serene", "sense", "scene", "majestic"], ], }, - "Phi-4-multimodal-instruct-fp4": { + "phi4-multimodal-instruct-fp4": { "image": [ ["depicts", 
"landscape", "mountain", "half", "dome"], ["atmosphere", "serene", "sense", "scene", "majestic"], ], }, - "Phi-4-multimodal-instruct-fp8": { + "phi4-multimodal-instruct-fp8": { "image": [ ["depicts", "landscape", "mountain", "half", "dome"], ["atmosphere", "serene", "sense", "scene", "majestic"], @@ -3312,7 +3312,7 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name, cmd.append("--kv_cache_fraction=0.5") cmd.append("--max_seq_len=1024") - elif model_name.startswith("Phi-4-multimodal-instruct"): + elif model_name.startswith("phi4-multimodal-instruct"): # Set max_seq_len to 4096 to use short rope factor. cmd.append("--max_seq_len=4096") cmd.append("--load_lora") From 872c7b22bc491a954cc8aa5b0a592515ded08ffc Mon Sep 17 00:00:00 2001 From: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> Date: Mon, 20 Oct 2025 01:53:17 +0000 Subject: [PATCH 4/4] update var name Signed-off-by: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> --- tensorrt_llm/_torch/models/modeling_phi4mm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorrt_llm/_torch/models/modeling_phi4mm.py b/tensorrt_llm/_torch/models/modeling_phi4mm.py index b9736afd829..f80d09da078 100644 --- a/tensorrt_llm/_torch/models/modeling_phi4mm.py +++ b/tensorrt_llm/_torch/models/modeling_phi4mm.py @@ -988,14 +988,14 @@ def load_weights(self, weights): weights = {k: v for k, v in weights.items() if '.lora_' not in k} # Rename base layer weights. updated_weights = {} - base_layers = [ + base_layer_weight_names = [ 'weight', 'input_scale', 'weight_scale', 'weight_scale_2' ] for k in weights.keys(): new_k = k - for layer in base_layers: - if f'base_layer.{layer}' in k: - new_k = k.replace(f'base_layer.{layer}', layer) + for weight_name in base_layer_weight_names: + if f'base_layer.{weight_name}' in k: + new_k = k.replace(f'base_layer.{weight_name}', weight_name) break updated_weights[new_k] = weights[k] weights = updated_weights