 version: 0.0.1
 llm_perf_sanity:
+# A100, L40S, L20, H20, H100, H200, Blackwell
 - condition:
     ranges:
       system_gpu_count:
         gte: 1
-    wildcards:
-      gpu:
-      - '*h100*'
-      - '*h200*'
-      - '*a100*'
-      - '*l40s*'
-      - '*l20*'
-      - '*h20*'
   tests:
   # E2E trtllm-bench

   # llama_v3.1_8b_instruct
   # pytorch backend
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:128,128]
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:512,32]
+  # starcoder2_7b
+  # trt backend
+  - perf/test_perf.py::test_perf[starcoder2_7b-bench-bfloat16-input_output_len:512,512]
+  # llama_v3.1_8b
+  # pytorch backend
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:500,2000]
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:512,32]
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:500,2000]
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:512,32]
-  # pytorch backend
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:128,128]
   # Phi-4-multimodal-instruct
   - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:128,128]
@@ -32,22 +26,17 @@ llm_perf_sanity:
   # Ministral-8B
   - perf/test_perf.py::test_perf[ministral_8b-bench-pytorch-bfloat16-input_output_len:500,2000-reqs:500-con:250]
   - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-pytorch-bfloat16-input_output_len:500,2000]
+  - perf/test_perf.py::test_perf[nemotron_nano_9b_v2-bench-pytorch-bfloat16-input_output_len:512,512]

   # Test list validation
   - test_list_validation.py::test_list_validation


 # FP8 specific tests
+# L40S, L20, H20, H100, H200, Blackwell
 - condition:
     terms:
       supports_fp8: true
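+      # i.e. any Ada-, Hopper- or Blackwell-class GPU; replaces the per-GPU wildcard list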
-    wildcards:
-      gpu:
-      - '*h100*'
-      - '*h200*'
-      - '*l40s*'
-      - '*l20*'
-      - '*h20*'
   tests:
   # llama_v3.1_8b_instruct_fp8
   # pytorch backend
@@ -59,19 +48,12 @@ llm_perf_sanity:
   - perf/test_perf.py::test_perf[bielik_11b_v2.2_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-con:250]
   - perf/test_perf.py::test_perf[ministral_8b_fp8-bench-pytorch-float8-input_output_len:500,2000-reqs:500-con:250]

-# Tests for systems with 2+ GPUs
+# Tests for ALL systems with 2+ GPUs
+# A100, L40S, L20, H20, H100, H200, Blackwell
 - condition:
     ranges:
       system_gpu_count:
         gte: 2
-    wildcards:
-      gpu:
-      - '*h100*'
-      - '*h200*'
-      - '*a100*'
-      - '*l40s*'
-      - '*l20*'
-      - '*h20*'
   tests:
   # llama_v3.1_8b_instruct
   # pytorch backend
@@ -81,53 +63,38 @@ llm_perf_sanity:
   - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:2]

 # FP8 tests for systems with 2+ GPUs
+# L40S, L20, H20, H100, H200, Blackwell
 - condition:
     terms:
       supports_fp8: true
     ranges:
       system_gpu_count:
         gte: 2
-    wildcards:
-      gpu:
-      - '*h100*'
-      - '*h200*'
-      - '*l40s*'
-      - '*l20*'
-      - '*h20*'
+
   tests:
   # mixtral_8x7b_v0.1_fp8 pytorch backend
   - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:2]

 # Tests for systems with 2+ GPUs and high memory
+# A100, H20, H100, H200, Blackwell
 - condition:
     ranges:
       system_gpu_count:
         gte: 2
       gpu_memory:
         gt: 80000
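+        # assumed to be MB: > 80000 keeps only 80 GB-class GPUs (excludes 48 GB cards such as L40S)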
-    wildcards:
-      gpu:
-      - '*h100*'
-      - '*h200*'
-      - '*a100*'
-      - '*l40s*'
-      - '*h20*'
+
   tests:
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:2]
   - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-pytorch-float16-input_output_len:128,128-gpus:2]

 # Tests for systems with 4+ GPUs
+# A100, L40S, H20, H100, H200, Blackwell
 - condition:
     ranges:
       system_gpu_count:
         gte: 4
-    wildcards:
-      gpu:
-      - '*h100*'
-      - '*h200*'
-      - '*a100*'
-      - '*l40s*'
-      - '*h20*'
+
   tests:
   # llama_v3.1_70b
   # trt backend
@@ -136,37 +103,28 @@ llm_perf_sanity:
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4]

 # FP8 specific tests
+# L40S, H20, H100, H200, Blackwell
 - condition:
     terms:
       supports_fp8: true
     ranges:
       system_gpu_count:
         gte: 4
-    wildcards:
-      gpu:
-      - '*h100*'
-      - '*h200*'
-      - '*l40s*'
-      - '*h20*'
+
   tests:
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]

 # Tests for systems with 8+ GPUs
+# A100, L40S, H20, H100, H200, Blackwell
 - condition:
     ranges:
       system_gpu_count:
         gte: 8
       gpu_memory:
         gt: 46000
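+        # assumed MB: > 46000 admits 48 GB-class GPUs such as L40S and anything larger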
-    wildcards:
-      gpu:
-      - '*h100*'
-      - '*h200*'
-      - '*a100*'
-      - '*l40s*'
-      - '*h20*'
+
   tests:
   # llama_v3.1_70b
   # pytorch backend
@@ -176,18 +134,13 @@ llm_perf_sanity:
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8]

 # FP8 tests for systems with 8+ GPUs
+# L40S, H20, H100, H200, Blackwell
 - condition:
     terms:
       supports_fp8: true
     ranges:
       system_gpu_count:
         gte: 8
-    wildcards:
-      gpu:
-      - '*h100*'
-      - '*h200*'
-      - '*l40s*'
-      - '*h20*'

   tests:
   # llama_v3.1_70b
@@ -199,17 +152,14 @@ llm_perf_sanity:
   - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-input_output_len:128,128-reqs:64-gpus:8]


+# FP4, FP8 tests for systems with 8+ GPUs
+# H20, H100, H200, Blackwell
 - condition:
-    terms:
-      supports_fp8: true
     ranges:
       system_gpu_count:
         gte: 8
-    wildcards:
-      gpu:
-      - '*h100*'
-      - '*h200*'
-      - '*h20*'
+      compute_capability:
+        gte: 9.0
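+        # compute capability 9.0+ = Hopper (H20, H100, H200) and Blackwell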

   tests:
   - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-input_output_len:128,128]
@@ -224,3 +174,5 @@ llm_perf_sanity:
   - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8] TIMEOUT(60)
   - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-disagg_server-ctx_dp:4-gen_tp:4]
   - perf/test_perf.py::test_perf[llama_v3.1_8b-disagg_server-ctx_dp:4-gen_tp:4]
+  # gpt_oss_20b_fp4
+  - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-input_output_len:512,512]