 version: 0.0.1
 llm_perf_sanity:
+# A100, L40S, L20, H20, H100, H200, Blackwell
 - condition:
     ranges:
       system_gpu_count:
         gte: 1
-    wildcards:
-      gpu:
-      - '*h100*'
-      - '*h200*'
-      - '*a100*'
-      - '*l40s*'
-      - '*l20*'
-      - '*h20*'
   tests:
   # E2E trtllm-bench

   # llama_v3.1_8b_instruct
   # pytorch backend
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:128,128]
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:512,32]
+  # starcoder2_7b
+  # trt backend
+  - perf/test_perf.py::test_perf[starcoder2_7b-bench-bfloat16-input_output_len:512,512]
+  # llama_v3.1_8b
+  # pytorch backend
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:500,2000]
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:512,32]
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:500,2000]
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:512,32]
-  # pytorch backend
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:128,128]
   # Phi-4-multimodal-instruct
   - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:128,128]
@@ -32,22 +26,17 @@ llm_perf_sanity:
   # Ministral-8B
   - perf/test_perf.py::test_perf[ministral_8b-bench-pytorch-bfloat16-input_output_len:500,2000-reqs:500-con:250]
   - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-pytorch-bfloat16-input_output_len:500,2000]
+  - perf/test_perf.py::test_perf[nemotron_nano_9b_v2-bench-pytorch-bfloat16-input_output_len:512,512]

   # Test list validation
   - test_list_validation.py::test_list_validation


 # FP8 specific tests
+# L40S, L20, H20, H100, H200, Blackwell
 - condition:
     terms:
       supports_fp8: true
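+      # i.e. any Ada-, Hopper- or Blackwell-class GPU; replaces the per-GPU wildcard list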
-    wildcards:
-      gpu:
-      - '*h100*'
-      - '*h200*'
-      - '*l40s*'
-      - '*l20*'
-      - '*h20*'
   tests:
   # llama_v3.1_8b_instruct_fp8
   # pytorch backend
@@ -59,19 +48,12 @@ llm_perf_sanity:
   - perf/test_perf.py::test_perf[bielik_11b_v2.2_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-con:250]
   - perf/test_perf.py::test_perf[ministral_8b_fp8-bench-pytorch-float8-input_output_len:500,2000-reqs:500-con:250]

-# Tests for systems with 2+ GPUs
+# Tests for ALL systems with 2+ GPUs
+# A100, L40S, L20, H20, H100, H200, Blackwell
 - condition:
     ranges:
       system_gpu_count:
         gte: 2
-    wildcards:
-      gpu:
-      - '*h100*'
-      - '*h200*'
-      - '*a100*'
-      - '*l40s*'
-      - '*l20*'
-      - '*h20*'
   tests:
   # llama_v3.1_8b_instruct
   # pytorch backend
@@ -81,53 +63,38 @@ llm_perf_sanity:
   - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:2]

 # FP8 tests for systems with 2+ GPUs
+# L40S, L20, H20, H100, H200, Blackwell
 - condition:
     terms:
       supports_fp8: true
     ranges:
       system_gpu_count:
         gte: 2
-    wildcards:
-      gpu:
-      - '*h100*'
-      - '*h200*'
-      - '*l40s*'
-      - '*l20*'
-      - '*h20*'
+
   tests:
   # mixtral_8x7b_v0.1_fp8 pytorch backend
   - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:2]

 # Tests for systems with 2+ GPUs and high memory
+# A100, H20, H100, H200, Blackwell
 - condition:
     ranges:
       system_gpu_count:
         gte: 2
       gpu_memory:
         gt: 80000
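+        # assumed to be MB: > 80000 keeps only 80 GB-class GPUs (excludes 48 GB cards such as L40S)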
-    wildcards:
-      gpu:
-      - '*h100*'
-      - '*h200*'
-      - '*a100*'
-      - '*l40s*'
-      - '*h20*'
+
   tests:
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:2]
   - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-pytorch-float16-input_output_len:128,128-gpus:2]

 # Tests for systems with 4+ GPUs
+# A100, L40S, H20, H100, H200, Blackwell
 - condition:
     ranges:
       system_gpu_count:
         gte: 4
-    wildcards:
-      gpu:
-      - '*h100*'
-      - '*h200*'
-      - '*a100*'
-      - '*l40s*'
-      - '*h20*'
+
   tests:
   # llama_v3.1_70b
   # trt backend
@@ -136,37 +103,28 @@ llm_perf_sanity:
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4]

 # FP8 specific tests
+# L40S, H20, H100, H200, Blackwell
 - condition:
     terms:
       supports_fp8: true
     ranges:
       system_gpu_count:
         gte: 4
-    wildcards:
-      gpu:
-      - '*h100*'
-      - '*h200*'
-      - '*l40s*'
-      - '*h20*'
+
   tests:
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]

 # Tests for systems with 8+ GPUs
+# A100, L40S, H20, H100, H200, Blackwell
 - condition:
     ranges:
       system_gpu_count:
         gte: 8
       gpu_memory:
         gt: 46000
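+        # assumed MB: > 46000 admits 48 GB-class GPUs such as L40S and anything larger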
-    wildcards:
-      gpu:
-      - '*h100*'
-      - '*h200*'
-      - '*a100*'
-      - '*l40s*'
-      - '*h20*'
+
   tests:
   # llama_v3.1_70b
   # pytorch backend
@@ -176,18 +134,13 @@ llm_perf_sanity:
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8]

 # FP8 tests for systems with 8+ GPUs
+# L40S, H20, H100, H200, Blackwell
 - condition:
     terms:
       supports_fp8: true
     ranges:
       system_gpu_count:
         gte: 8
-    wildcards:
-      gpu:
-      - '*h100*'
-      - '*h200*'
-      - '*l40s*'
-      - '*h20*'

   tests:
   # llama_v3.1_70b
@@ -199,17 +152,14 @@ llm_perf_sanity:
   - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-input_output_len:128,128-reqs:64-gpus:8]


+# FP4, FP8 tests for systems with 8+ GPUs
+# H20, H100, H200, Blackwell
 - condition:
-    terms:
-      supports_fp8: true
     ranges:
       system_gpu_count:
         gte: 8
-    wildcards:
-      gpu:
-      - '*h100*'
-      - '*h200*'
-      - '*h20*'
+      compute_capability:
+        gte: 9.0
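+        # compute capability 9.0+ = Hopper (H20, H100, H200) and Blackwell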

   tests:
   - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-input_output_len:128,128]
@@ -224,3 +174,5 @@ llm_perf_sanity:
   - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8] TIMEOUT(60)
   - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-disagg_server-ctx_dp:4-gen_tp:4]
   - perf/test_perf.py::test_perf[llama_v3.1_8b-disagg_server-ctx_dp:4-gen_tp:4]
+  # gpt_oss_20b_fp4
+  - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-input_output_len:512,512]