tests: [TRTQA-2905] improve timeout report for qa test cases #4753

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged: 7 commits, Jun 3, 2025

7 changes: 4 additions & 3 deletions tests/integration/defs/examples/test_llama.py
@@ -30,9 +30,9 @@
                          venv_mpi_check_call)
 # yapf: disable
 from defs.conftest import (get_device_count, get_device_memory,
-                           get_host_total_memory, skip_fp8_pre_ada,
-                           skip_no_nvls, skip_post_blackwell, skip_pre_ada,
-                           skip_pre_blackwell)
+                           get_host_total_memory, get_sm_version,
+                           skip_fp8_pre_ada, skip_no_nvls, skip_post_blackwell,
+                           skip_pre_ada, skip_pre_blackwell)
 # yapf: enable
 from defs.trt_test_alternative import check_call, exists

@@ -3022,6 +3022,7 @@ def test_llm_llama_v3_8b_1048k_long_context_ppl(llama_example_root,
     'Llama-3-8B-Instruct-Gradient-1048k', 'Llama-3-70B-Instruct-Gradient-1048k'
 ],
                          indirect=True)
+@pytest.mark.timeout(10800 if get_sm_version() < 89 else 3600)
 def test_llm_llama_v3_1m_long_context_8gpus(llama_example_root,
                                             llama_model_root, llm_venv,
                                             engine_dir, cmodel_dir):
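
Note on the mark above: the timeout expression is evaluated when pytest imports the module at collection time, so get_sm_version() fixes the budget per machine before the test runs. A minimal standalone sketch of the selection logic (the architecture examples in the comments are assumptions, not from the diff):

# Sketch of the conditional timeout above; values are in seconds.
def pick_timeout(sm_version: int) -> int:
    # SM < 89 is pre-Ada hardware, which presumably runs this 1M-context
    # test much more slowly, so it gets 3 hours instead of 1.
    return 10800 if sm_version < 89 else 3600

assert pick_timeout(80) == 10800  # e.g. A100 (SM 80): 3-hour budget
assert pick_timeout(90) == 3600   # e.g. H100 (SM 90): 1-hour budget
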
20 changes: 19 additions & 1 deletion tests/integration/defs/examples/test_mistral.py
@@ -13,22 +13,40 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Module test_mistral test mistral examples."""
+import multiprocessing
 import platform
 
+import psutil
 import pytest
 from defs.common import (convert_weights, quantize_data,
                          test_multi_lora_support, venv_check_call)
 from defs.conftest import skip_pre_ada
 from defs.trt_test_alternative import check_call
 
 
+def get_optimal_jobs():
+    cpu_count = multiprocessing.cpu_count()
+    available_memory = psutil.virtual_memory().available / (1024 * 1024 * 1024)
+    memory_per_job = 4
+    memory_based_jobs = int(available_memory / memory_per_job)
+    system_load = psutil.getloadavg()[0] / cpu_count
+    if system_load > 0.7:
+        cpu_factor = 0.5
+    else:
+        cpu_factor = 0.75
+    cpu_based_jobs = max(1, int(cpu_count * cpu_factor))
+    optimal_jobs = max(1, min(cpu_based_jobs, memory_based_jobs))
+    return optimal_jobs
+
+
 @pytest.fixture(autouse=True, scope="module")
 def mistral_example_root(llm_venv):
     if platform.system() != "Windows":
         # https://github.com/Dao-AILab/flash-attention/issues/345
         # No wheel for flash-attn on windows and compilation fails locally.
+        max_jobs = get_optimal_jobs()
         install_cmd = [
-            "MAX_JOBS=4",
+            f"MAX_JOBS={max_jobs}",
             "python3",
             "-m",
             "pip",
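
The get_optimal_jobs() helper above replaces the hard-coded MAX_JOBS=4 for the flash-attn source build: it takes the smaller of a CPU bound (75% of cores, dropping to 50% when the per-core 1-minute load average exceeds 0.7) and a memory bound (roughly 4 GiB of available RAM per compile job), never going below one job. A worked example with illustrative machine sizes (the numbers are assumptions, not from the PR):

# Standalone restatement of the heuristic with sample inputs.
def optimal_jobs(cpu_count: int, available_gib: float, load_avg_1m: float) -> int:
    memory_based = int(available_gib / 4)  # budget ~4 GiB of RAM per job
    cpu_factor = 0.5 if load_avg_1m / cpu_count > 0.7 else 0.75
    cpu_based = max(1, int(cpu_count * cpu_factor))
    return max(1, min(cpu_based, memory_based))

print(optimal_jobs(64, 128, 8.0))   # idle 64-core box: min(48, 32) -> 32
print(optimal_jobs(64, 128, 50.0))  # loaded 64-core box: min(32, 32) -> 32
print(optimal_jobs(16, 8, 2.0))     # small box: min(12, 2) -> 2
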
29 changes: 27 additions & 2 deletions tests/integration/defs/trt_test_alternative.py
@@ -203,11 +203,12 @@ def call(*popenargs,
          **kwargs):
     if not suppress_output_info:
         print(f"Start subprocess with call({popenargs}, {kwargs})")
+    actual_timeout = get_pytest_timeout(timeout)
     with popen(*popenargs,
                start_new_session=start_new_session,
                suppress_output_info=True,
                **kwargs) as p:
-        return p.wait(timeout=timeout)
+        return p.wait(timeout=actual_timeout)
 
 
 def check_call(*popenargs, **kwargs):
@@ -223,12 +224,13 @@ def check_call(*popenargs, **kwargs):
 
 def check_output(*popenargs, timeout=None, start_new_session=True, **kwargs):
     print(f"Start subprocess with check_output({popenargs}, {kwargs})")
+    actual_timeout = get_pytest_timeout(timeout)
     with Popen(*popenargs,
                stdout=subprocess.PIPE,
                start_new_session=start_new_session,
                **kwargs) as process:
         try:
-            stdout, stderr = process.communicate(None, timeout=timeout)
+            stdout, stderr = process.communicate(None, timeout=actual_timeout)
         except subprocess.TimeoutExpired as exc:
             cleanup_process_tree(process, start_new_session)
             if is_windows():
@@ -303,3 +305,26 @@ def check_call_negative_test(*popenargs, **kwargs):
         f"Subprocess expected to fail with check_call_negative_test({popenargs}, {kwargs}), but passed."
     )
     raise subprocess.CalledProcessError(1, cmd)
+
+
+def get_pytest_timeout(timeout=None):
+    try:
+        import pytest
+        marks = None
+        try:
+            current_item = pytest.current_test
+            if hasattr(current_item, 'iter_markers'):
+                marks = list(current_item.iter_markers('timeout'))
+        except (AttributeError, NameError):
+            pass
+
+        if marks and len(marks) > 0:
+            timeout_mark = marks[0]
+            timeout_pytest = timeout_mark.args[0] if timeout_mark.args else None
+            if timeout_pytest and isinstance(timeout_pytest, (int, float)):
+                return max(30, int(timeout_pytest * 0.9))
+
+    except (ImportError, Exception) as e:
+        print(f"Error getting pytest timeout: {e}")
+
+    return timeout
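
get_pytest_timeout() is the core of this change: it caps a subprocess timeout at 90% of the test's @pytest.mark.timeout value (with a 30-second floor), so the subprocess raises TimeoutExpired and gets cleaned up and reported before the pytest-timeout plugin hard-kills the whole test. Under a 3600-second mark, for example, get_pytest_timeout(None) returns 3240, leaving about six minutes for teardown and a readable failure report. Note that pytest.current_test is not a built-in pytest attribute; the sketch below shows one way a conftest.py hook could publish it, as an assumption about how the harness wires this up rather than something shown in this diff:

import pytest

# Hypothetical conftest.py wiring: publish the running test item so that
# helpers outside the test body (like get_pytest_timeout) can read its marks.
@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_protocol(item, nextitem):
    pytest.current_test = item   # item.iter_markers('timeout') now resolves
    yield                        # run setup, call, and teardown as usual
    pytest.current_test = None   # do not leak the item across tests
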
28 changes: 14 additions & 14 deletions tests/integration/test_lists/qa/examples_test_list.txt
@@ -14,12 +14,12 @@ examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-chat-ena
 examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-enable_weight_only]
 examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[disable_weight_only]
 examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[enable_weight_only]
-examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[disable_weight_only]
-examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[enable_weight_only]
+examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[disable_weight_only] TIMEOUT (40)
+examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[enable_weight_only] TIMEOUT (40)
 examples/test_eagle.py::test_llm_eagle_1gpu_modelopt_ckpt[llama3.1-eagle-8b-hf_v0.5-float16-bs8]
 examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle1]
 examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle2]
-examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-bart-large-cnn-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8]
+examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-bart-large-cnn-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] TIMEOUT (60)
 examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-byt5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8]
 examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-disable_fp8]
 examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:2-nb:1-enable_fp8]
@@ -70,7 +70,7 @@ examples/test_gpt.py::test_starcoder_fp8_quantization_2gpu[starcoder]
 examples/test_gpt.py::test_starcoder_fp8_quantization_2gpu[starcoderplus]
 examples/test_gpt.py::test_starcoder_fp8_quantization_2gpu[starcoder2]
 examples/test_llama.py::test_mistral_nemo_fp8_with_bf16_lora[Mistral-Nemo-12b-Base]
-examples/test_mistral.py::test_mistral_nemo_minitron_fp8_with_bf16_lora[Mistral-NeMo-Minitron-8B-Instruct]
+examples/test_mistral.py::test_mistral_nemo_minitron_fp8_with_bf16_lora[Mistral-NeMo-Minitron-8B-Instruct] TIMEOUT (40)
 examples/test_phi.py::test_phi_fp8_with_bf16_lora[phi-2]
 examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-mini-128k-instruct]
 examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-small-128k-instruct]
@@ -110,9 +110,9 @@ examples/test_llama.py::test_llm_llama_code_llama_1gpu_summary[CodeLlama-7b-Inst
 examples/test_llama.py::test_llm_llama_code_llama_1gpu_summary[CodeLlama-7b-Instruct-enable_with_fp32_acc-enable_gemm_plugin-enable_attention_plugin-nb:1]
 examples/test_llama.py::test_llm_llama_code_llama_multi_gpus_summary[CodeLlama-34b-Instruct-tp4pp1-nb:4]
 examples/test_llama.py::test_llm_llama_code_llama_multi_gpus_summary[CodeLlama-70b-hf-tp2pp2-nb:1]
-examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-34b-Instruct-tp2pp2-int4_awq-nb:4]
+examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-34b-Instruct-tp2pp2-int4_awq-nb:4] TIMEOUT (40)
 examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-34b-Instruct-tp4pp1-fp8-nb:1]
-examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-70b-hf-tp2pp2-int4_awq-nb:1]
+examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-70b-hf-tp2pp2-int4_awq-nb:1] TIMEOUT (40)
 examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-70b-hf-tp4pp1-fp8-nb:4]
 examples/test_llama.py::test_codellama_fp8_with_bf16_lora[CodeLlama-7b-Instruct]
 examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-v2-7b-hf]
@@ -135,11 +135,11 @@ examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-lla
 examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_int8_wo]
 examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_sq_ootb]
 examples/test_llama.py::test_llm_llama_v2_lora_benchmark_2gpu[chinese_lora-llama-v2-13b-hf]
-examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-405b-enable_fp8]
-examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-405b-fp8-disable_fp8]
-examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-70b-disable_fp8]
+examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-405b-enable_fp8] TIMEOUT (120)
+examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-405b-fp8-disable_fp8] TIMEOUT (90)
+examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-70b-disable_fp8] TIMEOUT (40)
 examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[disable_gemm_allreduce_plugin-llama-3.1-70b-enable_fp8]
-examples/test_llama.py::test_llm_llama_v3_1m_long_context_8gpus[Llama-3-8B-Instruct-Gradient-1048k]
+examples/test_llama.py::test_llm_llama_v3_1m_long_context_8gpus[Llama-3-8B-Instruct-Gradient-1048k] TIMEOUT (180)
 examples/test_llama.py::test_llm_llama_v3_dora_1gpu[commonsense-llama-v3-8b-dora-r32-llama-v3-8b-hf-base_fp16]
 examples/test_llama.py::test_llm_llama_1gpu_fp4[llama-3.1-70b-instruct-enable_norm_quant_fusion-enable_fused_quant-fp4_plugin-bfloat16]
 examples/test_llama.py::test_llm_llama_2gpu_fp4[llama-3.1-70b-instruct-fp4_plugin]
@@ -216,7 +216,7 @@ examples/test_phi.py::test_llm_phi_quantization_1gpu[phi-2-fp8-bfloat16]
 examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-mini-128k-instruct-fp8-float16]
 examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-small-128k-instruct-fp8-bfloat16]
 examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-mini-instruct-fp8-float16]
-examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-MoE-instruct-fp8-bfloat16]
+examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-MoE-instruct-fp8-bfloat16] TIMEOUT (60)
 examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16]
 examples/test_qwen.py::test_llm_qwen1_5_7b_single_gpu_lora[qwen1.5_7b_chat-Qwen1.5-7B-Chat-750Mb-lora]
 examples/test_qwen.py::test_llm_qwen1_5_moe_plugin_single_gpu_lora[qwen1.5_moe_a2.7b_chat-Upcycled-Qwen1.5-MoE2.7B-LoRA]
@@ -394,13 +394,13 @@ accuracy/test_cli_flow.py::TestMixtral8x7B::test_weight_only_int8_tp2
 accuracy/test_cli_flow.py::TestMixtral8x7B::test_pp_reduce_scatter_tp2pp2
 accuracy/test_cli_flow.py::TestMixtral8x7B::test_ootb_except_mha_tp8[expert_parallel]
 accuracy/test_cli_flow.py::TestMixtral8x7B::test_ootb_except_mha_tp8[mixed_parallel]
-accuracy/test_cli_flow.py::TestMixtral8x7B::test_ootb_except_mha_tp8[tensor_parallel]
+accuracy/test_cli_flow.py::TestMixtral8x7B::test_ootb_except_mha_tp8[tensor_parallel] TIMEOUT (40)
 accuracy/test_cli_flow.py::TestMixtral8x7B::test_plugin_tp8[no_renormalize-tensor_parallel]
 accuracy/test_cli_flow.py::TestMixtral8x7B::test_plugin_tp8[renormalize-expert_parallel]
 accuracy/test_cli_flow.py::TestMixtral8x7B::test_plugin_tp8[renormalize-mixed_parallel]
 accuracy/test_cli_flow.py::TestMixtral8x7B::test_plugin_tp8[renormalize-tensor_parallel]
 accuracy/test_cli_flow.py::TestMixtral8x7B::test_nvfp4_prequantized
-accuracy/test_cli_flow.py::TestMixtral8x22B::test_fp8_tp2pp2
+accuracy/test_cli_flow.py::TestMixtral8x22B::test_fp8_tp2pp2 TIMEOUT (50)
 accuracy/test_cli_flow.py::TestMixtral8x22B::test_int8_plugin_tp8[renormalize-tensor_parallel]
 accuracy/test_cli_flow.py::TestGemma2_9BIt::test_auto_dtype
 accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int8]
@@ -509,7 +509,7 @@ test_e2e.py::test_ptp_quickstart_advanced_8gpus[Nemotron-Ultra-253B-nemotron-nas
 test_e2e.py::test_ptp_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1]
 test_e2e.py::test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1]
 test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image]
-test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-video]
+test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-video] TIMEOUT (60)
 test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image]
 test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-image]
 test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-video]
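
The TIMEOUT (N) suffix added throughout this list is a per-test budget, apparently in minutes: the TIMEOUT (180) on the 1M-long-context test lines up with the 10800-second (180-minute) pytest.mark.timeout added in test_llama.py. A minimal parsing sketch for one line of the list, assuming this annotation format (the harness's real parser is not shown in this PR):

import re

# Hypothetical parser for a single examples_test_list.txt line.
_LINE = re.compile(r"^(?P<test>\S+)(?:\s+TIMEOUT\s+\((?P<minutes>\d+)\))?$")

def parse_test_line(line: str):
    """Return (test_id, timeout_in_seconds_or_None) for a test-list line."""
    m = _LINE.match(line.strip())
    if m is None:
        raise ValueError(f"unrecognized test-list line: {line!r}")
    minutes = m.group("minutes")
    return m.group("test"), int(minutes) * 60 if minutes else None

test_id, timeout_s = parse_test_line(
    "examples/test_llama.py::test_llm_llama_v3_1m_long_context_8gpus"
    "[Llama-3-8B-Instruct-Gradient-1048k] TIMEOUT (180)")
assert timeout_s == 10800  # 180 minutes, matching the pytest mark above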