tests: [TRTQA-2905] improve timeout report for qa test cases #4753

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged: 7 commits, Jun 3, 2025

7 changes: 4 additions & 3 deletions tests/integration/defs/examples/test_llama.py
@@ -30,9 +30,9 @@
                          venv_mpi_check_call)
 # yapf: disable
 from defs.conftest import (get_device_count, get_device_memory,
-                           get_host_total_memory, skip_fp8_pre_ada,
-                           skip_no_nvls, skip_post_blackwell, skip_pre_ada,
-                           skip_pre_blackwell)
+                           get_host_total_memory, get_sm_version,
+                           skip_fp8_pre_ada, skip_no_nvls, skip_post_blackwell,
+                           skip_pre_ada, skip_pre_blackwell)
 # yapf: enable
 from defs.trt_test_alternative import check_call, exists

@@ -3022,6 +3022,7 @@ def test_llm_llama_v3_8b_1048k_long_context_ppl(llama_example_root,
     'Llama-3-8B-Instruct-Gradient-1048k', 'Llama-3-70B-Instruct-Gradient-1048k'
 ],
                          indirect=True)
+@pytest.mark.timeout(10800 if get_sm_version() < 89 else 3600)
 def test_llm_llama_v3_1m_long_context_8gpus(llama_example_root,
                                             llama_model_root, llm_venv,
                                             engine_dir, cmodel_dir):
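
Note on the mark above: the timeout expression is evaluated when pytest imports the module at collection time, so get_sm_version() fixes the budget per machine before the test runs. A minimal standalone sketch of the selection logic (the architecture examples in the comments are assumptions, not from the diff):

# Sketch of the conditional timeout above; values are in seconds.
def pick_timeout(sm_version: int) -> int:
    # SM < 89 is pre-Ada hardware, which presumably runs this 1M-context
    # test much more slowly, so it gets 3 hours instead of 1.
    return 10800 if sm_version < 89 else 3600

assert pick_timeout(80) == 10800  # e.g. A100 (SM 80): 3-hour budget
assert pick_timeout(90) == 3600   # e.g. H100 (SM 90): 1-hour budget
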
20 changes: 19 additions & 1 deletion tests/integration/defs/examples/test_mistral.py
@@ -13,22 +13,40 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Module test_mistral test mistral examples."""
+import multiprocessing
 import platform
 
+import psutil
 import pytest
 from defs.common import (convert_weights, quantize_data,
                          test_multi_lora_support, venv_check_call)
 from defs.conftest import skip_pre_ada
 from defs.trt_test_alternative import check_call
 
 
+def get_optimal_jobs():
+    cpu_count = multiprocessing.cpu_count()
+    available_memory = psutil.virtual_memory().available / (1024 * 1024 * 1024)
+    memory_per_job = 4
+    memory_based_jobs = int(available_memory / memory_per_job)
+    system_load = psutil.getloadavg()[0] / cpu_count
+    if system_load > 0.7:
+        cpu_factor = 0.5
+    else:
+        cpu_factor = 0.75
+    cpu_based_jobs = max(1, int(cpu_count * cpu_factor))
+    optimal_jobs = max(1, min(cpu_based_jobs, memory_based_jobs))
+    return optimal_jobs
+
+
 @pytest.fixture(autouse=True, scope="module")
 def mistral_example_root(llm_venv):
     if platform.system() != "Windows":
         # https://github.com/Dao-AILab/flash-attention/issues/345
         # No wheel for flash-attn on windows and compilation fails locally.
+        max_jobs = get_optimal_jobs()
         install_cmd = [
-            "MAX_JOBS=4",
+            f"MAX_JOBS={max_jobs}",
             "python3",
             "-m",
             "pip",
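
The get_optimal_jobs() helper above replaces the hard-coded MAX_JOBS=4 for the flash-attn source build: it takes the smaller of a CPU bound (75% of cores, dropping to 50% when the per-core 1-minute load average exceeds 0.7) and a memory bound (roughly 4 GiB of available RAM per compile job), never going below one job. A worked example with illustrative machine sizes (the numbers are assumptions, not from the PR):

# Standalone restatement of the heuristic with sample inputs.
def optimal_jobs(cpu_count: int, available_gib: float, load_avg_1m: float) -> int:
    memory_based = int(available_gib / 4)  # budget ~4 GiB of RAM per job
    cpu_factor = 0.5 if load_avg_1m / cpu_count > 0.7 else 0.75
    cpu_based = max(1, int(cpu_count * cpu_factor))
    return max(1, min(cpu_based, memory_based))

print(optimal_jobs(64, 128, 8.0))   # idle 64-core box: min(48, 32) -> 32
print(optimal_jobs(64, 128, 50.0))  # loaded 64-core box: min(32, 32) -> 32
print(optimal_jobs(16, 8, 2.0))     # small box: min(12, 2) -> 2
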
29 changes: 27 additions & 2 deletions tests/integration/defs/trt_test_alternative.py
@@ -203,11 +203,12 @@ def call(*popenargs,
          **kwargs):
     if not suppress_output_info:
         print(f"Start subprocess with call({popenargs}, {kwargs})")
+    actual_timeout = get_pytest_timeout(timeout)
     with popen(*popenargs,
                start_new_session=start_new_session,
                suppress_output_info=True,
                **kwargs) as p:
-        return p.wait(timeout=timeout)
+        return p.wait(timeout=actual_timeout)
 
 
 def check_call(*popenargs, **kwargs):
@@ -223,12 +224,13 @@ def check_call(*popenargs, **kwargs):
 
 def check_output(*popenargs, timeout=None, start_new_session=True, **kwargs):
     print(f"Start subprocess with check_output({popenargs}, {kwargs})")
+    actual_timeout = get_pytest_timeout(timeout)
     with Popen(*popenargs,
                stdout=subprocess.PIPE,
                start_new_session=start_new_session,
                **kwargs) as process:
         try:
-            stdout, stderr = process.communicate(None, timeout=timeout)
+            stdout, stderr = process.communicate(None, timeout=actual_timeout)
         except subprocess.TimeoutExpired as exc:
             cleanup_process_tree(process, start_new_session)
             if is_windows():
@@ -303,3 +305,26 @@ def check_call_negative_test(*popenargs, **kwargs):
         f"Subprocess expected to fail with check_call_negative_test({popenargs}, {kwargs}), but passed."
     )
     raise subprocess.CalledProcessError(1, cmd)
+
+
+def get_pytest_timeout(timeout=None):
+    try:
+        import pytest
+        marks = None
+        try:
+            current_item = pytest.current_test
+            if hasattr(current_item, 'iter_markers'):
+                marks = list(current_item.iter_markers('timeout'))
+        except (AttributeError, NameError):
+            pass
+
+        if marks and len(marks) > 0:
+            timeout_mark = marks[0]
+            timeout_pytest = timeout_mark.args[0] if timeout_mark.args else None
+            if timeout_pytest and isinstance(timeout_pytest, (int, float)):
+                return max(30, int(timeout_pytest * 0.9))
+
+    except (ImportError, Exception) as e:
+        print(f"Error getting pytest timeout: {e}")
+
+    return timeout
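
get_pytest_timeout() is the core of this change: it caps a subprocess timeout at 90% of the test's @pytest.mark.timeout value (with a 30-second floor), so the subprocess raises TimeoutExpired and gets cleaned up and reported before the pytest-timeout plugin hard-kills the whole test. Under a 3600-second mark, for example, get_pytest_timeout(None) returns 3240, leaving about six minutes for teardown and a readable failure report. Note that pytest.current_test is not a built-in pytest attribute; the sketch below shows one way a conftest.py hook could publish it, as an assumption about how the harness wires this up rather than something shown in this diff:

import pytest

# Hypothetical conftest.py wiring: publish the running test item so that
# helpers outside the test body (like get_pytest_timeout) can read its marks.
@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_protocol(item, nextitem):
    pytest.current_test = item   # item.iter_markers('timeout') now resolves
    yield                        # run setup, call, and teardown as usual
    pytest.current_test = None   # do not leak the item across tests
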
28 changes: 14 additions & 14 deletions tests/integration/test_lists/qa/examples_test_list.txt
@@ -14,12 +14,12 @@ examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-chat-ena
 examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-enable_weight_only]
 examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[disable_weight_only]
 examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[enable_weight_only]
-examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[disable_weight_only]
-examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[enable_weight_only]
+examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[disable_weight_only] TIMEOUT (40)
+examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[enable_weight_only] TIMEOUT (40)
 examples/test_eagle.py::test_llm_eagle_1gpu_modelopt_ckpt[llama3.1-eagle-8b-hf_v0.5-float16-bs8]
 examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle1]
 examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle2]
-examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-bart-large-cnn-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8]
+examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-bart-large-cnn-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] TIMEOUT (60)
 examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-byt5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8]
 examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-disable_fp8]
 examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:2-nb:1-enable_fp8]
@@ -70,7 +70,7 @@ examples/test_gpt.py::test_starcoder_fp8_quantization_2gpu[starcoder]
 examples/test_gpt.py::test_starcoder_fp8_quantization_2gpu[starcoderplus]
 examples/test_gpt.py::test_starcoder_fp8_quantization_2gpu[starcoder2]
 examples/test_llama.py::test_mistral_nemo_fp8_with_bf16_lora[Mistral-Nemo-12b-Base]
-examples/test_mistral.py::test_mistral_nemo_minitron_fp8_with_bf16_lora[Mistral-NeMo-Minitron-8B-Instruct]
+examples/test_mistral.py::test_mistral_nemo_minitron_fp8_with_bf16_lora[Mistral-NeMo-Minitron-8B-Instruct] TIMEOUT (40)
 examples/test_phi.py::test_phi_fp8_with_bf16_lora[phi-2]
 examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-mini-128k-instruct]
 examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-small-128k-instruct]
@@ -110,9 +110,9 @@ examples/test_llama.py::test_llm_llama_code_llama_1gpu_summary[CodeLlama-7b-Inst
 examples/test_llama.py::test_llm_llama_code_llama_1gpu_summary[CodeLlama-7b-Instruct-enable_with_fp32_acc-enable_gemm_plugin-enable_attention_plugin-nb:1]
 examples/test_llama.py::test_llm_llama_code_llama_multi_gpus_summary[CodeLlama-34b-Instruct-tp4pp1-nb:4]
 examples/test_llama.py::test_llm_llama_code_llama_multi_gpus_summary[CodeLlama-70b-hf-tp2pp2-nb:1]
-examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-34b-Instruct-tp2pp2-int4_awq-nb:4]
+examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-34b-Instruct-tp2pp2-int4_awq-nb:4] TIMEOUT (40)
 examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-34b-Instruct-tp4pp1-fp8-nb:1]
-examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-70b-hf-tp2pp2-int4_awq-nb:1]
+examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-70b-hf-tp2pp2-int4_awq-nb:1] TIMEOUT (40)
 examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-70b-hf-tp4pp1-fp8-nb:4]
 examples/test_llama.py::test_codellama_fp8_with_bf16_lora[CodeLlama-7b-Instruct]
 examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-v2-7b-hf]
@@ -135,11 +135,11 @@ examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-lla
 examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_int8_wo]
 examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_sq_ootb]
 examples/test_llama.py::test_llm_llama_v2_lora_benchmark_2gpu[chinese_lora-llama-v2-13b-hf]
-examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-405b-enable_fp8]
-examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-405b-fp8-disable_fp8]
-examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-70b-disable_fp8]
+examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-405b-enable_fp8] TIMEOUT (120)
+examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-405b-fp8-disable_fp8] TIMEOUT (90)
+examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-70b-disable_fp8] TIMEOUT (40)
 examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[disable_gemm_allreduce_plugin-llama-3.1-70b-enable_fp8]
-examples/test_llama.py::test_llm_llama_v3_1m_long_context_8gpus[Llama-3-8B-Instruct-Gradient-1048k]
+examples/test_llama.py::test_llm_llama_v3_1m_long_context_8gpus[Llama-3-8B-Instruct-Gradient-1048k] TIMEOUT (180)
 examples/test_llama.py::test_llm_llama_v3_dora_1gpu[commonsense-llama-v3-8b-dora-r32-llama-v3-8b-hf-base_fp16]
 examples/test_llama.py::test_llm_llama_1gpu_fp4[llama-3.1-70b-instruct-enable_norm_quant_fusion-enable_fused_quant-fp4_plugin-bfloat16]
 examples/test_llama.py::test_llm_llama_2gpu_fp4[llama-3.1-70b-instruct-fp4_plugin]
@@ -216,7 +216,7 @@ examples/test_phi.py::test_llm_phi_quantization_1gpu[phi-2-fp8-bfloat16]
 examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-mini-128k-instruct-fp8-float16]
 examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-small-128k-instruct-fp8-bfloat16]
 examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-mini-instruct-fp8-float16]
-examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-MoE-instruct-fp8-bfloat16]
+examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-MoE-instruct-fp8-bfloat16] TIMEOUT (60)
 examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16]
 examples/test_qwen.py::test_llm_qwen1_5_7b_single_gpu_lora[qwen1.5_7b_chat-Qwen1.5-7B-Chat-750Mb-lora]
 examples/test_qwen.py::test_llm_qwen1_5_moe_plugin_single_gpu_lora[qwen1.5_moe_a2.7b_chat-Upcycled-Qwen1.5-MoE2.7B-LoRA]
@@ -394,13 +394,13 @@ accuracy/test_cli_flow.py::TestMixtral8x7B::test_weight_only_int8_tp2
 accuracy/test_cli_flow.py::TestMixtral8x7B::test_pp_reduce_scatter_tp2pp2
 accuracy/test_cli_flow.py::TestMixtral8x7B::test_ootb_except_mha_tp8[expert_parallel]
 accuracy/test_cli_flow.py::TestMixtral8x7B::test_ootb_except_mha_tp8[mixed_parallel]
-accuracy/test_cli_flow.py::TestMixtral8x7B::test_ootb_except_mha_tp8[tensor_parallel]
+accuracy/test_cli_flow.py::TestMixtral8x7B::test_ootb_except_mha_tp8[tensor_parallel] TIMEOUT (40)
 accuracy/test_cli_flow.py::TestMixtral8x7B::test_plugin_tp8[no_renormalize-tensor_parallel]
 accuracy/test_cli_flow.py::TestMixtral8x7B::test_plugin_tp8[renormalize-expert_parallel]
 accuracy/test_cli_flow.py::TestMixtral8x7B::test_plugin_tp8[renormalize-mixed_parallel]
 accuracy/test_cli_flow.py::TestMixtral8x7B::test_plugin_tp8[renormalize-tensor_parallel]
 accuracy/test_cli_flow.py::TestMixtral8x7B::test_nvfp4_prequantized
-accuracy/test_cli_flow.py::TestMixtral8x22B::test_fp8_tp2pp2
+accuracy/test_cli_flow.py::TestMixtral8x22B::test_fp8_tp2pp2 TIMEOUT (50)
 accuracy/test_cli_flow.py::TestMixtral8x22B::test_int8_plugin_tp8[renormalize-tensor_parallel]
 accuracy/test_cli_flow.py::TestGemma2_9BIt::test_auto_dtype
 accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int8]
@@ -509,7 +509,7 @@ test_e2e.py::test_ptp_quickstart_advanced_8gpus[Nemotron-Ultra-253B-nemotron-nas
 test_e2e.py::test_ptp_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1]
 test_e2e.py::test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1]
 test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image]
-test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-video]
+test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-video] TIMEOUT (60)
 test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image]
 test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-image]
 test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-video]
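
The TIMEOUT (N) suffix added throughout this list is a per-test budget, apparently in minutes: the TIMEOUT (180) on the 1M-long-context test lines up with the 10800-second (180-minute) pytest.mark.timeout added in test_llama.py. A minimal parsing sketch for one line of the list, assuming this annotation format (the harness's real parser is not shown in this PR):

import re

# Hypothetical parser for a single examples_test_list.txt line.
_LINE = re.compile(r"^(?P<test>\S+)(?:\s+TIMEOUT\s+\((?P<minutes>\d+)\))?$")

def parse_test_line(line: str):
    """Return (test_id, timeout_in_seconds_or_None) for a test-list line."""
    m = _LINE.match(line.strip())
    if m is None:
        raise ValueError(f"unrecognized test-list line: {line!r}")
    minutes = m.group("minutes")
    return m.group("test"), int(minutes) * 60 if minutes else None

test_id, timeout_s = parse_test_line(
    "examples/test_llama.py::test_llm_llama_v3_1m_long_context_8gpus"
    "[Llama-3-8B-Instruct-Gradient-1048k] TIMEOUT (180)")
assert timeout_s == 10800  # 180 minutes, matching the pytest mark above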