@@ -454,174 +454,175 @@ async def replay_trace_by_time(
     if test_request is None:
         raise ValueError("No request found for initial test run.")
 
-    session = aiohttp.ClientSession(
+    async with aiohttp.ClientSession(
         trust_env=True,
         timeout=aiohttp.ClientTimeout(total=6 * 60 * 60),
-    )
-
-    test_input = RequestFuncInput(
-        model=model_id,
-        model_name=model_name,
-        prompt=test_request.prompt,
-        api_url=api_url,
-        prompt_len=test_request.prompt_len,
-        output_len=test_request.expected_output_len,
-        logprobs=None,
-        multi_modal_content=getattr(test_request, "multi_modal_data", None),
-        ignore_eos=True,
-        extra_body={"temperature": 0.9},
-    )
-
-    test_output = await request_func(request_func_input=test_input, session=session)
-
-    if not getattr(test_output, "success", False):
-        raise ValueError(
-            "Initial test run failed - Please make sure arguments "
-            f"are correctly specified. Error: {getattr(test_output, 'error', '')}"
-        )
-    else:
-        print("Initial test run completed. Starting main run...")
-
-    total = sum(len(req_list) for req_list in req_groups.values())
-    pbar = None if disable_tqdm else tqdm(total=total)
-    semaphore = (
-        asyncio.Semaphore(args.max_concurrency)
-        if getattr(args, "max_concurrency", None)
-        else None
-    )
-    start_time = time.perf_counter()
-    print(f"Start time is {start_time}")
-    tasks = []
-    flat_requests = []
-
-    async def _run_one_request(sample_req):
-        sampling_params = {"temperature": 0.9}
-        req_input = RequestFuncInput(
+    ) as session:
+        test_input = RequestFuncInput(
             model=model_id,
             model_name=model_name,
-            prompt=sample_req.prompt,
+            prompt=test_request.prompt,
             api_url=api_url,
-            prompt_len=sample_req.prompt_len,
-            output_len=sample_req.expected_output_len,
+            prompt_len=test_request.prompt_len,
+            output_len=test_request.expected_output_len,
             logprobs=None,
-            extra_body=sampling_params,
+            multi_modal_content=getattr(test_request, "multi_modal_data", None),
             ignore_eos=True,
+            extra_body={"temperature": 0.9},
+        )
+
+        test_output = await request_func(request_func_input=test_input, session=session)
+
+        if not getattr(test_output, "success", False):
+            raise ValueError(
+                "Initial test run failed - Please make sure arguments "
+                f"are correctly specified. Error: {getattr(test_output, 'error', '')}"
+            )
+        else:
+            print("Initial test run completed. Starting main run...")
+
+        total = sum(len(req_list) for req_list in req_groups.values())
+        pbar = None if disable_tqdm else tqdm(total=total)
+        semaphore = (
+            asyncio.Semaphore(args.max_concurrency)
+            if getattr(args, "max_concurrency", None)
+            else None
         )
-        if semaphore is not None:
-            async with semaphore:
+        start_time = time.perf_counter()
+        print(f"Start time is {start_time}")
+        tasks = []
+        flat_requests = []
+
+        async def _run_one_request(sample_req):
+            sampling_params = {"temperature": 0.9}
+            req_input = RequestFuncInput(
+                model=model_id,
+                model_name=model_name,
+                prompt=sample_req.prompt,
+                api_url=api_url,
+                prompt_len=sample_req.prompt_len,
+                output_len=sample_req.expected_output_len,
+                logprobs=None,
+                extra_body=sampling_params,
+                ignore_eos=True,
+            )
+            if semaphore is not None:
+                async with semaphore:
+                    return await request_func(
+                        request_func_input=req_input, session=session, pbar=pbar
+                    )
+            else:
                 return await request_func(
                     request_func_input=req_input, session=session, pbar=pbar
                 )
-        else:
-            return await request_func(
-                request_func_input=req_input, session=session, pbar=pbar
-            )
-
-    for sec, reqs in sorted(req_groups.items()):
-        delay = sec - (time.perf_counter() - start_time)
-        delay = max(0, delay)
 
-        async def send_group(r=reqs, d=delay):
-            await asyncio.sleep(d)
-            print(
-                f"Sending request at {time.perf_counter() - start_time:.3f}s with {len(r)} reqs"
-            )
-            group_tasks = [asyncio.create_task(_run_one_request(req)) for req in r]
-            try:
-                return await asyncio.gather(*group_tasks)
-            except asyncio.TimeoutError:
-                print(f"Request timed out: group at delay {d:.3f}s")
-                return []
-            except Exception as e:
-                print(f"Request failed: {e}")
-                return []
-
-        tasks.append(asyncio.create_task(send_group(reqs, delay)))
-        flat_requests.extend(reqs)
-
-    group_results = await asyncio.gather(*tasks)
-    outputs = []
-    for res in group_results:
-        if isinstance(res, list):
-            outputs.extend(res)
-
-    if pbar is not None:
-        pbar.close()
-
-    benchmark_duration = time.perf_counter() - start_time
-    metrics, actual_output_lens = calculate_metrics(
-        input_requests=flat_requests,
-        outputs=outputs,
-        dur_s=benchmark_duration,
-        tokenizer=tokenizer,
-        selected_percentiles=[25.0, 50.0, 75.0, 99.0],
-        goodput_config_dict={"ttft": 2000, "tpot": 50},
-    )
+        for sec, reqs in sorted(req_groups.items()):
+            delay = sec - (time.perf_counter() - start_time)
+            delay = max(0, delay)
 
-    print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
-    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
-    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
-    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
-    print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
-    print(
-        "{:<40} {:<10.2f}".format(
-            "Request throughput (req/s):", metrics.request_throughput
-        )
-    )
-    print(
-        "{:<40} {:<10.2f}".format(
-            "Output token throughput (tok/s):", metrics.output_throughput
-        )
-    )
-    print(
-        "{:<40} {:<10.2f}".format(
-            "Total Token throughput (tok/s):", metrics.total_token_throughput
+            async def send_group(r=reqs, d=delay):
+                await asyncio.sleep(d)
+                print(
+                    f"Sending request at {time.perf_counter() - start_time:.3f}s with {len(r)} reqs"
+                )
+                group_tasks = [asyncio.create_task(_run_one_request(req)) for req in r]
+                try:
+                    return await asyncio.gather(*group_tasks)
+                except asyncio.TimeoutError:
+                    print(f"Request timed out: group at delay {d:.3f}s")
+                    return []
+                except Exception as e:
+                    print(f"Request failed: {e}")
+                    return []
+
+            tasks.append(asyncio.create_task(send_group(reqs, delay)))
+            flat_requests.extend(reqs)
+
+        group_results = await asyncio.gather(*tasks)
+        outputs = []
+        for res in group_results:
+            if isinstance(res, list):
+                outputs.extend(res)
+
+        if pbar is not None:
+            pbar.close()
+
+        benchmark_duration = time.perf_counter() - start_time
+        metrics, actual_output_lens = calculate_metrics(
+            input_requests=flat_requests,
+            outputs=outputs,
+            dur_s=benchmark_duration,
+            tokenizer=tokenizer,
+            selected_percentiles=[25.0, 50.0, 75.0, 99.0],
+            goodput_config_dict={"ttft": 2000, "tpot": 50},
         )
-    )
-    )
 
-    # Define the process_one_metric function, which can access the outer scope's selected_percentile_metrics
-    def process_one_metric(
-        metric_attribute_name: str,
-        metric_name: str,
-        metric_header: str,
-    ):
-        selected_percentile_metrics = ["ttft", "tpot", "itl", "e2el"]
-        if metric_attribute_name not in selected_percentile_metrics:
-            return
-        print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
+        print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
+        print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+        print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
+        print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+        print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
         print(
             "{:<40} {:<10.2f}".format(
-                f"Mean {metric_name} (ms):",
-                getattr(metrics, f"mean_{metric_attribute_name}_ms"),
+                "Request throughput (req/s):", metrics.request_throughput
             )
         )
         print(
             "{:<40} {:<10.2f}".format(
-                f"Median {metric_name} (ms):",
-                getattr(metrics, f"median_{metric_attribute_name}_ms"),
+                "Output token throughput (tok/s):", metrics.output_throughput
             )
         )
-        # standard deviation
         print(
             "{:<40} {:<10.2f}".format(
-                f"Std {metric_name} (ms):",
-                getattr(metrics, f"std_{metric_attribute_name}_ms"),
+                "Total Token throughput (tok/s):", metrics.total_token_throughput
             )
         )
-        for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
-            p_word = str(int(p)) if int(p) == p else str(p)
-            print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
-
-    process_one_metric("ttft", "TTFT", "Time to First Token")
-    process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
-    process_one_metric("itl", "ITL", "Inter-token Latency")
-    process_one_metric("e2el", "E2EL", "End-to-end Latency")
-    print("=" * 50)
-
-    output_dir = args.result_dir if args.result_dir is not None else "./"
-    if args.save_result:
-        save_metrics_to_file(metrics=metrics, output_dir=output_dir)
-        save_req_results_to_file(outputs=outputs, output_dir=output_dir)
+
+        # Define the process_one_metric function, which can access the outer scope's selected_percentile_metrics
+        def process_one_metric(
+            metric_attribute_name: str,
+            metric_name: str,
+            metric_header: str,
+        ):
+            selected_percentile_metrics = ["ttft", "tpot", "itl", "e2el"]
+            if metric_attribute_name not in selected_percentile_metrics:
+                return
+            print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
+            print(
+                "{:<40} {:<10.2f}".format(
+                    f"Mean {metric_name} (ms):",
+                    getattr(metrics, f"mean_{metric_attribute_name}_ms"),
+                )
+            )
+            print(
+                "{:<40} {:<10.2f}".format(
+                    f"Median {metric_name} (ms):",
+                    getattr(metrics, f"median_{metric_attribute_name}_ms"),
+                )
+            )
+            # standard deviation
+            print(
+                "{:<40} {:<10.2f}".format(
+                    f"Std {metric_name} (ms):",
+                    getattr(metrics, f"std_{metric_attribute_name}_ms"),
+                )
+            )
+            for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
+                p_word = str(int(p)) if int(p) == p else str(p)
+                print(
+                    "{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value)
+                )
+
+        process_one_metric("ttft", "TTFT", "Time to First Token")
+        process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
+        process_one_metric("itl", "ITL", "Inter-token Latency")
+        process_one_metric("e2el", "E2EL", "End-to-end Latency")
+        print("=" * 50)
+
+        output_dir = args.result_dir if args.result_dir is not None else "./"
+        if args.save_result:
+            save_metrics_to_file(metrics=metrics, output_dir=output_dir)
+            save_req_results_to_file(outputs=outputs, output_dir=output_dir)
     return
 
 
@@ -678,6 +679,14 @@ def main(args: argparse.Namespace):
 
 
 if __name__ == "__main__":
+    # Check openpyxl for Excel export
+    try:
+        import openpyxl
+    except ImportError:
+        print("\nMissing package: openpyxl")
+        print("Please install openpyxl via pip install.\n")
+        sys.exit(1)
+
     parser = create_argument_trace()
     args = parser.parse_args()
     main(args)
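Note on the main change above: the commit replaces a manually constructed aiohttp.ClientSession with an async with block, so the session is closed even if the initial test run or the benchmark raises. Below is a minimal, self-contained sketch of that pattern; the localhost URL and the /health endpoint are placeholders for illustration only and are not part of the benchmark script.

import asyncio

import aiohttp


async def main() -> None:
    # The session is closed automatically when the block exits,
    # including when an exception propagates out of the body.
    async with aiohttp.ClientSession(
        trust_env=True,
        timeout=aiohttp.ClientTimeout(total=6 * 60 * 60),
    ) as session:
        # Placeholder request; the real script sends RequestFuncInput
        # payloads through request_func instead.
        async with session.get("http://localhost:8000/health") as resp:
            print(resp.status)


if __name__ == "__main__":
    asyncio.run(main())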