
Commit 65b7c1a

[BE] Add testing for output saving logic (#153)
1 parent f9e7911 commit 65b7c1a

File tree

3 files changed: +429 −90 lines changed


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -15,3 +15,4 @@ generated_kernels/
 *.csv
 backendbench_output*
 .DS_Store
+*.bak

BackendBench/output.py

Lines changed: 151 additions & 90 deletions
@@ -7,6 +7,7 @@
 import csv
 import json
 import logging
+import os
 from collections import defaultdict
 from dataclasses import asdict
 from pathlib import Path
@@ -19,38 +20,15 @@
 logger = logging.getLogger(__name__)
 
 
-def save_results(
+def _prepare_results_data(
     correctness_results: List[CorrectnessTestResult],
     performance_results: List[PerformanceTestResult],
-    output_path: Union[str, Path] = "backendbench_output",
-    command: str = None,
-    mean_correctness: float = None,
-    geomean_perf: float = None,
-    perf_at_p_score: float = None,
-    p: float = 1.0,
-):
-    """Save results without creating per-operator directories.
-
-    Args:
-        correctness_results: List of correctness test results
-        performance_results: List of performance test results
-        output_path: Base directory for saving results
-        command: Command used to run the benchmark
-        mean_correctness: Mean correctness score
-        geomean_perf: Geometric mean of performance scores
-        perf_at_p_score: Performance at threshold p score
-        p: The threshold value used for perf@p calculation
+) -> Tuple[List[dict], List[dict], dict]:
+    """Prepare and process results data without file I/O.
 
-    Structure created:
-        output_path/
-        ├── OVERALL_SUMMARY.md    # Top level summary of results
-        ├── full_results.json     # Complete results log
-        ├── operator_summary.csv  # Operator-level summary
-        └── failed_ops.json       # Log of failed operations
+    Returns:
+        Tuple of (all_results, failed_tests, op_summaries)
     """
-    base_dir = Path(output_path)
-    base_dir.mkdir(parents=True, exist_ok=True)
-
     # Prep work: save all results as a list of dicts
     all_results = [asdict(result) for result in correctness_results] + [
         asdict(result) for result in performance_results
@@ -59,16 +37,7 @@ def save_results(
     # sort by op_name, then args
     all_results.sort(key=lambda x: (x["op_name"], x["args"]))
 
-    # 1. Save the full log in the base directory
-    full_log_path = base_dir / "full_results.json"
-    failed_ops_path = base_dir / "failed_ops.json"
-    summary_csv_path = base_dir / "operator_summary.csv"
-
-    with open(full_log_path, "w") as f:
-        json.dump(all_results, f, indent=2)
-    logger.info(f"Full results saved to {full_log_path}")
-
-    # 2. Organize results by operator for csv
+    # Organize results by operator for csv
     op_all_results = defaultdict(list)
     op_summaries = {}
 
@@ -87,8 +56,10 @@ def save_results(
         ]
 
         # Calculate operator-level summary
-        total_tests = len(op_tests)
-        correct_tests = sum(1 for result in op_correctness_results if result.is_correct)
+        correct_correctness_tests = sum(1 for result in op_correctness_results if result.is_correct)
+        passed_performance_tests = sum(
+            1 for result in op_performance_results if result.successfully_ran
+        )
         # Collect performance metrics
         speedups = []
         abs_errors = []
@@ -105,25 +76,90 @@ def save_results(
             speedups.append(float(test.speedup))
 
         # Calculate summary statistics
-        correctness_rate = correct_tests / total_tests if total_tests > 0 else 0.0
+        correctness_rate = (
+            correct_correctness_tests / len(op_correctness_results)
+            if len(op_correctness_results) > 0
+            else 0.0
+        )
         avg_speedup = sum(speedups) / len(speedups) if speedups else 0.0
         geomean_speedup = torch.tensor(speedups).log().mean().exp().item() if speedups else 0.0
         max_abs_error = max(abs_errors) if abs_errors else 0.0
         max_rel_error = max(rel_errors) if rel_errors else 0.0
 
         op_summaries[op_name] = {
             "operator": op_name,
-            "total_tests": total_tests,
-            "passed_tests": correct_tests,
-            "failed_tests": total_tests - correct_tests,
+            "total_tests": len(op_all_results),
+            "correctness_tests": len(op_correctness_results),
+            "performance_tests": len(op_performance_results),
+            "passed_correctness_tests": correct_correctness_tests,
+            "passed_performance_tests": passed_performance_tests,
+            "failed_correctness_tests": len(op_correctness_results) - correct_correctness_tests,
+            "failed_performance_tests": len(op_performance_results) - passed_performance_tests,
             "correctness_rate": correctness_rate,
             "avg_speedup": avg_speedup,
             "geomean_speedup": geomean_speedup,
             "max_absolute_error": max_abs_error,
             "max_relative_error": max_rel_error,
         }
 
-    # 3. Create operator-level summary CSV
+    # Prepare failed operations log
+    failed_tests = [asdict(result) for result in correctness_results if not result.is_correct] + [
+        asdict(result) for result in performance_results if not result.successfully_ran
+    ]
+
+    # sort failed_tests
+    failed_tests.sort(key=lambda x: (x["op_name"], x["args"]))
+
+    return all_results, failed_tests, op_summaries
+
+
+def save_results(
+    correctness_results: List[CorrectnessTestResult],
+    performance_results: List[PerformanceTestResult],
+    output_path: str,
+    command: str,
+    mean_correctness: float,
+    geomean_perf: float,
+    perf_at_p_score: float,
+    p: float = 1.0,
+) -> Tuple[List[dict], List[dict], dict]:
+    """Prepare and process results data without file I/O.
+
+    Args:
+        correctness_results: List of correctness test results
+        performance_results: List of performance test results
+        output_path: Base directory for saving results
+        command: Command used to run the benchmark
+        mean_correctness: Mean correctness score
+        geomean_perf: Geometric mean of performance scores
+        perf_at_p_score: Performance at threshold p score
+        p: The threshold value used for perf@p calculation
+
+    Structure created:
+        output_path/
+        ├── OVERALL_SUMMARY.md    # Top level summary of results
+        ├── full_results.json     # Complete results log
+        ├── operator_summary.csv  # Operator-level summary
+        └── failed_tests.json     # Log of failed operations
+    """
+    base_dir = Path(output_path)
+    base_dir.mkdir(parents=True, exist_ok=True)
+
+    # Process data using the extracted function
+    all_results, failed_tests, op_summaries = _prepare_results_data(
+        correctness_results, performance_results
+    )
+
+    # 1. Save the full log in the base directory
+    full_log_path = os.path.join(base_dir, "full_results.json")
+    failed_tests_path = os.path.join(base_dir, "failed_tests.json")
+    summary_csv_path = os.path.join(base_dir, "operator_summary.csv")
+
+    with open(full_log_path, "w") as f:
+        json.dump(all_results, f, indent=2)
+    logger.info(f"Full results saved to {full_log_path}")
+
+    # 2. Create operator-level summary CSV
     if len(op_summaries) > 0:
         op_summary_list = list(op_summaries.values())
         fieldnames = list(op_summary_list[0].keys())
@@ -136,16 +172,10 @@ def save_results(
 
         logger.info(f"Operator summary CSV saved to {summary_csv_path}")
 
-    # 4. Save failed operations log
-    failed_tests = [asdict(result) for result in correctness_results if not result.is_correct] + [
-        asdict(result) for result in performance_results if not result.successfully_ran
-    ]
-    # sort failed_tests
-    failed_tests.sort(key=lambda x: (x["op_name"], x["args"]))
-
-    with open(failed_ops_path, "w") as f:
+    # 3. Save failed operations log
+    with open(failed_tests_path, "w") as f:
         json.dump(failed_tests, f, indent=2)
-    logger.info(f"Failed operations log saved to {failed_ops_path}")
+    logger.info(f"Failed operations log saved to {failed_tests_path}")
 
     # Save overall_summary if metrics are provided
     if all(x is not None for x in [command, mean_correctness, geomean_perf, perf_at_p_score]):
@@ -203,6 +233,61 @@ def _get_summary_op_results(
     return op_results
 
 
+def _generate_overall_summary_content(
+    command: str,
+    mean_correctness: float,
+    geomean_perf: float,
+    perf_at_p_score: float,
+    p: float = 1.0,
+    performance_results: List[PerformanceTestResult] = None,
+    correctness_results: List[CorrectnessTestResult] = None,
+) -> str:
+    """Generate the content for the overall summary markdown file.
+
+    Returns:
+        The markdown content as a string.
+    """
+    op_results = _get_summary_op_results(performance_results, correctness_results)
+
+    content = []
+    content.append("# BackendBench Run Summary\n")
+
+    content.append("## Command")
+    content.append("```bash")
+    content.append(f"{command}")
+    content.append("```\n")
+
+    content.append("## Results\n")
+    content.append("| Metric | Value |")
+    content.append("|--------|-------|")
+    content.append(f"| Correctness Score | {mean_correctness:.2f} |")
+    content.append(f"| Performance Score (geomean speedup) | {geomean_perf:.2f} |")
+    content.append(f"| Perf@{p} Score | {perf_at_p_score:.2f} |")
+    content.append("")
+
+    content.append("### Metric Descriptions\n")
+    content.append("- **Correctness Score**: Mean pass rate over all operators")
+    content.append("- **Performance Score**: Geometric mean speedup over all operators")
+    content.append(f"- **Perf@{p} Score**: Rate of correct samples with a speedup greater than {p}")
+    content.append("")
+
+    content.append("## Output Files\n")
+    content.append("The following files are saved in this directory:\n")
+    content.append("- `full_results.json`: Complete test results for all operators")
+    content.append("- `operator_summary.csv`: Operator-level summary statistics")
+    content.append("- `failed_tests.json`: Log of failed tests (if any)")
+    content.append("- `OVERALL_SUMMARY.md`: This file")
+
+    content.append("### Operator Speedups vs Eager in Descending Order\n")
+    content.append("| Operator | Correctness Ratio | Speedup vs Eager |")
+    content.append("|----------|-----------|----------------|")
+    for op, correctness, speedup in op_results:
+        content.append(f"| {op} | {correctness} | {speedup}|")
+    content.append("")
+
+    return "\n".join(content)
+
+
 def save_overall_summary(
     output_path: Union[str, Path],
     command: str,
@@ -226,43 +311,19 @@ def save_overall_summary(
     base_dir = Path(output_path)
     base_dir.mkdir(parents=True, exist_ok=True)
 
-    overall_summary_path = base_dir / "OVERALL_SUMMARY.md"
-    op_results = _get_summary_op_results(performance_results, correctness_results)
+    overall_summary_path = os.path.join(base_dir, "OVERALL_SUMMARY.md")
+
+    content = _generate_overall_summary_content(
+        command,
+        mean_correctness,
+        geomean_perf,
+        perf_at_p_score,
+        p,
+        performance_results,
+        correctness_results,
+    )
 
     with open(overall_summary_path, "w") as f:
-        f.write("# BackendBench Run Summary\n\n")
-
-        f.write("## Command\n")
-        f.write("```bash\n")
-        f.write(f"{command}\n")
-        f.write("```\n\n")
-
-        f.write("## Results\n\n")
-        f.write("| Metric | Value |\n")
-        f.write("|--------|-------|\n")
-        f.write(f"| Correctness Score | {mean_correctness:.2f} |\n")
-        f.write(f"| Performance Score (geomean speedup) | {geomean_perf:.2f} |\n")
-        f.write(f"| Perf@{p} Score | {perf_at_p_score:.2f} |\n")
-        f.write("\n")
-
-        f.write("### Metric Descriptions\n\n")
-        f.write("- **Correctness Score**: Mean pass rate over all operators\n")
-        f.write("- **Performance Score**: Geometric mean speedup over all operators\n")
-        f.write(f"- **Perf@{p} Score**: Rate of correct samples with a speedup greater than {p}\n")
-        f.write("\n")
-
-        f.write("## Output Files\n\n")
-        f.write("The following files are saved in this directory:\n\n")
-        f.write("- `full_results.json`: Complete test results for all operators\n")
-        f.write("- `operator_summary.csv`: Operator-level summary statistics\n")
-        f.write("- `failed_ops.json`: Log of failed operations (if any)\n")
-        f.write("- `OVERALL_SUMMARY.md`: This file\n")
-
-        f.write("### Operator Speedups vs Eager in Descending Order\n\n")
-        f.write("| Operator | Correctness Ratio | Speedup vs Eager |\n")
-        f.write("|----------|-----------|----------------|\n")
-        for op, correctness, speedup in op_results:
-            f.write(f"| {op} | {correctness} | {speedup}|\n")
-        f.write("\n")
+        f.write(content)
 
     logger.info(f"Overall summary saved to {overall_summary_path}")
