@@ -99,7 +99,12 @@ def run_iterations(
     failures: dict[str, str],
     run_trace: TracingType = TracingType.NONE,
     force_trace: bool = False,
-):
+) -> bool:
+    """
+    Returns True if all iterations completed successfully, False otherwise.
+    If options.exit_on_failure is set, an exception is raised instead.
+    """
+
     for iter in range(iters):
         log.info(f"running {benchmark.name()}, iteration {iter}... ")
         try:
@@ -111,7 +116,7 @@ def run_iterations(
                     raise RuntimeError(f"Benchmark produced no results!")
                 else:
                     failures[benchmark.name()] = "benchmark produced no results!"
-                    break
+                    return False
 
             for bench_result in bench_results:
                 log.info(
@@ -132,10 +137,15 @@ def run_iterations(
                     f"Benchmark failed: {failure_label} verification failed: {str(e)}"
                 )
             else:
-                failures[failure_label] = f"verification failed: {str(e)}"
-                log.error(f"complete ({failure_label}: verification failed: {str(e)}).")
+                failures[failure_label] = (
+                    f"{failure_label}: verification failed: {str(e)}"
+                )
+                log.error(f"{failure_label}: verification failed: {str(e)}.")
             continue
 
+    # Iterations completed successfully
+    return True
+
 
 # https://www.statology.org/modified-z-score/
 def modified_z_score(values: list[float]) -> list[float]:
@@ -341,6 +351,7 @@ def main(directory, additional_env_vars, compare_names, filter, execution_stats)
         merged_env_vars = {**additional_env_vars}
         intermediate_results: dict[str, list[Result]] = {}
         processed: list[Result] = []
+        iterations_rc = False
 
         # Determine if we should run regular benchmarks
         # Run regular benchmarks if:
@@ -355,7 +366,7 @@ def main(directory, additional_env_vars, compare_names, filter, execution_stats)
 
         if should_run_regular:
             for _ in range(options.iterations_stddev):
-                run_iterations(
+                iterations_rc = run_iterations(
                     benchmark,
                     merged_env_vars,
                     options.iterations,
@@ -375,7 +386,7 @@ def main(directory, additional_env_vars, compare_names, filter, execution_stats)
             if options.unitrace and (
                 benchmark.traceable(TracingType.UNITRACE) or args.unitrace == "force"
             ):
-                run_iterations(
+                iterations_rc = run_iterations(
                     benchmark,
                     merged_env_vars,
                     1,
@@ -389,7 +400,7 @@ def main(directory, additional_env_vars, compare_names, filter, execution_stats)
                 benchmark.traceable(TracingType.FLAMEGRAPH)
                 or args.flamegraph == "force"
             ):
-                run_iterations(
+                iterations_rc = run_iterations(
                     benchmark,
                     merged_env_vars,
                     1,
@@ -400,7 +411,10 @@ def main(directory, additional_env_vars, compare_names, filter, execution_stats)
                 )
 
             results += processed
-            execution_stats["tests_passed"] += 1
+            if iterations_rc:
+                execution_stats["tests_passed"] += 1
+            else:
+                execution_stats["tests_failed"] += 1
         except Exception as e:
             execution_stats["tests_failed"] += 1
             if options.exit_on_failure:
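
Taken together, the change makes `run_iterations` report success to its caller instead of silently breaking out of the loop, so `main` can count a benchmark as failed even when no exception escapes. Below is a minimal standalone sketch of that contract; the names, the `run_once` helper, and the simplified `Options` class are hypothetical stand-ins, not the benchmark suite's real API.

```python
# Sketch of the return-value contract introduced above (simplified, hypothetical names).
class Options:
    exit_on_failure: bool = False

options = Options()

def run_once(name: str, i: int):
    # Stand-in for benchmark.run(...); pretend the second iteration produces nothing.
    return None if i == 1 else [f"{name}-result-{i}"]

def run_iterations(name: str, iters: int, failures: dict[str, str]) -> bool:
    """Return True only if every iteration produced results."""
    for i in range(iters):
        results = run_once(name, i)
        if results is None:
            if options.exit_on_failure:
                raise RuntimeError("Benchmark produced no results!")
            failures[name] = "benchmark produced no results!"
            return False  # previously `break`, which left the caller unaware of the failure
    return True

failures: dict[str, str] = {}
execution_stats = {"tests_passed": 0, "tests_failed": 0}
if run_iterations("demo", 3, failures):
    execution_stats["tests_passed"] += 1
else:
    execution_stats["tests_failed"] += 1
print(execution_stats, failures)
# {'tests_passed': 0, 'tests_failed': 1} {'demo': 'benchmark produced no results!'}
```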