diff --git a/devops/scripts/benchmarks/benches/base.py b/devops/scripts/benchmarks/benches/base.py
index efbf7d77e003d..ea12ae04562e6 100644
--- a/devops/scripts/benchmarks/benches/base.py
+++ b/devops/scripts/benchmarks/benches/base.py
@@ -71,6 +71,17 @@ def teardown(self):
 
     @abstractmethod
     def run(self, env_vars) -> list[Result]:
+        """Execute the benchmark with the given environment variables.
+
+        Args:
+            env_vars: Environment variables to use when running the benchmark.
+
+        Returns:
+            A list of Result objects with the benchmark results.
+
+        Raises:
+            Exception: If the benchmark fails for any reason.
+        """
        pass
 
     @staticmethod
diff --git a/devops/scripts/benchmarks/benches/syclbench.py b/devops/scripts/benchmarks/benches/syclbench.py
index ffb164e2ce7cd..e851d17e0d9be 100644
--- a/devops/scripts/benchmarks/benches/syclbench.py
+++ b/devops/scripts/benchmarks/benches/syclbench.py
@@ -158,11 +158,13 @@ def run(self, env_vars) -> list[Result]:
             res_list = []
             for row in reader:
                 if not row[0].startswith("#"):
+                    # Check if the test passed
+                    if row[1] != "PASS":
+                        raise Exception(f"{row[0]} failed")
                     res_list.append(
                         Result(
                             label=f"{self.name()} {row[0]}",
                             value=float(row[12]) * 1000,  # convert to ms
-                            passed=(row[1] == "PASS"),
                             command=command,
                             env=env_vars,
                             unit="ms",
diff --git a/devops/scripts/benchmarks/main.py b/devops/scripts/benchmarks/main.py
index d90824bbb8c38..74b77a9d30581 100755
--- a/devops/scripts/benchmarks/main.py
+++ b/devops/scripts/benchmarks/main.py
@@ -41,37 +41,37 @@ def run_iterations(
 ):
     for iter in range(iters):
         print(f"running {benchmark.name()}, iteration {iter}... ", flush=True)
-        bench_results = benchmark.run(env_vars)
-        if bench_results is None:
-            if options.exit_on_failure:
-                raise RuntimeError(f"Benchmark {benchmark.name()} produced no results!")
-            else:
-                failures[benchmark.name()] = "benchmark produced no results!"
-            break
-
-        for bench_result in bench_results:
-            if not bench_result.passed:
+        try:
+            bench_results = benchmark.run(env_vars)
+            if bench_results is None:
                 if options.exit_on_failure:
-                    raise RuntimeError(
-                        f"Benchmark {benchmark.name()} failed: {bench_result.label} verification failed."
-                    )
+                    raise RuntimeError(f"Benchmark produced no results!")
                 else:
-                    failures[bench_result.label] = "verification failed"
-                    print(f"complete ({bench_result.label}: verification failed).")
-                continue
-
-            print(
-                f"{benchmark.name()} complete ({bench_result.label}: {bench_result.value:.3f} {bench_result.unit})."
-            )
+                    failures[benchmark.name()] = "benchmark produced no results!"
+                break
 
-            bench_result.name = bench_result.label
-            bench_result.lower_is_better = benchmark.lower_is_better()
-            bench_result.suite = benchmark.get_suite_name()
+            for bench_result in bench_results:
+                print(
+                    f"{benchmark.name()} complete ({bench_result.label}: {bench_result.value:.3f} {bench_result.unit})."
+                )
+                bench_result.name = bench_result.label
+                bench_result.lower_is_better = benchmark.lower_is_better()
+                bench_result.suite = benchmark.get_suite_name()
 
-            if bench_result.label not in results:
-                results[bench_result.label] = []
+                if bench_result.label not in results:
+                    results[bench_result.label] = []
 
-            results[bench_result.label].append(bench_result)
+                results[bench_result.label].append(bench_result)
+        except Exception as e:
+            failure_label = f"{benchmark.name()} iteration {iter}"
+            if options.exit_on_failure:
+                raise RuntimeError(
+                    f"Benchmark failed: {failure_label} verification failed: {str(e)}"
+                )
+            else:
+                failures[failure_label] = f"verification failed: {str(e)}"
+                print(f"complete ({failure_label}: verification failed: {str(e)}).")
+                continue
 
 
 # https://www.statology.org/modified-z-score/
diff --git a/devops/scripts/benchmarks/utils/result.py b/devops/scripts/benchmarks/utils/result.py
index c25c2da128214..64fdd8ade2d1d 100644
--- a/devops/scripts/benchmarks/utils/result.py
+++ b/devops/scripts/benchmarks/utils/result.py
@@ -15,7 +15,6 @@ class Result:
     value: float
     command: list[str]
     env: dict[str, str]
-    passed: bool = True
     unit: str = ""
     # stddev can be optionally set by the benchmark,
     # if not set, it will be calculated automatically.
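
Note (editor's sketch, not part of the patch): after this change a benchmark reports failure by raising from run() instead of returning a Result with passed=False, and run_iterations() turns the exception into a failures[] entry, or re-raises when options.exit_on_failure is set. A minimal, hypothetical suite written against the new contract could look like the code below. ExampleBench, its rows, and its values are invented for illustration; the import paths are assumed to match how main.py resolves modules inside devops/scripts/benchmarks, and the other abstract methods of Benchmark are omitted for brevity.

    # Hypothetical benchmark following the new raise-on-failure contract.
    # Import paths are assumptions based on this repo's layout.
    from benches.base import Benchmark
    from utils.result import Result

    class ExampleBench(Benchmark):
        """Sketch only: name() and run() shown, other overrides omitted."""

        def name(self):
            return "example"

        def run(self, env_vars) -> list[Result]:
            # Stand-in for parsed benchmark output: (test, status, seconds).
            rows = [("foo", "PASS", 1.234), ("bar", "PASS", 0.456)]
            res_list = []
            for test, status, seconds in rows:
                if status != "PASS":
                    # Old style: Result(..., passed=False). New style: raise,
                    # and let run_iterations() record "verification failed".
                    raise Exception(f"{test} failed")
                res_list.append(
                    Result(
                        label=f"{self.name()} {test}",
                        value=seconds * 1000,  # convert to ms
                        command=["example"],
                        env=env_vars,
                        unit="ms",
                    )
                )
            return res_list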