diff --git a/devops/scripts/benchmarks/benches/base.py b/devops/scripts/benchmarks/benches/base.py
index efbf7d77e003d..ea12ae04562e6 100644
--- a/devops/scripts/benchmarks/benches/base.py
+++ b/devops/scripts/benchmarks/benches/base.py
@@ -71,6 +71,17 @@ def teardown(self):
 
     @abstractmethod
     def run(self, env_vars) -> list[Result]:
+        """Execute the benchmark with the given environment variables.
+
+        Args:
+            env_vars: Environment variables to use when running the benchmark.
+
+        Returns:
+            A list of Result objects with the benchmark results.
+
+        Raises:
+            Exception: If the benchmark fails for any reason.
+        """
        pass
 
     @staticmethod
diff --git a/devops/scripts/benchmarks/benches/syclbench.py b/devops/scripts/benchmarks/benches/syclbench.py
index ffb164e2ce7cd..e851d17e0d9be 100644
--- a/devops/scripts/benchmarks/benches/syclbench.py
+++ b/devops/scripts/benchmarks/benches/syclbench.py
@@ -158,11 +158,13 @@ def run(self, env_vars) -> list[Result]:
             res_list = []
             for row in reader:
                 if not row[0].startswith("#"):
+                    # Check if the test passed
+                    if row[1] != "PASS":
+                        raise Exception(f"{row[0]} failed")
                     res_list.append(
                         Result(
                             label=f"{self.name()} {row[0]}",
                             value=float(row[12]) * 1000,  # convert to ms
-                            passed=(row[1] == "PASS"),
                             command=command,
                             env=env_vars,
                             unit="ms",
diff --git a/devops/scripts/benchmarks/main.py b/devops/scripts/benchmarks/main.py
index d90824bbb8c38..74b77a9d30581 100755
--- a/devops/scripts/benchmarks/main.py
+++ b/devops/scripts/benchmarks/main.py
@@ -41,37 +41,37 @@ def run_iterations(
 ):
     for iter in range(iters):
         print(f"running {benchmark.name()}, iteration {iter}... ", flush=True)
-        bench_results = benchmark.run(env_vars)
-        if bench_results is None:
-            if options.exit_on_failure:
-                raise RuntimeError(f"Benchmark {benchmark.name()} produced no results!")
-            else:
-                failures[benchmark.name()] = "benchmark produced no results!"
-            break
-
-        for bench_result in bench_results:
-            if not bench_result.passed:
+        try:
+            bench_results = benchmark.run(env_vars)
+            if bench_results is None:
                 if options.exit_on_failure:
-                    raise RuntimeError(
-                        f"Benchmark {benchmark.name()} failed: {bench_result.label} verification failed."
-                    )
+                    raise RuntimeError(f"Benchmark produced no results!")
                 else:
-                    failures[bench_result.label] = "verification failed"
-                    print(f"complete ({bench_result.label}: verification failed).")
-                continue
-
-            print(
-                f"{benchmark.name()} complete ({bench_result.label}: {bench_result.value:.3f} {bench_result.unit})."
-            )
+                    failures[benchmark.name()] = "benchmark produced no results!"
+                break
 
-            bench_result.name = bench_result.label
-            bench_result.lower_is_better = benchmark.lower_is_better()
-            bench_result.suite = benchmark.get_suite_name()
+            for bench_result in bench_results:
+                print(
+                    f"{benchmark.name()} complete ({bench_result.label}: {bench_result.value:.3f} {bench_result.unit})."
+                )
+                bench_result.name = bench_result.label
+                bench_result.lower_is_better = benchmark.lower_is_better()
+                bench_result.suite = benchmark.get_suite_name()
 
-            if bench_result.label not in results:
-                results[bench_result.label] = []
+                if bench_result.label not in results:
+                    results[bench_result.label] = []
 
-            results[bench_result.label].append(bench_result)
+                results[bench_result.label].append(bench_result)
+        except Exception as e:
+            failure_label = f"{benchmark.name()} iteration {iter}"
+            if options.exit_on_failure:
+                raise RuntimeError(
+                    f"Benchmark failed: {failure_label} verification failed: {str(e)}"
+                )
+            else:
+                failures[failure_label] = f"verification failed: {str(e)}"
+                print(f"complete ({failure_label}: verification failed: {str(e)}).")
+                continue
 
 
 # https://www.statology.org/modified-z-score/
diff --git a/devops/scripts/benchmarks/utils/result.py b/devops/scripts/benchmarks/utils/result.py
index c25c2da128214..64fdd8ade2d1d 100644
--- a/devops/scripts/benchmarks/utils/result.py
+++ b/devops/scripts/benchmarks/utils/result.py
@@ -15,7 +15,6 @@ class Result:
     value: float
     command: list[str]
     env: dict[str, str]
-    passed: bool = True
     unit: str = ""
     # stddev can be optionally set by the benchmark,
     # if not set, it will be calculated automatically.
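
Note (editor's sketch, not part of the patch): after this change a benchmark reports failure by raising from run() instead of returning a Result with passed=False, and run_iterations() turns the exception into a failures[] entry, or re-raises when options.exit_on_failure is set. A minimal, hypothetical suite written against the new contract could look like the code below. ExampleBench, its rows, and its values are invented for illustration; the import paths are assumed to match how main.py resolves modules inside devops/scripts/benchmarks, and the other abstract methods of Benchmark are omitted for brevity.

    # Hypothetical benchmark following the new raise-on-failure contract.
    # Import paths are assumptions based on this repo's layout.
    from benches.base import Benchmark
    from utils.result import Result

    class ExampleBench(Benchmark):
        """Sketch only: name() and run() shown, other overrides omitted."""

        def name(self):
            return "example"

        def run(self, env_vars) -> list[Result]:
            # Stand-in for parsed benchmark output: (test, status, seconds).
            rows = [("foo", "PASS", 1.234), ("bar", "PASS", 0.456)]
            res_list = []
            for test, status, seconds in rows:
                if status != "PASS":
                    # Old style: Result(..., passed=False). New style: raise,
                    # and let run_iterations() record "verification failed".
                    raise Exception(f"{test} failed")
                res_list.append(
                    Result(
                        label=f"{self.name()} {test}",
                        value=seconds * 1000,  # convert to ms
                        command=["example"],
                        env=env_vars,
                        unit="ms",
                    )
                )
            return res_list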