From eac24fe73e44a73f55629da0b60100cc6789e7ec Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Fri, 12 Sep 2025 18:38:07 +0530 Subject: [PATCH 01/11] Added Error Logging --- BackendBench/__init__.py | 11 ++++ BackendBench/backends/kernel_agent.py | 20 ++++++- BackendBench/backends/llm.py | 26 +++++---- BackendBench/llm_client.py | 81 ++++++++++++++++----------- 4 files changed, 91 insertions(+), 47 deletions(-) diff --git a/BackendBench/__init__.py b/BackendBench/__init__.py index 4a14d8f0..78c607bd 100644 --- a/BackendBench/__init__.py +++ b/BackendBench/__init__.py @@ -154,3 +154,14 @@ def disable() -> None: # Restore original operators _lib = None print("DirectoryBackend disabled") + + +class AgentError(Exception): + """ + Exception raised for errors related to LLM/agent failures, + such as rate limits, empty code, bad formatting, or API issues. + """ + + def __init__(self, message: str): + super().__init__(message) + self.message = message diff --git a/BackendBench/backends/kernel_agent.py b/BackendBench/backends/kernel_agent.py index fcd6c548..1ae8cece 100644 --- a/BackendBench/backends/kernel_agent.py +++ b/BackendBench/backends/kernel_agent.py @@ -8,6 +8,7 @@ import os from typing import Callable, Dict +from BackendBench import AgentError from BackendBench.utils import compile_kernel_from_string from .base import Backend @@ -236,6 +237,17 @@ def generate_kernel_with_agent(self, op, op_name: str) -> tuple[str, bool]: test_code=None, # Let KernelAgent auto-generate the test ) + # Agent error detection + if not result.get("kernel_code") or not isinstance(result.get("kernel_code"), str): + raise AgentError(f"Agent error: No kernel code produced for {op_name}.") + if "rate limit" in result.get("message", "").lower(): + raise AgentError(f"Agent error: Rate limit encountered for {op_name}.") + if ( + "error" in result.get("message", "").lower() + and "api" in result.get("message", "").lower() + ): + raise AgentError(f"Agent error: API error for {op_name}: {result.get('message')}") + if result["success"]: print(f"✅ KernelAgent succeeded for {op_name}!") print( @@ -258,9 +270,13 @@ def generate_kernel_with_agent(self, op, op_name: str) -> tuple[str, bool]: return result["kernel_code"], True else: - print(f"❌ KernelAgent failed for {op_name}: {result['message']}") - return "", False + raise AgentError( + f"Agent error: KernelAgent failed for {op_name}: {result['message']}" + ) + except AgentError as e: + print(f"❌ {e}") + return "", False except Exception as e: print(f"❌ KernelAgent error for {op_name}: {e}") return "", False diff --git a/BackendBench/backends/llm.py b/BackendBench/backends/llm.py index 6e60eb43..74c3dace 100644 --- a/BackendBench/backends/llm.py +++ b/BackendBench/backends/llm.py @@ -14,6 +14,7 @@ import torch +from BackendBench import AgentError from BackendBench.llm_client import LLMKernelGenerator from BackendBench.multiprocessing_eval import MultiprocessingEvaluator from BackendBench.utils import ( @@ -166,9 +167,19 @@ def test_kernel_correctness( "compilation_error": None, "test_errors": [], "summary": None, + "agent_error": None, } try: + # Agent error detection before compilation + if not kernel_code or not isinstance(kernel_code, str): + raise AgentError( + "Kernel code is empty or not a string (possible agent failure or rate limit)." + ) + if "rate limit" in kernel_code.lower(): + raise AgentError("Agent response indicates rate limiting.") + if "error" in kernel_code.lower() and "api" in kernel_code.lower(): + raise AgentError("Agent/API error detected in response.") kernel_file = self._generate_kernel_file_path(op_name, attempt) if not os.path.exists(kernel_file): save_kernel_to_file(kernel_code, kernel_file) @@ -177,16 +188,12 @@ def test_kernel_correctness( f"{op_name}_implementation_v{attempt}", kernel_file ) module = importlib.util.module_from_spec(spec) - - # Add to sys.modules so triton can find it sys.modules[f"{op_name}_implementation_v{attempt}"] = module try: spec.loader.exec_module(module) - expected_name = f"{op_name}_kernel_impl" if hasattr(module, expected_name): - # check if the kernel compile / is loadable _ = getattr(module, expected_name) else: available_functions = [ @@ -197,12 +204,9 @@ def test_kernel_correctness( raise ValueError( f"Expected function '{expected_name}' not found. Available: {available_functions}" ) - finally: if f"test_kernel_{op_name}_{attempt}" in sys.modules: del sys.modules[f"test_kernel_{op_name}_{attempt}"] - - # Clear CUDA cache and synchronize to prevent memory buildup if torch.cuda.is_available(): torch.cuda.empty_cache() torch.cuda.synchronize() @@ -210,7 +214,6 @@ def test_kernel_correctness( correct_count = 0 total_count = 0 correctness_results = [] - # todo: this is to protect against IMA errors, however, we should make this work / make sense with multiple workers with MultiprocessingEvaluator(1) as evaluator: loaded_kenrel = PickleableKernel(kernel_file, op_name, attempt) _ = evaluator.submit_task( @@ -219,10 +222,7 @@ def test_kernel_correctness( test_cases, [], ) - - # Start evaluation evaluator.start_evaluation() - # Get results results = evaluator.get_results() for result in results: @@ -247,6 +247,10 @@ def test_kernel_correctness( return is_correct, feedback_info + except AgentError as e: + feedback_info["agent_error"] = str(e) + feedback_info["summary"] = f"Agent error: {str(e)}" + return False, feedback_info except Exception as e: logger.error(" ✗ Compilation failed:") logger.error(f" Error: {str(e)}") diff --git a/BackendBench/llm_client.py b/BackendBench/llm_client.py index d8a5876b..4070554c 100644 --- a/BackendBench/llm_client.py +++ b/BackendBench/llm_client.py @@ -12,6 +12,8 @@ from tenacity import retry from tenacity.wait import wait_random_exponential +from BackendBench import AgentError + from .kernel_templates import KernelTemplateManager @@ -60,15 +62,22 @@ def readme_setup_section(self) -> str: @retry(wait=wait_random_exponential(multiplier=2, min=1, max=60, exp_base=2)) def call_llm(self, prompt: str) -> str: - response = self.client.messages.create( - model=self.model, - max_tokens=8000, - temperature=0.2, - timeout=120.0, - messages=[{"role": "user", "content": prompt}], - ) - content = response.content[0].text - return content + try: + response = self.client.messages.create( + model=self.model, + max_tokens=8000, + temperature=0.2, + timeout=120.0, + messages=[{"role": "user", "content": prompt}], + ) + content = response.content[0].text + if not content or "rate limit" in content.lower(): + raise AgentError("Agent error: Empty response or rate limit encountered.") + return content + except anthropic.AnthropicError as e: + raise AgentError(f"Anthropic API error: {e}") + except Exception as e: + raise AgentError(f"Unexpected agent error: {e}") def generate_kernel( self, @@ -94,7 +103,7 @@ def generate_kernel( try: content = self.call_llm(prompt) if not content: - raise RuntimeError("Empty response from LLM relay server") + raise AgentError("Agent error: Empty response from LLM relay server.") extracted_code = self._extract_code_from_response(content) @@ -107,11 +116,13 @@ def generate_kernel( return extracted_code except requests.exceptions.RequestException as e: - raise RuntimeError( - f"Failed to communicate with LLM relay server for {op_name}: {str(e)}" + raise AgentError( + f"Agent error: Failed to communicate with LLM relay server for {op_name}: {str(e)}" ) + except AgentError: + raise except Exception as e: - raise RuntimeError(f"Failed to generate kernel for {op_name}: {str(e)}") + raise AgentError(f"Agent error: Failed to generate kernel for {op_name}: {str(e)}") def generate_kernel_with_retry( self, @@ -180,16 +191,11 @@ def _format_feedback(self, feedback_info: Dict) -> str: def _extract_code_from_response(self, response: str) -> str: if "```python" not in response: - raise ValueError( - "No Python code block found in LLM response. Response should contain ```python...``` block." - ) - + raise AgentError("Agent error: No Python code block found in LLM response.") start = response.find("```python") + len("```python") end = response.find("```", start) - if end == -1: - raise ValueError("Unclosed Python code block in LLM response.") - + raise AgentError("Agent error: Unclosed Python code block in LLM response.") return response[start:end].strip() @@ -245,17 +251,24 @@ def call_llm(self, prompt: str) -> str: else None ) - response = requests.post( - self.server_url, - json=request_data, - headers={"Content-Type": "application/json"}, - timeout=120.0, - proxies=proxies, - ) - - if response.status_code != 200: - raise RuntimeError(f"Server returned status {response.status_code}: {response.text}") - - response_data = response.json() - content = response_data.get("output", "") - return content + try: + response = requests.post( + self.server_url, + json=request_data, + headers={"Content-Type": "application/json"}, + timeout=120.0, + proxies=proxies, + ) + if response.status_code != 200: + raise AgentError( + f"Agent error: Server returned status {response.status_code}: {response.text}" + ) + response_data = response.json() + content = response_data.get("output", "") + if not content or "rate limit" in content.lower(): + raise AgentError("Agent error: Empty response or rate limit encountered.") + return content + except requests.exceptions.RequestException as e: + raise AgentError(f"Agent error: Failed to communicate with LLM relay server: {str(e)}") + except Exception as e: + raise AgentError(f"Agent error: Unexpected error in LLM relay call: {e}") From 25d090a6efc7a706e3fbb95c02669fbd2939f124 Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Fri, 12 Sep 2025 18:41:28 +0530 Subject: [PATCH 02/11] Updates --- BackendBench/backends/kernel_agent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/BackendBench/backends/kernel_agent.py b/BackendBench/backends/kernel_agent.py index 1ae8cece..9b3b8195 100644 --- a/BackendBench/backends/kernel_agent.py +++ b/BackendBench/backends/kernel_agent.py @@ -271,13 +271,13 @@ def generate_kernel_with_agent(self, op, op_name: str) -> tuple[str, bool]: return result["kernel_code"], True else: raise AgentError( - f"Agent error: KernelAgent failed for {op_name}: {result['message']}" + f"Agent error: ❌ KernelAgent failed for {op_name}: {result['message']}" ) except AgentError as e: print(f"❌ {e}") return "", False - except Exception as e: + except AgentError as e: print(f"❌ KernelAgent error for {op_name}: {e}") return "", False From 01426d223086985d674ef5be99dcb5ddfed8b60a Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Sat, 13 Sep 2025 22:30:59 +0530 Subject: [PATCH 03/11] Updates --- BackendBench/backends/kernel_agent.py | 12 +----------- BackendBench/backends/llm.py | 7 ++----- BackendBench/llm_client.py | 8 +++++--- 3 files changed, 8 insertions(+), 19 deletions(-) diff --git a/BackendBench/backends/kernel_agent.py b/BackendBench/backends/kernel_agent.py index 9b3b8195..df161f17 100644 --- a/BackendBench/backends/kernel_agent.py +++ b/BackendBench/backends/kernel_agent.py @@ -8,7 +8,7 @@ import os from typing import Callable, Dict -from BackendBench import AgentError +from BackendBench.agent_errors import AgentError from BackendBench.utils import compile_kernel_from_string from .base import Backend @@ -240,13 +240,6 @@ def generate_kernel_with_agent(self, op, op_name: str) -> tuple[str, bool]: # Agent error detection if not result.get("kernel_code") or not isinstance(result.get("kernel_code"), str): raise AgentError(f"Agent error: No kernel code produced for {op_name}.") - if "rate limit" in result.get("message", "").lower(): - raise AgentError(f"Agent error: Rate limit encountered for {op_name}.") - if ( - "error" in result.get("message", "").lower() - and "api" in result.get("message", "").lower() - ): - raise AgentError(f"Agent error: API error for {op_name}: {result.get('message')}") if result["success"]: print(f"✅ KernelAgent succeeded for {op_name}!") @@ -274,9 +267,6 @@ def generate_kernel_with_agent(self, op, op_name: str) -> tuple[str, bool]: f"Agent error: ❌ KernelAgent failed for {op_name}: {result['message']}" ) - except AgentError as e: - print(f"❌ {e}") - return "", False except AgentError as e: print(f"❌ KernelAgent error for {op_name}: {e}") return "", False diff --git a/BackendBench/backends/llm.py b/BackendBench/backends/llm.py index 84c67eb4..c89a98ee 100644 --- a/BackendBench/backends/llm.py +++ b/BackendBench/backends/llm.py @@ -168,7 +168,7 @@ def test_kernel_correctness( "compilation_error": None, "test_errors": [], "summary": None, - "agent_error": None, + "agent_error": "", } try: @@ -177,10 +177,6 @@ def test_kernel_correctness( raise AgentError( "Kernel code is empty or not a string (possible agent failure or rate limit)." ) - if "rate limit" in kernel_code.lower(): - raise AgentError("Agent response indicates rate limiting.") - if "error" in kernel_code.lower() and "api" in kernel_code.lower(): - raise AgentError("Agent/API error detected in response.") kernel_file = self._generate_kernel_file_path(op_name, attempt) if not os.path.exists(kernel_file): save_kernel_to_file(kernel_code, kernel_file) @@ -215,6 +211,7 @@ def test_kernel_correctness( correct_count = 0 total_count = 0 correctness_results = [] + # todo: this is to protect against IMA errors, however, we should make this work / make sense with multiple workers with MultiprocessingEvaluator(1) as evaluator: loaded_kenrel = PickleableKernel(kernel_file, op_name, attempt) _ = evaluator.submit_task( diff --git a/BackendBench/llm_client.py b/BackendBench/llm_client.py index 782710ac..17f70a04 100644 --- a/BackendBench/llm_client.py +++ b/BackendBench/llm_client.py @@ -12,7 +12,7 @@ from tenacity import retry from tenacity.wait import wait_random_exponential -from BackendBench import AgentError +from BackendBench.agent_errors import AgentError from .kernel_templates import KernelTemplateManager @@ -71,8 +71,10 @@ def call_llm(self, prompt: str) -> str: messages=[{"role": "user", "content": prompt}], ) content = response.content[0].text - if not content or "rate limit" in content.lower(): - raise AgentError("Agent error: Empty response or rate limit encountered.") + if not content: + raise AgentError("Agent error: Empty response from LLM API (API failure or rate limit).") + if "rate limit" in content.lower(): + raise AgentError("Agent error: Rate limit encountered from LLM API.") return content except anthropic.AnthropicError as e: raise AgentError(f"Anthropic API error: {e}") From f2aad73fea6b7b8a8a3e530e02f1a6b67262617a Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Sat, 13 Sep 2025 23:01:19 +0530 Subject: [PATCH 04/11] Updates and Feedback noted --- BackendBench/agent_errors.py | 9 ++++++++- BackendBench/backends/kernel_agent.py | 21 ++++++++++----------- BackendBench/backends/llm.py | 4 ++-- BackendBench/llm_client.py | 18 +++++++----------- 4 files changed, 27 insertions(+), 25 deletions(-) diff --git a/BackendBench/agent_errors.py b/BackendBench/agent_errors.py index 78029f5b..b217566c 100644 --- a/BackendBench/agent_errors.py +++ b/BackendBench/agent_errors.py @@ -1,3 +1,10 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. + + class AgentError(Exception): """ Exception raised for errors related to LLM/agent failures, @@ -6,4 +13,4 @@ class AgentError(Exception): def __init__(self, message: str): super().__init__(message) - self.message = message \ No newline at end of file + self.message = message diff --git a/BackendBench/backends/kernel_agent.py b/BackendBench/backends/kernel_agent.py index df161f17..1f526625 100644 --- a/BackendBench/backends/kernel_agent.py +++ b/BackendBench/backends/kernel_agent.py @@ -223,22 +223,19 @@ def generate_kernel_with_agent(self, op, op_name: str) -> tuple[str, bool]: """ try: agent = self._get_kernel_agent() - - # Create problem description problem_description = self._create_problem_description_from_op(op, op_name) - print( f"🚀 Generating {op_name} kernel with KernelAgent (parallel workers + refinement)" ) - # Generate kernel using KernelAgent result = agent.generate_kernel( problem_description=problem_description, - test_code=None, # Let KernelAgent auto-generate the test + test_code=None, ) - # Agent error detection - if not result.get("kernel_code") or not isinstance(result.get("kernel_code"), str): + # Only raise AgentError if kernel_code is missing or malformed + kernel_code = result.get("kernel_code") + if not kernel_code or not isinstance(kernel_code, str): raise AgentError(f"Agent error: No kernel code produced for {op_name}.") if result["success"]: @@ -247,8 +244,6 @@ def generate_kernel_with_agent(self, op, op_name: str) -> tuple[str, bool]: f" Worker {result['worker_id']} found solution in {result['rounds']} rounds" ) print(f" Session: {result['session_dir']}") - - # Copy the session directory to our kernels directory for preservation import shutil session_name = os.path.basename(result["session_dir"]) @@ -260,9 +255,9 @@ def generate_kernel_with_agent(self, op, op_name: str) -> tuple[str, bool]: print(f" Session preserved: {preserved_session}") except Exception as e: print(f" Warning: Could not preserve session: {e}") - - return result["kernel_code"], True + return kernel_code, True else: + # This is an agent output error, so raise AgentError raise AgentError( f"Agent error: ❌ KernelAgent failed for {op_name}: {result['message']}" ) @@ -270,6 +265,10 @@ def generate_kernel_with_agent(self, op, op_name: str) -> tuple[str, bool]: except AgentError as e: print(f"❌ KernelAgent error for {op_name}: {e}") return "", False + except Exception as e: + # API/provider errors are not actionable by the agent + print(f"❌ API/provider error for {op_name}: {e}") + return "", False def __getitem__(self, key): if key in self.compiled_kernels: diff --git a/BackendBench/backends/llm.py b/BackendBench/backends/llm.py index c89a98ee..5f82beb2 100644 --- a/BackendBench/backends/llm.py +++ b/BackendBench/backends/llm.py @@ -172,10 +172,10 @@ def test_kernel_correctness( } try: - # Agent error detection before compilation + # Only raise AgentError if kernel_code is missing or malformed if not kernel_code or not isinstance(kernel_code, str): raise AgentError( - "Kernel code is empty or not a string (possible agent failure or rate limit)." + "Kernel code is empty or not a string (agent failed to produce a kernel)." ) kernel_file = self._generate_kernel_file_path(op_name, attempt) if not os.path.exists(kernel_file): diff --git a/BackendBench/llm_client.py b/BackendBench/llm_client.py index 17f70a04..07f6d8a4 100644 --- a/BackendBench/llm_client.py +++ b/BackendBench/llm_client.py @@ -72,14 +72,16 @@ def call_llm(self, prompt: str) -> str: ) content = response.content[0].text if not content: - raise AgentError("Agent error: Empty response from LLM API (API failure or rate limit).") + raise ConnectionError( + "API error: Empty response from LLM API (possible rate limit or outage)." + ) if "rate limit" in content.lower(): - raise AgentError("Agent error: Rate limit encountered from LLM API.") + raise ConnectionError("API error: Rate limit encountered from LLM API.") return content except anthropic.AnthropicError as e: - raise AgentError(f"Anthropic API error: {e}") + raise ConnectionError(f"API error: Anthropic API error: {e}") except Exception as e: - raise AgentError(f"Unexpected agent error: {e}") + raise ConnectionError(f"API error: Unexpected error: {e}") def generate_kernel( self, @@ -104,9 +106,7 @@ def generate_kernel( try: content = self.call_llm(prompt) - if not content: - raise AgentError("Agent error: Empty response from LLM relay server.") - + # Only raise AgentError if kernel extraction fails extracted_code = self._extract_code_from_response(content) print("\n=== DEBUG: RAW LLM RELAY RESPONSE ===") @@ -117,10 +117,6 @@ def generate_kernel( return extracted_code - except requests.exceptions.RequestException as e: - raise AgentError( - f"Agent error: Failed to communicate with LLM relay server for {op_name}: {str(e)}" - ) except AgentError: raise except Exception as e: From e2f1c6887ac912dae2c4bcb8f7ca23cab2c185de Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Thu, 18 Sep 2025 19:35:31 +0530 Subject: [PATCH 05/11] Rename done, ConnectionError Defined and reverted changes from kernel_agent.py --- BackendBench/backends/kernel_agent.py | 29 +++++++++------------ BackendBench/backends/llm.py | 1 - BackendBench/{agent_errors.py => errors.py} | 0 BackendBench/llm_client.py | 3 ++- 4 files changed, 14 insertions(+), 19 deletions(-) rename BackendBench/{agent_errors.py => errors.py} (100%) diff --git a/BackendBench/backends/kernel_agent.py b/BackendBench/backends/kernel_agent.py index 1f526625..fcd6c548 100644 --- a/BackendBench/backends/kernel_agent.py +++ b/BackendBench/backends/kernel_agent.py @@ -8,7 +8,6 @@ import os from typing import Callable, Dict -from BackendBench.agent_errors import AgentError from BackendBench.utils import compile_kernel_from_string from .base import Backend @@ -223,27 +222,28 @@ def generate_kernel_with_agent(self, op, op_name: str) -> tuple[str, bool]: """ try: agent = self._get_kernel_agent() + + # Create problem description problem_description = self._create_problem_description_from_op(op, op_name) + print( f"🚀 Generating {op_name} kernel with KernelAgent (parallel workers + refinement)" ) + # Generate kernel using KernelAgent result = agent.generate_kernel( problem_description=problem_description, - test_code=None, + test_code=None, # Let KernelAgent auto-generate the test ) - # Only raise AgentError if kernel_code is missing or malformed - kernel_code = result.get("kernel_code") - if not kernel_code or not isinstance(kernel_code, str): - raise AgentError(f"Agent error: No kernel code produced for {op_name}.") - if result["success"]: print(f"✅ KernelAgent succeeded for {op_name}!") print( f" Worker {result['worker_id']} found solution in {result['rounds']} rounds" ) print(f" Session: {result['session_dir']}") + + # Copy the session directory to our kernels directory for preservation import shutil session_name = os.path.basename(result["session_dir"]) @@ -255,19 +255,14 @@ def generate_kernel_with_agent(self, op, op_name: str) -> tuple[str, bool]: print(f" Session preserved: {preserved_session}") except Exception as e: print(f" Warning: Could not preserve session: {e}") - return kernel_code, True + + return result["kernel_code"], True else: - # This is an agent output error, so raise AgentError - raise AgentError( - f"Agent error: ❌ KernelAgent failed for {op_name}: {result['message']}" - ) + print(f"❌ KernelAgent failed for {op_name}: {result['message']}") + return "", False - except AgentError as e: - print(f"❌ KernelAgent error for {op_name}: {e}") - return "", False except Exception as e: - # API/provider errors are not actionable by the agent - print(f"❌ API/provider error for {op_name}: {e}") + print(f"❌ KernelAgent error for {op_name}: {e}") return "", False def __getitem__(self, key): diff --git a/BackendBench/backends/llm.py b/BackendBench/backends/llm.py index b667eee9..2d98af15 100644 --- a/BackendBench/backends/llm.py +++ b/BackendBench/backends/llm.py @@ -17,7 +17,6 @@ import torch from BackendBench.agent_errors import AgentError -from BackendBench.eval import eval_performance from BackendBench.eval import ( CorrectnessTestResult, eval_performance, diff --git a/BackendBench/agent_errors.py b/BackendBench/errors.py similarity index 100% rename from BackendBench/agent_errors.py rename to BackendBench/errors.py diff --git a/BackendBench/llm_client.py b/BackendBench/llm_client.py index 91c65919..c1cbca3d 100644 --- a/BackendBench/llm_client.py +++ b/BackendBench/llm_client.py @@ -9,10 +9,11 @@ import anthropic import requests +from requests.exceptions import ConnectionError from tenacity import retry from tenacity.wait import wait_random_exponential -from BackendBench.agent_errors import AgentError +from BackendBench.errors import AgentError from .kernel_templates import KernelTemplateManager From c172bf9ab6c85101eefabaff2d0e39441b6e267b Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Mon, 22 Sep 2025 00:10:40 +0530 Subject: [PATCH 06/11] Updates --- BackendBench/backends/llm.py | 9 +++++---- BackendBench/llm_client.py | 6 +++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/BackendBench/backends/llm.py b/BackendBench/backends/llm.py index 2d98af15..6c373925 100644 --- a/BackendBench/backends/llm.py +++ b/BackendBench/backends/llm.py @@ -16,7 +16,7 @@ import torch -from BackendBench.agent_errors import AgentError +from BackendBench.errors import AgentError from BackendBench.eval import ( CorrectnessTestResult, eval_performance, @@ -315,11 +315,12 @@ def test_kernel_correctness( kernel_file = self._generate_kernel_file_path(op_name, attempt) if not os.path.exists(kernel_file): save_kernel_to_file(kernel_code, kernel_file) - spec = importlib.util.spec_from_file_location( f"{op_name}_implementation_v{attempt}", kernel_file ) module = importlib.util.module_from_spec(spec) + + # Add to sys.modules so triton can find it sys.modules[f"{op_name}_implementation_v{attempt}"] = module try: @@ -367,8 +368,8 @@ def test_kernel_correctness( return is_correct, feedback_info except AgentError as e: - feedback_info["agent_error"] = str(e) - feedback_info["summary"] = f"Agent error: {str(e)}" + feedback_info.compilation_error = str(e) + feedback_info.summary = f"Agent error: {str(e)}" return False, feedback_info except Exception as e: logger.error(" ✗ Compilation failed:") diff --git a/BackendBench/llm_client.py b/BackendBench/llm_client.py index c1cbca3d..80e404d9 100644 --- a/BackendBench/llm_client.py +++ b/BackendBench/llm_client.py @@ -194,13 +194,13 @@ def call_llm(self, prompt: str) -> str: proxies=proxies, ) if response.status_code != 200: - raise AgentError( - f"Agent error: Server returned status {response.status_code}: {response.text}" + raise ConnectionError( + f"Server returned status {response.status_code}: {response.text}" ) response_data = response.json() content = response_data.get("output", "") if not content or "rate limit" in content.lower(): - raise AgentError("Agent error: Empty response or rate limit encountered.") + raise ConnectionError("Empty response or rate limit encountered.") return content except requests.exceptions.RequestException as e: raise AgentError(f"Agent error: Failed to communicate with LLM relay server: {str(e)}") From 17f44465d8b092305b85e42a6aa1a413da842bf6 Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Mon, 22 Sep 2025 08:48:30 +0530 Subject: [PATCH 07/11] Updates --- BackendBench/backends/llm.py | 3 +++ BackendBench/llm_client.py | 10 +++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/BackendBench/backends/llm.py b/BackendBench/backends/llm.py index 6c373925..c5e0df88 100644 --- a/BackendBench/backends/llm.py +++ b/BackendBench/backends/llm.py @@ -353,7 +353,10 @@ def test_kernel_correctness( test_cases, [], ) + + # Start evaluation evaluator.start_evaluation() + # Get results results = evaluator.get_results() for result in results: diff --git a/BackendBench/llm_client.py b/BackendBench/llm_client.py index 80e404d9..aa18ba66 100644 --- a/BackendBench/llm_client.py +++ b/BackendBench/llm_client.py @@ -118,10 +118,8 @@ def generate_kernel( return extracted_code - except AgentError: - raise except Exception as e: - raise AgentError(f"Agent error: Failed to generate kernel for {op_name}: {str(e)}") + raise RuntimeError(f"Agent error: Failed to generate kernel for {op_name}: {str(e)}") def _extract_code_from_response(self, response: str) -> str: if "```python" not in response: @@ -203,6 +201,8 @@ def call_llm(self, prompt: str) -> str: raise ConnectionError("Empty response or rate limit encountered.") return content except requests.exceptions.RequestException as e: - raise AgentError(f"Agent error: Failed to communicate with LLM relay server: {str(e)}") + raise ConnectionError( + f"Agent error: Failed to communicate with LLM relay server: {str(e)}" + ) except Exception as e: - raise AgentError(f"Agent error: Unexpected error in LLM relay call: {e}") + raise RuntimeError(f"Agent error: Unexpected error in LLM relay call: {e}") From f1689f01ed8bc0dddb7667ca520daaee01f7ad50 Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Tue, 30 Sep 2025 14:23:24 +0530 Subject: [PATCH 08/11] Updates --- BackendBench/backends/llm.py | 3 ++- BackendBench/llm_client.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/BackendBench/backends/llm.py b/BackendBench/backends/llm.py index c5e0df88..7f2dd23b 100644 --- a/BackendBench/backends/llm.py +++ b/BackendBench/backends/llm.py @@ -40,6 +40,7 @@ class FeedbackInfo: """Consolidated feedback information for kernel generation.""" compilation_error: Optional[str] = None + agent_error: Optional[str] = None correctness_results: List[CorrectnessTestResult] = None performance_results: List[PerformanceTestResult] = None summary: str = "" @@ -371,7 +372,7 @@ def test_kernel_correctness( return is_correct, feedback_info except AgentError as e: - feedback_info.compilation_error = str(e) + feedback_info.agent_error = str(e) feedback_info.summary = f"Agent error: {str(e)}" return False, feedback_info except Exception as e: diff --git a/BackendBench/llm_client.py b/BackendBench/llm_client.py index aa18ba66..54db42fc 100644 --- a/BackendBench/llm_client.py +++ b/BackendBench/llm_client.py @@ -198,7 +198,7 @@ def call_llm(self, prompt: str) -> str: response_data = response.json() content = response_data.get("output", "") if not content or "rate limit" in content.lower(): - raise ConnectionError("Empty response or rate limit encountered.") + raise RuntimeError("Empty response or rate limit encountered.") return content except requests.exceptions.RequestException as e: raise ConnectionError( From a34f20d9bb3ff938069e635defc2d22c853cbb39 Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Thu, 2 Oct 2025 15:51:07 +0530 Subject: [PATCH 09/11] Updates --- BackendBench/llm_client.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/BackendBench/llm_client.py b/BackendBench/llm_client.py index 54db42fc..76093c54 100644 --- a/BackendBench/llm_client.py +++ b/BackendBench/llm_client.py @@ -73,14 +73,14 @@ def call_llm(self, prompt: str) -> str: ) content = response.content[0].text if not content: - raise ConnectionError( + raise RuntimeError( "API error: Empty response from LLM API (possible rate limit or outage)." ) if "rate limit" in content.lower(): - raise ConnectionError("API error: Rate limit encountered from LLM API.") + raise RuntimeError("API error: Rate limit encountered from LLM API.") return content except anthropic.AnthropicError as e: - raise ConnectionError(f"API error: Anthropic API error: {e}") + raise e except Exception as e: raise ConnectionError(f"API error: Unexpected error: {e}") @@ -201,8 +201,6 @@ def call_llm(self, prompt: str) -> str: raise RuntimeError("Empty response or rate limit encountered.") return content except requests.exceptions.RequestException as e: - raise ConnectionError( - f"Agent error: Failed to communicate with LLM relay server: {str(e)}" - ) + raise ConnectionError(f"Failed to communicate with LLM relay server: {str(e)}") except Exception as e: - raise RuntimeError(f"Agent error: Unexpected error in LLM relay call: {e}") + raise RuntimeError(f"Unexpected error in LLM relay call: {e}") From d6dc1d56ed672a968b5f75145bd985d52cb125d3 Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Thu, 2 Oct 2025 16:02:58 +0530 Subject: [PATCH 10/11] Updates --- BackendBench/llm_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BackendBench/llm_client.py b/BackendBench/llm_client.py index 76093c54..964e7a16 100644 --- a/BackendBench/llm_client.py +++ b/BackendBench/llm_client.py @@ -82,7 +82,7 @@ def call_llm(self, prompt: str) -> str: except anthropic.AnthropicError as e: raise e except Exception as e: - raise ConnectionError(f"API error: Unexpected error: {e}") + raise RuntimeError(f"API error: Unexpected error: {e}") def generate_kernel( self, From 9aec2cf82820078ce5f2a4491327778aad1fceba Mon Sep 17 00:00:00 2001 From: paramthakkar123 Date: Fri, 17 Oct 2025 08:48:53 +0530 Subject: [PATCH 11/11] Updates --- BackendBench/backends/llm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/BackendBench/backends/llm.py b/BackendBench/backends/llm.py index 06c10dd3..c348814a 100644 --- a/BackendBench/backends/llm.py +++ b/BackendBench/backends/llm.py @@ -279,9 +279,9 @@ def _generate_kernel_feedback_file_path(self, op_name: str, attempt: int) -> str os.makedirs(op_dir, exist_ok=True) return os.path.join(op_dir, f"{op_name}_implementation_v{attempt}_generated_feedback.txt") - def _make_error_func(self, error_msg): + def _make_error_func(self, error_msg, op_name: str = None): def error_func(*args, **kwargs): - raise RuntimeError(f"Compilation of kernel failed: {error_msg}") + raise RuntimeError(f"Compilation of kernel failed: {error_msg} for op {op_name}") return error_func