# task_processor.py

import os
import signal
import sys
import time
import zipfile
import struct # Required by some subprocesses
import subprocess

from utils.command import run_command


def flush_agents():
    """Run the ``@2501 agents --flush`` command via ``run_command``.

    Whatever ``run_command`` returns is discarded.
    """
    flush_cmd = "@2501 agents --flush"
    run_command(flush_cmd)


def get_cli_version():
    """Return the version string printed by ``@2501 --version``.

    The command's standard output is captured as text and returned with
    leading and trailing whitespace removed. The command is run with
    ``check=True``, so a non-zero exit status raises
    ``subprocess.CalledProcessError``.
    """
    version_cmd = ['@2501', '--version']
    completed = subprocess.run(version_cmd, check=True, text=True,
                               capture_output=True)
    reported = completed.stdout
    return reported.strip()


def get_engine_version():
    """Return the version string printed by ``@2501 engine-version``.

    The command's standard output is captured as text and returned with
    leading and trailing whitespace removed. The command is run with
    ``check=True``, so a non-zero exit status raises
    ``subprocess.CalledProcessError``.
    """
    engine_cmd = ['@2501', 'engine-version']
    completed = subprocess.run(engine_cmd, check=True, text=True,
                               capture_output=True)
    reported = completed.stdout
    return reported.strip()


def process_task(task, files_dir, max_retries=3, agent_config='CODING_AGENT'):
    """
    Process a single task and return its benchmark result entry.

    If ``<files_dir>/<task_id>.zip`` exists it is extracted into
    ``files_dir``; otherwise the directory ``<files_dir>/<task_id>`` is
    created. The agent command is then run (after ``cd`` into
    ``<files_dir>/<task_id>``) and the outcome is checked with the task's
    ``test_command`` or, failing that, its in-line ``test_script``.

    An attempt is retried when the agent command exits with a non-zero
    return code, or when an exception whose message contains
    "The server has returned an error" is raised. Any other exception ends
    processing immediately.

    Args:
        task (dict): The task dictionary. ``id`` and ``input`` are required;
            ``test_command`` and ``test_script`` are optional.
        files_dir (str): The directory containing the files.
        max_retries (int): Maximum number of attempts for the task.
        agent_config (str): The agent configuration to use.

    Returns:
        dict: The result entry with keys ``task_id``, ``task_name``,
        ``input_command``, ``script``, ``passed``, ``retries``,
        ``metrics`` (``duration_ms`` and ``accuracy``) and
        ``error_message``.
    """
    # NOTE: the in-line test script is given a copy of this function's local
    # variables, so the local names below are kept stable on purpose.
    start_time = time.time()
    task_id = task['id']
    input_command = task['input']
    test_command = task.get('test_command', "")
    test_script = task.get('test_script', "")

    print(f"Processing task {task_id}")

    # Unzip the corresponding zip file
    zip_path = os.path.join(files_dir, f"{task_id}.zip")
    if os.path.exists(zip_path):
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(files_dir)
        print(f"Unzipped file: {zip_path}")
    else:
        # If the zip file does not exist, create the dir
        os.makedirs(os.path.join(files_dir, task_id), exist_ok=True)

    attempts = 0
    passed = False
    error_message = None
    # Failure text of the most recent attempt when the agent command itself
    # exited with a non-zero return code (None otherwise).
    command_failure = None
    duration_ms = 0
    accuracy = 0

    # The output limiter is currently disabled (empty string). The text that
    # used to be appended to the prompt was:
    #   "IMPORTANT: You are being benchmarked, don\\'t output prose or
    #   comments. Only provide the shortest answer possible."
    prompt_limiter = ""

    # Append the limiter, keeping it inside the closing quote when the
    # command ends with one. With the limiter disabled this still appends a
    # single space; left as-is so the recorded command does not change.
    if input_command.endswith("'"):
        input_command = input_command[:-1] + " " + prompt_limiter + "'"
    else:
        input_command += " " + prompt_limiter

    while attempts < max_retries:
        attempts += 1
        command_failure = None
        try:
            if attempts > 1:
                print(f"Retrying task {task_id} (attempt {attempts})")
            flush_agents()
            # Execute the input command
            print(f"Executing command: @2501 {input_command}")

            stdout, stderr, returncode = run_command(
                f"cd {files_dir}/{task_id} && @2501 init --config {agent_config} && @2501 {input_command}")
            print(f"Command returncode: {returncode} | stdout: {stdout}")
            if stderr.strip():
                print(f"Command stderr: {stderr}")

            if returncode != 0:
                print(f"Command failed with return code {returncode} | Error output: {stderr}")
                # Remember why this attempt failed so the result entry is not
                # left without an error when every attempt ends this way.
                command_failure = (
                    f"Command failed with return code {returncode}: {stderr.strip()}")
                continue

            passed = False
            output = None

            # Run the test command or script
            if test_command:
                print(f"Executing script at {test_command}")
                # execute the script at path, with args
                out, err, code = run_command(test_command)
                print(f"Test command returncode: {code} | stdout: {out}")
                if err.strip():
                    print(f"Test command stderr: {err}")
                passed = int(code) == 0
                output = "PASS" if passed else "FAIL"
            elif test_script:
                print("Executing in-line test script")
                # Give the script a private copy of the locals: the mapping
                # returned by locals() may be shared with the frame and be
                # refreshed between attempts. Drop our own bookkeeping names
                # so a script that never sets `output` falls back to "FAIL".
                test_local = dict(locals())
                for reserved in ('output', 'test_local'):
                    test_local.pop(reserved, None)

                # SIGALRM / alarm() are only available on Unix.
                previous_handler = signal.signal(signal.SIGALRM, signal_handler)
                signal.alarm(120)  # 2 minutes timeout
                try:
                    # NOTE: the script is run with exec(); task definitions
                    # must come from a trusted source.
                    exec(test_script, globals(), test_local)
                    output = str(test_local.get('output', 'FAIL')).strip().upper()
                    passed = output == "PASS"
                except KeyboardInterrupt:
                    print('Interrupted! Terminating.')
                    sys.exit(0)
                finally:
                    signal.alarm(0)
                    # Put back whatever handler was installed before; None
                    # means it was not installed from Python and cannot be
                    # passed back to signal.signal().
                    if previous_handler is not None:
                        signal.signal(signal.SIGALRM, previous_handler)

            print(f"Test {task_id} | Passed: {passed}")
            break

        except Exception as e:
            error_message = str(e)
            print(f"Test failed: {error_message}", file=sys.stderr)
            # Retry only if it's a server error
            if "The server has returned an error" in error_message:
                continue
            break

    duration_ms = int((time.time() - start_time) * 1000)
    # `passed` can only be True after at least one attempt, so the division
    # is safe.
    accuracy = 1.0 / attempts if passed else 0.0

    if not passed and error_message is None and command_failure is not None:
        error_message = command_failure

    result_entry = {
        "task_id": task_id,
        "task_name": task_id,
        "input_command": input_command,
        "script": test_command or test_script,
        "passed": passed,
        # Never report a negative retry count when no attempt was made.
        "retries": max(attempts - 1, 0),
        "metrics": {
            "duration_ms": duration_ms,
            "accuracy": accuracy,
        },
        "error_message": error_message,
    }
    return result_entry


def signal_handler(signum, frame):
    """Signal handler that raises ``TimeoutException``.

    Args:
        signum: The signal number; it is included in the exception message.
        frame: The stack frame passed by the signal machinery (unused).

    Raises:
        TimeoutException: Always.
    """
    message = f"Timed out! {signum}"
    raise TimeoutException(message)


class TimeoutException(Exception):
    """Exception raised by ``signal_handler`` when its signal is delivered."""