# Source: provability-fabric/tests/test_openhands_engine.py (branch: main, repo: SentinelOps-CI/provability-fabric)
# SPDX-License-Identifier: Apache-2.0
# Copyright 2025 Provability-Fabric Contributors

from __future__ import annotations

import json
import platform
import sys
import tempfile
from pathlib import Path
from unittest import mock

import pytest

# REPO_ROOT is the parent of the directory that contains this test file;
# BENCH_SWEBENCH is its ``bench/swebench`` sub-directory. Both are pushed to
# the front of ``sys.path`` (BENCH_SWEBENCH ends up ahead of REPO_ROOT when
# neither was present), presumably so the ``bench.swebench...`` imports used
# by the tests below resolve without installing the package.
REPO_ROOT = Path(__file__).resolve().parent.parent
BENCH_SWEBENCH = REPO_ROOT.joinpath("bench", "swebench")
for _import_root in map(str, (REPO_ROOT, BENCH_SWEBENCH)):
    if _import_root not in sys.path:
        sys.path.insert(0, _import_root)
del _import_root


def _is_like_diff(text: str) -> bool:
    if not text or not text.strip():
        return False
    t = text.strip()
    return "diff --git" in t or "--- " in t or t.startswith("---") or "\n@@ " in t


def test_is_like_diff_rejects_non_diff():
    """Empty, whitespace-only and plain-prose inputs are not diff-like."""
    for candidate in ("", "   \n  ", "not a patch at all"):
        assert _is_like_diff(candidate) is False


def test_is_like_diff_accepts_minimal_diff():
    """Git-style and bare unified-diff snippets are recognised as diff-like."""
    git_style = "diff --git a/x b/x\nindex 1..2\n--- a/x\n+++ b/x\n@@ -1,1 +1,1 @@\n"
    short_hunk_header = "--- a/file\n+++ b/file\n@@ -1 +1 @@\n"
    ranged_hunk_header = "--- a/x\n+++ b/x\n@@ -1,3 +1,3 @@\n"
    for sample in (git_style, short_hunk_header, ranged_hunk_header):
        assert _is_like_diff(sample) is True


@pytest.mark.skipif(platform.system() == "Windows", reason="subprocess timeout differs on Windows")
def test_get_patch_from_repo_timeout_fallback():
    """When ``subprocess.run`` times out, the returned text carries the failure marker."""
    from bench.swebench.engines import openhands_engine

    timed_out = openhands_engine.subprocess.TimeoutExpired("git diff HEAD", 1)
    with tempfile.TemporaryDirectory() as tmp:
        repo_dir = Path(tmp)
        repo_dir.joinpath(".git").mkdir()
        # Every subprocess.run call made by the engine raises the timeout.
        with mock.patch.object(openhands_engine.subprocess, "run", side_effect=timed_out):
            out = openhands_engine._get_patch_from_repo(repo_dir, timeout=1)
    assert "# git diff failed" in out


def test_parse_trajectory_missing_file_returns_empty_trace():
    """Parsing a trajectory path that does not exist yields a trace with empty lists."""
    from bench.swebench.engines import openhands_engine

    with tempfile.TemporaryDirectory() as tmp:
        missing = Path(tmp, "nonexistent.jsonl")
        trace = openhands_engine._parse_trajectory_for_trace(missing)
    for field_name in ("prompts_sent", "tool_calls", "files_modified"):
        assert getattr(trace, field_name) == []


def test_parse_trajectory_invalid_jsonl_returns_empty_trace():
    """Lines that are not valid JSON produce no raw events."""
    from bench.swebench.engines import openhands_engine

    with tempfile.TemporaryDirectory() as tmp:
        garbage_file = Path(tmp).joinpath("trajectory.jsonl")
        garbage_file.write_text("not valid json\n{{{]\n", encoding="utf-8")
        trace = openhands_engine._parse_trajectory_for_trace(garbage_file)
    assert trace.raw_events == []


def test_parse_trajectory_valid_jsonl_extracts_events():
    """A single edit-action line is kept as a raw event and its path is reported as modified."""
    from bench.swebench.engines import openhands_engine

    edit_event = {"type": "action", "action": "edit", "path": "src/foo.py"}
    with tempfile.TemporaryDirectory() as tmp:
        trajectory_file = Path(tmp, "trajectory.jsonl")
        trajectory_file.write_text(json.dumps(edit_event) + "\n", encoding="utf-8")
        trace = openhands_engine._parse_trajectory_for_trace(trajectory_file)
    assert len(trace.raw_events) == 1
    # files_modified may be None/empty; normalise before the membership check.
    modified = trace.files_modified or []
    assert "src/foo.py" in modified


def test_solve_sets_execution_mode_for_prime_intellect_subprocess():
    """With the provider normalised to ``prime_intellect``, ``solve`` reports subprocess execution metadata."""
    from bench.swebench.engines import openhands_engine

    with tempfile.TemporaryDirectory() as tmp:
        workspace = Path(tmp)
        for subdir in ("repo", "scratch"):
            workspace.joinpath(subdir).mkdir()

        # Canned 6-tuple standing in for the real subprocess runner's result.
        runner_result = ("", openhands_engine.EngineTrace(), True, None, "", "")
        with (
            mock.patch.object(openhands_engine, "_normalize_provider", return_value="prime_intellect"),
            mock.patch.object(openhands_engine, "_run_openhands_subprocess", return_value=runner_result),
        ):
            res = openhands_engine.solve(
                workspace_path=workspace,
                task_text="task",
                config=openhands_engine.OpenHandsConfig(timeout_seconds=1),
                extra_env=None,
            )

        assert res.success is True
        assert res.trace.execution_mode == "prime_subprocess"
        assert res.trace.cli_mode_forced is True
        reason = (res.trace.mode_reason or "").lower()
        assert "prime_intellect" in reason
        assert isinstance(res.trace.openhands_library_core_available, bool)


def test_subprocess_timeout_sets_timeout_origin_on_trace():
    """A ``TimeoutExpired`` raised by ``subprocess.run`` is attributed to ``subprocess_wall_timeout``."""
    from bench.swebench.engines import openhands_engine

    def _passthrough_model(prov, m):
        # Stand-in for the model-name mapper: hand the model back unchanged.
        return m

    # The exception itself carries timeout=10; the trace is expected to report
    # the configured 42 seconds instead (see the final assertion).
    wall_timeout = openhands_engine.subprocess.TimeoutExpired(
        cmd=["openhands"],
        timeout=10,
        output="",
        stderr="",
    )

    with tempfile.TemporaryDirectory() as tmp:
        tmp_root = Path(tmp)
        repo_dir = tmp_root / "repo"
        repo_dir.mkdir()
        scratch_dir = tmp_root / "scratch"
        scratch_dir.mkdir()

        config = openhands_engine.OpenHandsConfig(timeout_seconds=42, max_iterations=1)

        with (
            mock.patch.object(openhands_engine, "_llm_credentials", return_value=("pit_x", "", "prime_intellect")),
            mock.patch.object(openhands_engine, "_openhands_litellm_model", side_effect=_passthrough_model),
            mock.patch.object(openhands_engine, "_parse_trajectory_for_trace", return_value=openhands_engine.EngineTrace()),
            mock.patch.object(openhands_engine, "_get_files_modified_from_repo", return_value=[]),
            mock.patch.object(openhands_engine, "_get_patch_from_repo", return_value=""),
            mock.patch.object(openhands_engine.subprocess, "run", side_effect=wall_timeout),
        ):
            _patch, trace, success, _err, _out, _errout = openhands_engine._run_openhands_subprocess(
                repo_dir=repo_dir,
                task_text="task",
                config=config,
                scratch_dir=scratch_dir,
                extra_env=None,
            )

        assert success is False
        assert trace.timeout_origin == "subprocess_wall_timeout"
        assert trace.subprocess_timeout_seconds == 42


def test_solve_sets_first_action_latency_and_budgets_from_trace_events():
    """``solve`` fills in the budget fields and derives latencies from the trace's raw events."""
    from bench.swebench.engines import openhands_engine

    # Events at t=0s (message), t=5s (first action) and t=7s (first edit_file);
    # the expected latencies below are 5.0 and 7.0 seconds respectively.
    timeline = [
        {"timestamp": "2026-01-01T00:00:00Z", "kind": "MessageEvent"},
        {"timestamp": "2026-01-01T00:00:05Z", "kind": "ActionEvent", "tool_name": "run_terminal_cmd"},
        {"timestamp": "2026-01-01T00:00:07Z", "kind": "ActionEvent", "tool_name": "edit_file"},
    ]

    with tempfile.TemporaryDirectory() as tmp:
        workspace = Path(tmp)
        for subdir in ("repo", "scratch"):
            workspace.joinpath(subdir).mkdir()

        runner_result = ("", openhands_engine.EngineTrace(raw_events=timeline), True, None, "", "")
        with (
            mock.patch.object(openhands_engine, "_normalize_provider", return_value="prime_intellect"),
            mock.patch.object(openhands_engine, "_run_openhands_subprocess", return_value=runner_result),
        ):
            res = openhands_engine.solve(
                workspace_path=workspace,
                task_text="task",
                config=openhands_engine.OpenHandsConfig(timeout_seconds=100, max_iterations=1),
                extra_env=None,
            )

        assert res.success is True
        assert res.trace.startup_budget_s is not None
        assert res.trace.action_budget_s is not None
        assert res.trace.finalization_budget_s is not None
        assert res.trace.first_action_latency_s == 5.0
        assert res.trace.first_file_edit_latency_s == 7.0


def test_solve_sets_timeout_snapshot_when_timeout_origin_present():
    """A failed run whose trace has a ``timeout_origin`` gets a ``timeout_snapshot`` with a tail event count."""
    from bench.swebench.engines import openhands_engine

    timeline = [
        {"timestamp": "2026-01-01T00:00:00Z", "kind": "MessageEvent", "action": {"message": "boot"}},
        {"timestamp": "2026-01-01T00:00:03Z", "kind": "ActionEvent", "tool_name": "edit_file", "action": {"name": "edit_file"}},
        {"timestamp": "2026-01-01T00:00:05Z", "kind": "MessageEvent", "observation": "still running..."},
    ]

    with tempfile.TemporaryDirectory() as tmp:
        workspace = Path(tmp)
        for subdir in ("repo", "scratch"):
            workspace.joinpath(subdir).mkdir()

        timed_out_trace = openhands_engine.EngineTrace(raw_events=timeline)
        timed_out_trace.timeout_origin = "subprocess_wall_timeout"
        # Canned runner result: no patch, failure flag, and an error message.
        runner_result = ("", timed_out_trace, False, "timed out", "", "")

        with (
            mock.patch.object(openhands_engine, "_normalize_provider", return_value="prime_intellect"),
            mock.patch.object(openhands_engine, "_run_openhands_subprocess", return_value=runner_result),
        ):
            res = openhands_engine.solve(
                workspace_path=workspace,
                task_text="task",
                config=openhands_engine.OpenHandsConfig(timeout_seconds=60, max_iterations=1),
                extra_env=None,
            )

        assert res.success is False
        assert res.trace.timeout_snapshot is not None
        assert res.trace.timeout_snapshot.get("tail_event_count") is not None


def test_path_restricted_fallback_when_diff_stat_over_threshold():
    """An oversized path-restricted patch triggers a second attempt with a capped path list.

    The diff-stat file count is mocked to 250 and the trajectory reports 300
    edited paths. The first path-restricted patch exceeds ``MAX_PATCH_BYTES``;
    the second call must receive exactly ``PATH_RESTRICTED_MAX_PATHS_FALLBACK``
    paths, and its (small) patch is the one returned.
    """
    from bench.swebench.engines import openhands_engine

    edited_paths = [f"f{i}.py" for i in range(300)]
    oversized_patch = "x" * (openhands_engine.MAX_PATCH_BYTES + 1)
    small_patch = "diff --git a/f0.py b/f0.py\n--- a/f0.py\n+++ b/f0.py\n@@ -0,0 +1,1 @@\n+x\n"
    trajectory_stdout = "\n".join(
        json.dumps({"type": "action", "action": "edit", "path": edited})
        for edited in edited_paths
    )

    with tempfile.TemporaryDirectory() as tmp:
        repo_dir = Path(tmp)
        (repo_dir / ".git").mkdir()
        scratch_dir = repo_dir / "scratch"
        scratch_dir.mkdir()
        config = openhands_engine.OpenHandsConfig(timeout_seconds=60)
        finished_process = mock.Mock(returncode=0, stdout=trajectory_stdout, stderr="")

        with (
            mock.patch.object(openhands_engine, "_get_diff_stat_file_count", return_value=250),
            mock.patch.object(
                openhands_engine,
                "_get_patch_from_repo_for_paths",
                side_effect=[oversized_patch, small_patch],
            ) as m_paths,
            mock.patch.object(openhands_engine.subprocess, "run", return_value=finished_process),
        ):
            patch_str, _trace, _success, _err, _stdout, _stderr = (
                openhands_engine._run_openhands_subprocess(
                    repo_dir, "fix the bug", config, scratch_dir
                )
            )

        # The mock keeps its call history after the patch is undone.
        assert m_paths.call_count >= 2
        second_call_paths = m_paths.call_args_list[1].args[1]
        assert len(second_call_paths) == openhands_engine.PATH_RESTRICTED_MAX_PATHS_FALLBACK
        assert patch_str == small_patch