Skip to content

Commit 77c9a5c

Browse files
committed
Merge main into refactor/add-null-check-helpers
Resolved conflicts in harbor_env.py by keeping the helper method approach (_require_container and _require_task) over the inline if/raise checks.
2 parents 3609560 + 17836cb commit 77c9a5c

12 files changed

Lines changed: 1030 additions & 19 deletions

File tree

README.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
# ARES
1+
# ARES: Agentic Research & Evaluation Suite
2+
3+
<p align="center">
4+
<img margin="auto" width="auto" height="312" alt="image" src="https://github.com/user-attachments/assets/ae34ab36-b78f-48de-93c9-01d611a547e3" />
5+
</p>
26

37
ARES (Agentic Research and Evaluation Suite) is an RL-first framework for training and evaluating agents.
48

@@ -133,4 +137,4 @@ if __name__ == "__main__":
133137
This example uses:
134138
- **Container backend:** Local Docker (change to `daytona.DaytonaContainer` for cloud)
135139
- **LLM backend:** Martian API (or any OpenAI-compatible API)
136-
- **Code agent:** MiniSWE agent from the mini-swe-agent library
140+
- **Code agent:** MiniSWE agent from the mini-swe-agent library

examples/03_state_snapshotting.py

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
"""Example demonstrating environment state snapshotting and restoration.
2+
3+
This example shows how to:
4+
1. Create a snapshot after reset (at episode boundary)
5+
2. Save the snapshot to disk
6+
3. Restore an environment from a saved snapshot
7+
4. Continue execution from the restored state
8+
9+
Example usage:
10+
11+
1. Make sure you have examples dependencies installed
12+
`uv sync --group examples`
13+
2. Run the example
14+
`uv run -m examples.03_state_snapshotting`
15+
"""
16+
17+
import asyncio
18+
import pathlib
19+
import tempfile
20+
21+
from ares.code_agents import mini_swe_agent
22+
from ares.containers import docker
23+
from ares.environments import snapshot
24+
from ares.environments import swebench_env
25+
from ares.llms import chat_completions_compatible
26+
27+
28+
async def main():
29+
# Create an LLM client
30+
agent = chat_completions_compatible.ChatCompletionCompatibleLLMClient(model="openai/gpt-4o-mini")
31+
32+
# Load SWE-bench tasks
33+
all_tasks = swebench_env.swebench_verified_tasks()
34+
tasks = [all_tasks[0]]
35+
36+
print(f"Running on task: {tasks[0].instance_id}")
37+
print(f"Repository: {tasks[0].repo}")
38+
print("-" * 80)
39+
40+
# Create a temporary directory for snapshots
41+
with tempfile.TemporaryDirectory() as snapshot_dir:
42+
snapshot_path = pathlib.Path(snapshot_dir)
43+
44+
# === PART 1: Create and save a snapshot ===
45+
print("\n[PART 1] Creating initial environment and snapshot...")
46+
47+
async with swebench_env.SweBenchEnv(
48+
tasks=tasks,
49+
code_agent_factory=mini_swe_agent.MiniSWECodeAgent,
50+
container_factory=docker.DockerContainer,
51+
) as env:
52+
# Reset the environment to get the first timestep
53+
ts = await env.reset()
54+
print(f"Environment reset complete. Step count: {env._step_count}")
55+
56+
# Take a few steps before snapshotting
57+
for i in range(3):
58+
action = await agent(ts.observation)
59+
print(f" Step {i}: Taking action...")
60+
ts = await env.step(action)
61+
62+
if ts.last():
63+
print(" Episode completed early")
64+
break
65+
66+
print(f"Current step count: {env._step_count}")
67+
68+
# Wait for agent to finish current operation (reach episode boundary)
69+
# In practice, you'd snapshot after step() returns with done=True
70+
# or after reset() completes. For this example, we'll simulate
71+
# waiting for agent to finish.
72+
if not ts.last():
73+
print("\n Note: For snapshotting, we need to be at episode boundary.")
74+
print(" Cancelling agent task to reach boundary...")
75+
if env._code_agent_task and not env._code_agent_task.done():
76+
env._code_agent_task.cancel()
77+
import contextlib
78+
79+
with contextlib.suppress(asyncio.CancelledError):
80+
await env._code_agent_task
81+
82+
# Now we can export state (at episode boundary)
83+
print("\n Exporting state snapshot...")
84+
snap = await env.export_state(snapshot_path, snapshot_id="example-snapshot")
85+
86+
print(f" ✓ Snapshot created: {snap.snapshot_id}")
87+
print(f" ✓ Snapshot saved to: {snap.snapshot_dir}")
88+
print(f" ✓ Step count in snapshot: {snap.step_count}")
89+
print(f" ✓ Task type: {snap.task_type}")
90+
print(f" ✓ Container type: {snap.container_type}")
91+
92+
# === PART 2: Restore from snapshot ===
93+
print("\n[PART 2] Restoring environment from snapshot...")
94+
95+
# Load snapshot metadata
96+
snapshot_file = snapshot_path / "example-snapshot" / "snapshot.json"
97+
loaded_snap = snapshot.EnvironmentSnapshot.load_from_file(snapshot_file)
98+
99+
print(f" ✓ Loaded snapshot: {loaded_snap.snapshot_id}")
100+
print(f" ✓ Original step count: {loaded_snap.step_count}")
101+
102+
# Restore environment from snapshot
103+
# Note: This creates a new environment instance with the saved state
104+
restored_env = await swebench_env.SweBenchEnv.load_from_state(
105+
loaded_snap,
106+
container_factory=docker.DockerContainer,
107+
code_agent_factory=mini_swe_agent.MiniSWECodeAgent,
108+
)
109+
110+
print(" ✓ Environment restored")
111+
print(f" ✓ Restored step count: {restored_env._step_count}")
112+
print(f" ✓ Task: {restored_env._current_task.instance_id}")
113+
114+
# Use the restored environment in async context
115+
async with restored_env:
116+
print("\n[PART 3] Continuing from restored state...")
117+
118+
# The environment is now at the same state as when we snapshotted
119+
# We can continue taking steps from here
120+
ts = await restored_env.reset() # Reset to start a new episode
121+
step_count = 0
122+
123+
# Take a few more steps to demonstrate it works
124+
while not ts.last() and step_count < 3:
125+
action = await agent(ts.observation)
126+
print(f" Step {step_count}: Taking action from restored env...")
127+
ts = await restored_env.step(action)
128+
step_count += 1
129+
130+
print(f"\n ✓ Completed {step_count} additional steps from restored state")
131+
132+
print("\n" + "=" * 80)
133+
print("Snapshot example completed successfully!")
134+
print("=" * 80)
135+
print("\nKey takeaways:")
136+
print(" 1. Snapshots can only be taken at episode boundaries")
137+
print(" 2. Snapshots save: task state, container filesystem, agent messages")
138+
print(" 3. Restored environments can continue execution normally")
139+
print(" 4. Use cases: debugging, RL replay, mechanistic interpretability")
140+
141+
142+
if __name__ == "__main__":
143+
asyncio.run(main())

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ dependencies = [
2121
"daytona>=0.125.0",
2222
"docker>=7.1.0",
2323
"frozendict>=2.4.7",
24-
"harbor>=0.1.25",
24+
"harbor>=0.1.32",
2525
"httpx>=0.28.1",
2626
"jinja2>=3.1.6",
2727
"mini-swe-agent>=1.17.3",

src/ares/code_agents/mini_swe_agent.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
# Original code: https://github.com/SWE-agent/mini-swe-agent/blob/main/src/minisweagent/agents/default.py
2+
# Copyright (c) 2025 Kilian A. Lieret and Carlos E. Jimenez
3+
# Licensed under the MIT License.
4+
#
5+
# Modifications Copyright (c) 2026 Martian
6+
17
"""A Code Agent wrapping the mini-swe-agent.
28
39
Last checked: Nov 15, 2025

src/ares/containers/daytona.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import functools
77
import logging
88
import pathlib
9+
import shlex
910

1011
import daytona
1112
import daytona.common.errors
@@ -124,7 +125,8 @@ async def exec_run(
124125
# If we get a Timeout, it gets re-raised.
125126
result = await _exec_with_retry(
126127
self._sbx,
127-
command,
128+
# NOTE: Many code agents expect things like `.bashrc` to be loaded, so we use `bash -lc` here
129+
f"bash -lc {shlex.quote(command)}",
128130
workdir=workdir,
129131
env=env,
130132
timeout_s=timeout_s,

src/ares/containers/docker.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,19 @@ def _client(self) -> docker.DockerClient:
2727
async def start(self, env: dict[str, str] | None = None) -> None:
2828
"""Start the container."""
2929
if self.image is None:
30-
assert self.dockerfile_path is not None, "Must specify one of image or dockerfile_path"
30+
if self.dockerfile_path is None:
31+
raise ValueError("Must specify one of image or dockerfile_path")
32+
33+
dockerfile_path = pathlib.Path(self.dockerfile_path)
3134
# TODO: Some kind of cache for dockerfile directory to avoid
3235
# rebuilding same image over and over again?
3336
image_obj, _ = await asyncio.to_thread(
3437
self._client.images.build,
35-
path=self.dockerfile_path.parent,
38+
path=dockerfile_path.parent.as_posix(),
3639
tag=self.name,
3740
)
3841
self.image = image_obj.id
42+
assert self.image is not None, f"Image ID is None for container {self.name}"
3943

4044
self._container = await asyncio.to_thread(
4145
self._client.containers.run,
@@ -64,7 +68,9 @@ async def exec_run(
6468
result = await asyncio.wait_for(
6569
asyncio.to_thread(
6670
self._container.exec_run,
67-
["sh", "-c", command],
71+
# TODO: Consolidate this implementation into some container base class/helper fn
72+
# NOTE: Many code agents expect things like `.bashrc` to be loaded, so we use `bash -lc` here
73+
["bash", "-lc", command],
6874
workdir=workdir,
6975
environment=env,
7076
),
@@ -93,6 +99,12 @@ async def upload_files(self, local_paths: list[pathlib.Path], remote_paths: list
9399
# Determine the destination directory
94100
remote_dir = str(pathlib.Path(remote_path).parent)
95101

102+
# Create dirs if they don't exist (since otherwise Docker complains)
103+
await asyncio.to_thread(
104+
self._container.exec_run,
105+
cmd=f"mkdir -p {remote_dir}",
106+
)
107+
96108
# Upload the tar archive to the container
97109
await asyncio.to_thread(
98110
self._container.put_archive,

0 commit comments

Comments
 (0)