Draft
Changes from all commits (61 commits)
5ceeb60
miniwob with mcp browser backend, first draft
ollmer Oct 31, 2025
450dacf
actions whitelist, fixes, support new order of the agent env creation…
ollmer Oct 31, 2025
2e2b8a6
miniwob config
ollmer Oct 31, 2025
630569a
llm config
ollmer Oct 31, 2025
8be56ce
fixes, use firefox
ollmer Nov 3, 2025
9acd97d
plan_react agent with function calling and sonnet llm
ollmer Nov 3, 2025
cfc85c6
fixes
ollmer Nov 3, 2025
f278c0f
fix done state parsing
ollmer Nov 7, 2025
4e27c3a
fixes
ollmer Nov 7, 2025
d1953d2
refactor loop step_info
ollmer Nov 7, 2025
5656d0b
return page snapshot to mcp playwright results
ollmer Nov 7, 2025
b06c4e2
fix loop
ollmer Nov 7, 2025
f5ad036
vision support
ollmer Nov 7, 2025
a827344
fix agent_info as dict
ollmer Nov 7, 2025
4117e0a
remove tapeagents dep from backends core, fixes
ollmer Nov 7, 2025
a3fa1c9
python playwright backend draft
ollmer Nov 7, 2025
955e0d3
fixes
ollmer Nov 10, 2025
61a537f
remove tapeagents dep, add task-level obs postprocess
ollmer Nov 11, 2025
b82aef0
fix
ollmer Nov 13, 2025
645ee2d
fix action space
ollmer Nov 13, 2025
02dee09
playwright backend
ollmer Nov 17, 2025
f591f36
fix obs format
ollmer Nov 18, 2025
01e0719
simplest react agent with markdown observations, images and tool calls
ollmer Nov 18, 2025
dba5978
fix mcp close
ollmer Nov 18, 2025
ecf59d5
async playwright backend
ollmer Nov 18, 2025
d42dfd7
fixes
ollmer Nov 18, 2025
55da7cf
format
ollmer Nov 18, 2025
1f090c2
fix pw actions
ollmer Nov 18, 2025
f2c480a
fix tapeagent
ollmer Nov 18, 2025
8be928a
fixes
ollmer Nov 18, 2025
963c999
better final step, pass loop backend init if needed, click timeouts
ollmer Nov 19, 2025
d1d5c6d
return flatten axtree from playwright backend
ollmer Nov 25, 2025
6664b69
pass backend cls, instantiate backend in task
ollmer Nov 25, 2025
ffebf6b
get html from playwright mcp
ollmer Nov 25, 2025
3378b56
better abstract class
ollmer Nov 25, 2025
323978d
init files
ollmer Nov 25, 2025
e2cd4b9
add base benchmark class to study
ollmer Nov 25, 2025
20502a8
move action and tool classes to actions module
ollmer Nov 25, 2025
dfbc005
improve entrypoint
ollmer Nov 25, 2025
7a682a0
new react toolcall agent, inspired by tapeagents but independent
ollmer Nov 25, 2025
29ba1c4
few comments
ollmer Nov 25, 2025
d9c9216
simplify history format
ollmer Nov 25, 2025
cb6d213
fix
ollmer Nov 25, 2025
cc23893
fix
ollmer Nov 25, 2025
b8e5c3a
simpler tool call object
ollmer Nov 25, 2025
3d88daf
format
ollmer Nov 25, 2025
768d37c
history compaction
ollmer Nov 25, 2025
e28eb0f
tool schemas in the action module
ollmer Nov 25, 2025
f10615f
better task interface, support old bgym tasks in the new env
ollmer Nov 26, 2025
a203e46
support new tasks interface
ollmer Nov 26, 2025
362de79
async playwright backend
ollmer Nov 26, 2025
4fe4e48
fix
ollmer Nov 26, 2025
e6f1f5d
universal rendering of any dict observation that contains only texts …
ollmer Nov 26, 2025
e7aa807
fix
ollmer Nov 26, 2025
212c0f4
remove tape agent
ollmer Nov 26, 2025
8be1174
revert tapeagent changes
ollmer Nov 26, 2025
cdd9b54
fix
ollmer Nov 26, 2025
1befd83
html pruning
ollmer Nov 26, 2025
462038e
max obs size limit, function to prepare pair of turn data for rl trai…
ollmer Dec 4, 2025
cf68ef6
workarena bench, reuse bgym task inside
ollmer Dec 4, 2025
805c717
fixes
ollmer Dec 5, 2025
81 changes: 81 additions & 0 deletions experiments/run_miniwob.py
@@ -0,0 +1,81 @@
import argparse
import logging
import os

from bgym import DEFAULT_BENCHMARKS
from dotenv import load_dotenv

from agentlab.agents.generic_agent.agent_configs import GPT5_MINI_FLAGS
from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs
from agentlab.agents.react_toolcall_agent import AgentConfig, LLMArgs, ReactToolCallAgentArgs
from agentlab.backends.browser.mcp_playwright import MCPPlaywright
from agentlab.backends.browser.playwright import SyncPlaywright
from agentlab.benchmarks.miniwob import MiniWobBenchmark
from agentlab.experiments.study import make_study
from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT

fmt = "%(asctime)s - %(levelname)s - %(name)s:%(lineno)d - %(funcName)s() - %(message)s"
logging.basicConfig(level=logging.INFO, force=True, format=fmt, handlers=[logging.StreamHandler()])
logger = logging.getLogger(__name__)
load_dotenv()


def parse_args():
    parser = argparse.ArgumentParser(description="Run MiniWob benchmark experiments")
    parser.add_argument(
        "--backend",
        choices=["playwright", "mcp", "bgym"],
        default="playwright",
        help="Browser backend to use (default: playwright)",
    )
    parser.add_argument(
        "--agent",
        choices=["tape", "generic", "react"],
        default="react",
        help="Agent type to use (default: react)",
    )
    parser.add_argument(
        "--config",
        type=str,
        default="miniwob",
        help="Hydra config name to load (default: miniwob)",
    )
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()

    if args.backend == "bgym":
        benchmark = DEFAULT_BENCHMARKS["miniwob"](n_repeats=1)
    elif args.backend == "playwright":
        benchmark = MiniWobBenchmark(backend_cls=SyncPlaywright)
    elif args.backend == "mcp":
        benchmark = MiniWobBenchmark(backend_cls=MCPPlaywright)
    else:
        raise ValueError(f"Unknown backend: {args.backend}")

    if args.agent == "generic":
        agent_args = GenericAgentArgs(
            chat_model_args=CHAT_MODEL_ARGS_DICT["azure/gpt-5-mini-2025-08-07"],
            flags=GPT5_MINI_FLAGS,
        )
    else:  # react
        agent_args = ReactToolCallAgentArgs(
            llm_args=LLMArgs(
                model_name="azure/gpt-5-mini", temperature=1.0, max_total_tokens=128000
            ),
            config=AgentConfig(),
        )

    study = make_study(
        benchmark=benchmark,
        agent_args=agent_args,
        logging_level=logging.INFO,
        logging_level_stdout=logging.INFO,
    )
    if os.environ.get("AGENTLAB_DEBUG"):
        study.exp_args_list = study.exp_args_list[23:27]
        study.run(n_jobs=1, n_relaunch=1, parallel_backend="sequential")
    else:
        study.run(n_jobs=8, n_relaunch=1, parallel_backend="ray")
125 changes: 125 additions & 0 deletions src/agentlab/actions.py
@@ -0,0 +1,125 @@
import json
import logging
from typing import Callable, Literal
from uuid import uuid4

from bgym import AbstractActionSet
from langchain_core.utils.function_calling import convert_to_openai_tool
from pydantic import BaseModel, Field

from agentlab.llm.llm_utils import parse_html_tags_raise

logger = logging.getLogger(__name__)


class FunctionSpec(BaseModel):
    """
    A class representing the specification of a function.

    Attributes:
        name (str): The name of the function.
        description (str): A brief description of the function.
        parameters (dict): A dictionary containing the parameters of the function.
    """

    name: str
    description: str
    parameters: dict


class ToolCall(BaseModel):
    id: str = Field(default_factory=lambda: uuid4().hex)
    name: str
    arguments: dict = Field(default_factory=dict)

    def llm_view(self, **kwargs) -> str:
        return self.model_dump_json(indent=2)


class ToolSpec(BaseModel):
    """
    ToolSpec is a model that represents a tool specification with a type and a function.

    Attributes:
        type (Literal["function"]): The type of the tool, which is always "function".
        function (FunctionSpec): The specification of the function.
    """

    type: Literal["function"] = "function"
    function: FunctionSpec

    def description(self) -> str:
        return f"{self.function.name} - {self.function.description}"

    @classmethod
    def from_function(cls, function: Callable):
        """
        Creates an instance of the class by validating the model from a given function.

        Args:
            function (Callable): The function to be converted and validated.

        Returns:
            (ToolSpec): An instance of the class with the validated model.
        """
        return cls.model_validate(convert_to_openai_tool(function))


class ToolsActionSet(AbstractActionSet):
    multiaction: bool = False
    strict: bool = False

    def __init__(self, actions: list[ToolSpec]):
        self.actions = actions

    def describe(self, with_long_description: bool = True, with_examples: bool = True) -> str:
        descs = []
        for action in self.actions:
            desc = f"## {action.description()}.\n Schema: {action.model_dump_json(indent=2)}"
            descs.append(desc)
        tools_description = "\n".join(descs)
        return tools_description

    def example_action(self, abstract: bool) -> str:
        if abstract:
            return """{
"name": "<action_name>",
"arguments": {
"<argument_name_1>": "<argument_value_1>",
"<argument_name_2>": "<argument_value_2>",
...
}
}"""
        else:
            return """{
"name": "browser_click",
"arguments": {
"element": "button with year 2022",
"ref": "e26"
}
}"""

    @classmethod
    def parse_action(cls, llm_output: str) -> ToolCall:
        logger.info(f"Parsing action: {llm_output}")
        if "<action>" in llm_output:
            content_dict, valid, retry_message = parse_html_tags_raise(llm_output, keys=["action"])
            if not valid or "action" not in content_dict:
                raise ValueError(
                    f"Invalid action: llm_output: {llm_output}, retry_message: {retry_message}"
                )
            action_str = content_dict["action"]
        else:
            action_str = llm_output
        try:
            action_dict = json.loads(action_str)
        except json.JSONDecodeError:
            raise ValueError(f"Failed to parse action: {action_str}")
        return ToolCall(name=action_dict["name"], arguments=action_dict["arguments"])

    def to_python_code(self, action) -> str:
        return action

    def tools(self) -> list[dict]:
        """Returns the list of tool spec dicts for LLM consumption."""
        return [tool.model_dump() for tool in self.actions]
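
Reviewer note: a minimal usage sketch of the new actions module, for context only. The browser_click helper below is a hypothetical stand-in for a backend tool; ToolSpec.from_function, ToolsActionSet.describe, tools, and parse_action are the APIs defined in this file.

from agentlab.actions import ToolSpec, ToolsActionSet


def browser_click(element: str, ref: str) -> None:
    """Click an element, identified by a short description and its accessibility ref."""
    ...


# Build the action set from plain callables; ToolSpec.from_function converts them
# to OpenAI-style tool schemas via convert_to_openai_tool.
action_set = ToolsActionSet(actions=[ToolSpec.from_function(browser_click)])

print(action_set.describe())  # tool names, descriptions and JSON schemas for the system prompt
tools = action_set.tools()    # list of dicts for the LLM `tools` parameter

# parse_action accepts either raw JSON or JSON wrapped in <action>...</action> tags.
llm_output = '{"name": "browser_click", "arguments": {"element": "button with year 2022", "ref": "e26"}}'
call = ToolsActionSet.parse_action(llm_output)
assert call.name == "browser_click" and call.arguments["ref"] == "e26"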
22 changes: 15 additions & 7 deletions src/agentlab/agents/generic_agent/generic_agent.py
@@ -10,13 +10,12 @@

from copy import deepcopy
from dataclasses import asdict, dataclass
from functools import partial
from warnings import warn

import bgym
from bgym import Benchmark
from browsergym.experiments.agent import Agent, AgentInfo

from agentlab.actions import ToolsActionSet
from agentlab.agents import dynamic_prompting as dp
from agentlab.agents.agent_args import AgentArgs
from agentlab.llm.chat_api import BaseModelArgs
@@ -65,9 +64,12 @@ def prepare(self):
    def close(self):
        return self.chat_model_args.close_server()

    def make_agent(self):
    def make_agent(self, actions: list | None = None):
        return GenericAgent(
            chat_model_args=self.chat_model_args, flags=self.flags, max_retry=self.max_retry
            chat_model_args=self.chat_model_args,
            flags=self.flags,
            max_retry=self.max_retry,
            actions=actions,
        )


@@ -78,15 +80,21 @@ def __init__(
        chat_model_args: BaseModelArgs,
        flags: GenericPromptFlags,
        max_retry: int = 4,
        actions: list | None = None,
    ):

        self.chat_llm = chat_model_args.make_model()
        self.chat_model_args = chat_model_args
        self.max_retry = max_retry

        self.flags = flags
        self.action_set = self.flags.action.action_set.make_action_set()
        self._obs_preprocessor = dp.make_obs_preprocessor(flags.obs)
        if actions is not None:
            self.action_set = ToolsActionSet(actions=actions)
            self.flags.action.action_set = self.action_set
            self._obs_preprocessor = lambda obs: obs
        else:
            self.action_set = self.flags.action.action_set.make_action_set()
            self._obs_preprocessor = dp.make_obs_preprocessor(flags.obs)

        self._check_flag_constancy()
        self.reset(seed=None)
@@ -157,7 +165,7 @@ def get_action(self, obs):
            stats=stats,
            extra_info={"chat_model_args": asdict(self.chat_model_args)},
        )
        return ans_dict["action"], agent_info
        return ans_dict["action"], asdict(agent_info)

    def reset(self, seed=None):
        self.seed = seed
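
Reviewer note: a sketch of how the new optional actions hook on GenericAgentArgs.make_agent can be exercised, assuming the environment supplies its tool specs as a list of ToolSpec; the model and flags mirror experiments/run_miniwob.py. Constructing an agent instantiates the chat model, so LLM credentials must be configured.

from agentlab.actions import ToolSpec
from agentlab.agents.generic_agent.agent_configs import GPT5_MINI_FLAGS
from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs
from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT

agent_args = GenericAgentArgs(
    chat_model_args=CHAT_MODEL_ARGS_DICT["azure/gpt-5-mini-2025-08-07"],
    flags=GPT5_MINI_FLAGS,
)

# Legacy path: no actions given, so the agent builds its action set from the flags
# and keeps the dynamic-prompting observation preprocessor.
legacy_agent = agent_args.make_agent()

# New path: the environment hands over its tool specs; the agent wraps them in a
# ToolsActionSet and passes observations through unchanged (identity preprocessor).
tool_specs: list[ToolSpec] = []  # in practice, provided by the browser backend or task
tool_agent = agent_args.make_agent(actions=tool_specs)

Passing the specs in from the outside moves ownership of the action schema from the agent flags to the environment, which appears to be the intent of this change.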