diff --git a/README.md b/README.md
index 73cc1ef..14f853b 100644
--- a/README.md
+++ b/README.md
@@ -126,10 +126,29 @@ The `main.py` script is the command-line interface (CLI) for running the browser
| Argument | Description | Required | Default | Supported Environment(s) |
|-|-|-|-|-|
| `--query` | The natural language query for the browser agent to execute. | Yes | N/A | All |
+| `--agent` | The agent to use. Must be one of `browser` or `form`. | No | `browser` | All |
| `--env` | The computer use environment to use. Must be one of the following: `playwright`, or `browserbase` | No | N/A | All |
| `--initial_url` | The initial URL to load when the browser starts. | No | https://www.google.com | All |
| `--highlight_mouse` | If specified, the agent will attempt to highlight the mouse cursor's position in the screenshots. This is useful for visual debugging. | No | False (not highlighted) | `playwright` |
+### Form Agent
+
+The `FormAgent` is a specialized agent for filling out web forms. It extends the `BrowserAgent` with the ability to read data from local JSON files to populate form fields.
+
+**Example Usage:**
+
+Run the `FormAgent` with a query that instructs it to open the local form, read the data, and fill out the fields:
+
+```bash
+python main.py \
+ --agent form \
+ --initial_url "file://$(pwd)/form.html" \
+ --query "Read the data from data.json and fill out the business registration form."
+```
+
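+The agent will then open the local `form.html` file in the browser, read the `data.json` file, and fill in the corresponding fields. Run the command from the directory that contains `form.html` and `data.json`: the form URL is built from `$(pwd)`, and a relative path such as `data.json` is resolved against the current working directory.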
+
+
### Environment Variables
| Variable | Description | Required |
diff --git a/data.json b/data.json
new file mode 100644
index 0000000..f2a9447
--- /dev/null
+++ b/data.json
@@ -0,0 +1,13 @@
+{
+ "business_name": "Gemini Solutions",
+ "business_type": "llc",
+ "tax_id": "12-3456789",
+ "business_description": "Providing AI-powered solutions for everyday tasks.",
+ "owner_name": "Heiko Hotz",
+ "owner_email": "heiko.hotz@example.com",
+ "owner_phone": "123-456-7890",
+ "address_street": "123 AI Street",
+ "address_city": "Googleville",
+ "address_state": "CA",
+ "address_zip": "94043"
+}
diff --git a/form.html b/form.html
new file mode 100644
index 0000000..f3276b2
--- /dev/null
+++ b/form.html
@@ -0,0 +1,153 @@
+
+
+
+
+
+
+ Business Registration Form
+
+
+
+
+
+
+
Business Registration Form
+
+
+
+
+
+
\ No newline at end of file
diff --git a/form_agent.py b/form_agent.py
new file mode 100644
index 0000000..c46cdc1
--- /dev/null
+++ b/form_agent.py
@@ -0,0 +1,119 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import json
+from typing import Literal, Optional, Union, Any
+from google import genai
+from google.genai import types
+import termcolor
+from google.genai.types import (
+ Part,
+ GenerateContentConfig,
+ Content,
+ Candidate,
+ FunctionResponse,
+ FinishReason,
+)
+import time
+from rich.console import Console
+from rich.table import Table
+
+from agent import BrowserAgent
+from computers import EnvState, Computer
+
+# NOTE(review): MAX_RECENT_TURN_WITH_SCREENSHOTS, PREDEFINED_COMPUTER_USE_FUNCTIONS
+# and `console` are not referenced anywhere else in this module. They
+# presumably mirror definitions in agent.py -- confirm whether they are needed
+# here or can be imported from there instead of being duplicated.
+MAX_RECENT_TURN_WITH_SCREENSHOTS = 3
+# Names of Computer Use actions (see NOTE above: unused in this module).
+PREDEFINED_COMPUTER_USE_FUNCTIONS = [
+ "open_web_browser",
+ "click_at",
+ "hover_at",
+ "type_text_at",
+ "scroll_document",
+ "scroll_at",
+ "wait_5_seconds",
+ "go_back",
+ "go_forward",
+ "search",
+ "navigate",
+ "key_combination",
+ "drag_and_drop",
+]
+
+
+# Module-level rich console (see NOTE above: unused in this module).
+console = Console()
+
+# Return type of FormAgent.handle_action:
+# Built-in Computer Use tools will return "EnvState".
+# Custom provided functions will return "dict".
+FunctionResponseT = Union[EnvState, dict]
+
+
+def read_data_from_json(file_path: str) -> dict:
+    """Reads data from a JSON file and returns it as a dictionary.
+
+    The docstring and signature are also what the model sees, because this
+    function is declared to it via ``FunctionDeclaration.from_callable``.
+
+    Args:
+        file_path: Path of the JSON file to read. A relative path is resolved
+            against the current working directory.
+
+    Returns:
+        The parsed JSON document. This is a dictionary when the top-level
+        JSON value is an object; other top-level values are returned as
+        parsed.
+
+    Raises:
+        OSError: If the file cannot be opened.
+        ValueError: If the file is not valid UTF-8 or not valid JSON
+            (``json.JSONDecodeError`` and ``UnicodeDecodeError`` are both
+            subclasses).
+    """
+    # JSON text is UTF-8; do not depend on the platform's default encoding.
+    with open(file_path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def ask_for_help(question: str) -> str:
+    """Asks the user for help with a specific question.
+
+    Args:
+        question: Text shown to the user as the input prompt.
+
+    Returns:
+        The line the user typed on standard input.
+    """
+    answer = input(question)
+    return answer
+
+
+class FormAgent(BrowserAgent):
+    """Browser agent that can also load form data from local JSON files.
+
+    In addition to the Computer Use browser tool, the model is offered a
+    ``read_data_from_json`` function and, when ``can_ask_for_help`` is set,
+    an ``ask_for_help`` function. Calls to either are served locally in
+    ``handle_action``; every other action is delegated to ``BrowserAgent``.
+    """
+
+    def __init__(
+        self,
+        browser_computer: Computer,
+        query: str,
+        model_name: str,
+        verbose: bool = True,
+        can_ask_for_help: bool = False,
+    ):
+        """Creates the agent and declares its custom functions to the model.
+
+        Args:
+            browser_computer: Browser environment, forwarded to BrowserAgent.
+            query: Task for the agent, forwarded to BrowserAgent.
+            model_name: Model identifier, forwarded to BrowserAgent.
+            verbose: Forwarded to BrowserAgent.
+            can_ask_for_help: When True, ``ask_for_help`` is declared as
+                well, letting the model prompt the user on standard input.
+        """
+        super().__init__(browser_computer, query, model_name, verbose)
+        self.can_ask_for_help = can_ask_for_help
+
+        # Add your own custom functions here.
+        custom_functions = [
+            types.FunctionDeclaration.from_callable(
+                client=self._client, callable=read_data_from_json
+            )
+        ]
+        if self.can_ask_for_help:
+            custom_functions.append(
+                types.FunctionDeclaration.from_callable(
+                    client=self._client, callable=ask_for_help
+                )
+            )
+
+        # Assigned after super().__init__() so the custom functions are
+        # declared next to the browser tool (presumably replacing the config
+        # built by BrowserAgent -- confirm against agent.py).
+        self._generate_content_config = GenerateContentConfig(
+            temperature=1,
+            top_p=0.95,
+            top_k=40,
+            max_output_tokens=8192,
+            tools=[
+                types.Tool(
+                    computer_use=types.ComputerUse(
+                        environment=types.Environment.ENVIRONMENT_BROWSER,
+                    ),
+                ),
+                types.Tool(function_declarations=custom_functions),
+            ],
+        )
+
+    @staticmethod
+    def _read_json_for_model(file_path: str) -> dict:
+        """Runs ``read_data_from_json`` and always returns a dict result.
+
+        A missing, unreadable or malformed file is reported back as
+        ``{"error": ...}`` so the model can react instead of the whole run
+        aborting. A non-object JSON document is wrapped as ``{"data": ...}``
+        because custom functions are expected to return a dict.
+        """
+        # NOTE(security): file_path is chosen by the model, which can be
+        # influenced by page content; any JSON file readable by this process
+        # may be returned. Consider restricting reads to a known directory.
+        try:
+            data = read_data_from_json(file_path)
+        except (OSError, ValueError) as exc:
+            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+            return {"error": f"Could not read {file_path!r}: {exc}"}
+        if isinstance(data, dict):
+            return data
+        return {"data": data}
+
+    def handle_action(self, action: types.FunctionCall) -> FunctionResponseT:
+        """Handles the action and returns the environment state.
+
+        Args:
+            action: Function call requested by the model.
+
+        Returns:
+            A dict for the custom functions handled here, otherwise whatever
+            ``BrowserAgent.handle_action`` returns for the action.
+        """
+        if action.name == read_data_from_json.__name__:
+            return self._read_json_for_model(action.args["file_path"])
+        # ask_for_help is served only when it was declared to the model;
+        # otherwise the call falls through to the base class.
+        if action.name == ask_for_help.__name__ and self.can_ask_for_help:
+            return {"response": ask_for_help(action.args["question"])}
+        return super().handle_action(action)
diff --git a/main.py b/main.py
index 05d5537..b1df896 100644
--- a/main.py
+++ b/main.py
@@ -15,6 +15,7 @@
import os
from agent import BrowserAgent
+from form_agent import FormAgent
from computers import BrowserbaseComputer, PlaywrightComputer
@@ -24,12 +25,12 @@
def main() -> int:
parser = argparse.ArgumentParser(description="Run the browser agent with a query.")
parser.add_argument(
- "--query",
+ "--agent",
type=str,
- required=True,
- help="The query for the browser agent to execute.",
+ choices=("browser", "form"),
+ default="browser",
+ help="The agent to use.",
)
-
parser.add_argument(
"--env",
type=str,
@@ -71,14 +72,21 @@ def main() -> int:
raise ValueError("Unknown environment: ", args.env)
with env as browser_computer:
- agent = BrowserAgent(
- browser_computer=browser_computer,
- query=args.query,
- model_name=args.model,
- )
+ if args.agent == "form":
+ agent = FormAgent(
+ browser_computer=browser_computer,
+ query=args.query,
+ model_name=args.model,
+ )
+ else:
+ agent = BrowserAgent(
+ browser_computer=browser_computer,
+ query=args.query,
+ model_name=args.model,
+ )
agent.agent_loop()
return 0
if __name__ == "__main__":
- main()
+ main()
\ No newline at end of file