diff --git a/README.md b/README.md index 73cc1ef..14f853b 100644 --- a/README.md +++ b/README.md @@ -126,10 +126,29 @@ The `main.py` script is the command-line interface (CLI) for running the browser | Argument | Description | Required | Default | Supported Environment(s) | |-|-|-|-|-| | `--query` | The natural language query for the browser agent to execute. | Yes | N/A | All | +| `--agent` | The agent to use. Must be one of `browser` or `form`. | No | `browser` | All | | `--env` | The computer use environment to use. Must be one of the following: `playwright`, or `browserbase` | No | N/A | All | | `--initial_url` | The initial URL to load when the browser starts. | No | https://www.google.com | All | | `--highlight_mouse` | If specified, the agent will attempt to highlight the mouse cursor's position in the screenshots. This is useful for visual debugging. | No | False (not highlighted) | `playwright` | +### Form Agent + +The `FormAgent` is a specialized agent for filling out web forms. It extends the `BrowserAgent` with the ability to read data from local JSON files to populate form fields. + +**Example Usage:** + +Run the `FormAgent` with a query that instructs it to open the local form, read the data, and fill out the fields: + +```bash +python main.py \ + --agent form \ + --initial_url "file://$(pwd)/form.html" \ + --query "Read the data from data.json and fill out the business registration form." +``` + +The agent will then open the local `form.html` file in the browser, read the `data.json` file, and fill in the corresponding fields. 
+ + ### Environment Variables | Variable | Description | Required | diff --git a/data.json b/data.json new file mode 100644 index 0000000..f2a9447 --- /dev/null +++ b/data.json @@ -0,0 +1,13 @@ +{ + "business_name": "Gemini Solutions", + "business_type": "llc", + "tax_id": "12-3456789", + "business_description": "Providing AI-powered solutions for everyday tasks.", + "owner_name": "Heiko Hotz", + "owner_email": "heiko.hotz@example.com", + "owner_phone": "123-456-7890", + "address_street": "123 AI Street", + "address_city": "Googleville", + "address_state": "CA", + "address_zip": "94043" +} diff --git a/form.html b/form.html new file mode 100644 index 0000000..f3276b2 --- /dev/null +++ b/form.html @@ -0,0 +1,153 @@ + + + + + + + Business Registration Form + + + + + +
+

Business Registration Form

+
+ +
+ Business Information + + + + + + + + + + + +
+ +
+ Owner's Information + + + + + + + + +
+ +
+ Business Address + + + + + + + + + + + +
+ + + +
+
+ + + + \ No newline at end of file diff --git a/form_agent.py b/form_agent.py new file mode 100644 index 0000000..c46cdc1 --- /dev/null +++ b/form_agent.py @@ -0,0 +1,119 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import json +from typing import Literal, Optional, Union, Any +from google import genai +from google.genai import types +import termcolor +from google.genai.types import ( + Part, + GenerateContentConfig, + Content, + Candidate, + FunctionResponse, + FinishReason, +) +import time +from rich.console import Console +from rich.table import Table + +from agent import BrowserAgent +from computers import EnvState, Computer + +MAX_RECENT_TURN_WITH_SCREENSHOTS = 3 +PREDEFINED_COMPUTER_USE_FUNCTIONS = [ + "open_web_browser", + "click_at", + "hover_at", + "type_text_at", + "scroll_document", + "scroll_at", + "wait_5_seconds", + "go_back", + "go_forward", + "search", + "navigate", + "key_combination", + "drag_and_drop", +] + + +console = Console() + +# Built-in Computer Use tools will return "EnvState". +# Custom provided functions will return "dict". 
+FunctionResponseT = Union[EnvState, dict]
+
+
+def read_data_from_json(file_path: str) -> dict:
+    """Reads a UTF-8 JSON file and returns its parsed content as a dictionary."""
+    with open(file_path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def ask_for_help(question: str) -> str:
+    """Shows the question to the user and returns the line they type in reply."""
+    return input(question)
+
+
+class FormAgent(BrowserAgent):
+    """BrowserAgent that also offers the model tools for reading local JSON data."""
+
+    def __init__(
+        self,
+        browser_computer: Computer,
+        query: str,
+        model_name: str,
+        verbose: bool = True,
+        can_ask_for_help: bool = False,
+    ):
+        super().__init__(browser_computer, query, model_name, verbose)
+        self.can_ask_for_help = can_ask_for_help
+
+        # Custom tools offered to the model; ask_for_help is strictly opt-in.
+        callables = [read_data_from_json]
+        if self.can_ask_for_help:
+            callables.append(ask_for_help)
+        custom_functions = [
+            types.FunctionDeclaration.from_callable(client=self._client, callable=fn)
+            for fn in callables
+        ]
+
+        self._generate_content_config = GenerateContentConfig(
+            temperature=1,
+            top_p=0.95,
+            top_k=40,
+            max_output_tokens=8192,
+            tools=[
+                types.Tool(
+                    computer_use=types.ComputerUse(
+                        environment=types.Environment.ENVIRONMENT_BROWSER,
+                    ),
+                ),
+                types.Tool(function_declarations=custom_functions),
+            ],
+        )
+
+    def handle_action(self, action: types.FunctionCall) -> FunctionResponseT:
+        """Runs custom tool calls locally; defers everything else to BrowserAgent."""
+        if action.name == read_data_from_json.__name__:
+            try:
+                return read_data_from_json(action.args["file_path"])
+            except (KeyError, OSError, ValueError) as e:
+                # Report the failure to the model instead of aborting the run.
+                return {"error": f"{type(e).__name__}: {e}"}
+        if action.name == ask_for_help.__name__ and self.can_ask_for_help:
+            return {"response": ask_for_help(action.args["question"])}
+        return super().handle_action(action)
diff --git a/main.py b/main.py
index 05d5537..b1df896 100644
--- a/main.py
+++ b/main.py
@@ -15,6 +16,7 @@
 import os
 
 from agent import BrowserAgent
+from form_agent import FormAgent
 from computers import BrowserbaseComputer, PlaywrightComputer
 
 
@@ -24,12 +25,19 @@
 def main() -> int:
     parser = argparse.ArgumentParser(description="Run the browser agent with a query.")
     parser.add_argument(
         "--query",
         type=str,
         required=True,
         help="The query for the browser agent to execute.",
     )
+    parser.add_argument(
+        "--agent",
+        type=str,
+        choices=("browser", "form"),
+        default="browser",
+        help="The agent to use.",
+    )
 
     parser.add_argument(
         "--env",
         type=str,
@@ -71,7 +79,8 @@ def main() -> int:
         raise ValueError("Unknown environment: ", args.env)
 
     with env as browser_computer:
-        agent = BrowserAgent(
+        agent_cls = FormAgent if args.agent == "form" else BrowserAgent
+        agent = agent_cls(
             browser_computer=browser_computer,
             query=args.query,
             model_name=args.model,