diff --git a/.gitignore b/.gitignore index ea6f4be..aa3d848 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,11 @@ research_dir/* state_saves/* __pycache__/* Figure*.png -testrun.py \ No newline at end of file +testrun.py +# Agent logs +agent_logs/* +!agent_logs/.gitkeep +__pycache__/ +agent_logs/ +research_dir/ +*.pyc \ No newline at end of file diff --git a/ai_lab_repo.py b/ai_lab_repo.py index dbe9541..782550e 100755 --- a/ai_lab_repo.py +++ b/ai_lab_repo.py @@ -139,6 +139,7 @@ def perform_research(self): Loop through all research phases @return: None """ + create_visualization_interface() for phase, subtasks in self.phases: phase_start_time = time.time() # Start timing the phase if self.verbose: print(f"{'*'*50}\nBeginning phase: {phase}\n{'*'*50}") @@ -198,6 +199,8 @@ def perform_research(self): phase_duration = phase_end_time - phase_start_time print(f"Subtask '{subtask}' completed in {phase_duration:.2f} seconds.") self.statistics_per_phase[subtask]["time"] = phase_duration + if os.path.exists("agent_logs/mle_solver_latest.json"): + print(f"\nVisualization available at research_dir/visualization.html\n") def report_refinement(self): """ @@ -526,6 +529,33 @@ def human_in_loop(self, phase, phase_prod): else: print("Invalid response, type Y or N") return False +def create_visualization_interface(): + """Create a simple React interface for visualizations""" + visualization_html = """ + + + + Agent Laboratory Visualization + + + + +
+ + + + """ + + # Save the visualization interface + with open("research_dir/visualization.html", "w") as f: + f.write(visualization_html) def parse_arguments(): @@ -728,9 +758,3 @@ def parse_arguments(): ) lab.perform_research() - - - - - - diff --git a/components/AgentVisualizer.jsx b/components/AgentVisualizer.jsx new file mode 100644 index 0000000..37733a7 --- /dev/null +++ b/components/AgentVisualizer.jsx @@ -0,0 +1,175 @@ +import React, { useState, useEffect } from 'react'; +import { LineChart, Line, XAxis, YAxis, CartesianGrid, Tooltip, Legend, ResponsiveContainer } from 'recharts'; + +// Card components +const Card = ({ children, className = '' }) => ( +
{children}
+); + +const CardHeader = ({ children }) => ( +
{children}
+); + +const CardContent = ({ children }) => ( +
{children}
+); + +const CardTitle = ({ children }) => ( +

{children}

+); + +const AgentLabVisualization = () => { + const [logData, setLogData] = useState([]); + const [selectedStep, setSelectedStep] = useState(null); + const [error, setError] = useState(null); + const [loading, setLoading] = useState(true); + + useEffect(() => { + const loadData = async () => { + try { + const response = await window.fs.readFile('agent_logs/mle_solver_latest.json'); + const data = JSON.parse(new TextDecoder().decode(response)); + setLogData(data); + } catch (err) { + console.error('Error loading log data:', err); + setError(err.message); + } finally { + setLoading(false); + } + }; + + loadData(); + }, []); + + const formatScore = (score) => (score * 100).toFixed(1) + '%'; + + if (loading) { + return ( +
+
Loading data...
+
+ ); + } + + if (error) { + return ( +
+
Error: {error}
+
+ ); + } + + return ( +
+

Agent Lab Progress

+ + + + Score Progress Over Time + + +
+ + + + + + + + + + +
+
+
+ +
+ + + Commands + + +
+ {logData.map((entry, idx) => ( +
setSelectedStep(idx)} + className={`p-4 rounded-md cursor-pointer transition duration-200 ${ + selectedStep === idx + ? 'bg-blue-50 border border-blue-200' + : 'hover:bg-gray-50 border border-transparent' + }`} + > +
+ Step {entry.step} + {entry.command} +
+
+ Score: {formatScore(entry.score)} +
+
+ ))} +
+
+
+ + + + Step Details + + + {selectedStep !== null && logData[selectedStep] ? ( +
+
+

Code Changes

+
+                                        {logData[selectedStep].code_lines.join('\n')}
+                                    
+
+
+

Model Response

+

+ {logData[selectedStep].model_response} +

+
+
+ {new Date(logData[selectedStep].timestamp).toLocaleString()} +
+
+ ) : ( +
+ Select a step to view details +
+ )} +
+
+
+
+ ); +}; + +export default AgentLabVisualization; \ No newline at end of file diff --git a/mlesolver.py b/mlesolver.py index cfc4896..9057836 100755 --- a/mlesolver.py +++ b/mlesolver.py @@ -4,11 +4,10 @@ from common_imports import * from abc import abstractmethod - from tools import * from inference import * from pathlib import Path - +from visualization import VisualizationManager from contextlib import contextmanager import sys, os @@ -56,12 +55,6 @@ def parse_command(self, cmd_str) -> tuple: pass -""" -@@@@@@@@@@@@@@@@@@ -@@ CODING TOOLS @@ -@@@@@@@@@@@@@@@@@@ -""" - class Replace(Command): def __init__(self): super().__init__() @@ -76,7 +69,6 @@ def docstring(self) -> str: ) def execute_command(self, *args) -> str: - # args[0] -> new code args = args[0] return args[0] @@ -92,7 +84,6 @@ def parse_command(self, *args) -> tuple: return True, (new_code.split("\n"), code_ret) - class Edit(Command): def __init__(self): super().__init__() @@ -107,11 +98,6 @@ def docstring(self) -> str: ) def execute_command(self, *args) -> str: - # args[0] -> N (int) - # args[1] -> M (int) - # args[2] -> old code - # args[3] -> new lines to replace - # args[4] -> new lines to replace try: args = args[0] current_code = args[2] @@ -124,7 +110,7 @@ def execute_command(self, *args) -> str: new_code = "\n".join(current_code) code_exec = f"{args[4]}\n{new_code}" code_ret = execute_code(code_exec) - if "CODE EXECUTION ERROR" in code_ret: return (False, None, code_ret) + if "[CODE EXECUTION ERROR]" in code_ret: return (False, None, code_ret) return (True, current_code, code_ret) except Exception as e: return (False, None, str(e)) @@ -148,68 +134,6 @@ def parse_command(self, *args) -> tuple: return False, (None, None, None, None, None) -def get_score(outlined_plan, code, code_return, REWARD_MODEL_LLM, attempts=3, openai_api_key=None): - e = str() - for _attempt in range(attempts): - try: - # todo: have a reward function here - sys = ( - f"You are a professor agent who is serving as an expert reward model that can read a research plan, research code, and code output and are able to determine how well a model followed the plan, built the code, and got the proper output scored from 0 to 1 as a float.\n\n" - f"You must structure your score exactly in the following way: ```SCORE\n\n``` where SCORE is just the word score, is a floating point number between 0 and 1 representing how well the model followed the plan, built the code, and got the proper output." - ) - scoring = query_model( - model_str=f"{REWARD_MODEL_LLM}", - system_prompt=sys, - openai_api_key=openai_api_key, - prompt=( - f"Outlined in the following text is the research plan that the machine learning engineer was tasked with building: {outlined_plan}\n\n" - f"The following text is the research code that the model produced: \n{code}\n\n" - f"The following is the output from the model: {code_return}\n\n"), temp=0.6) - performance = extract_prompt(text=scoring, word="SCORE") - performance = float(performance) - return performance, f"The performance of your submission is: {performance}", True - except Exception as e: - return None, str(e), False - return 0, e - - -def code_repair(code, error, ctype, REPAIR_LLM, openai_api_key=None): - if ctype == "replace": - repair_sys = ( - "You are an automated code repair tool.\n" - "Your goal is to take in code and an error and repair the code to make sure the same error does not repeat itself, and also to remove any other potential errors from the code without affecting the code output.\n" - "Your output should match the original code as closely as possible.\n" - "You must wrap the code in the following ```python\n\n```\n" - "Do not forget the opening ```python and the closing ```." - ) - model_resp = query_model( - openai_api_key=openai_api_key, - model_str=f"{REPAIR_LLM}", - system_prompt=repair_sys, - prompt=f"Provided here is the error: {error}\n\nProvided below is the code:\n\n{code}", temp=0.8) - return extract_prompt(model_resp, "python") - elif ctype == "edit": - repair_sys = ( - "You are an automated code repair tool.\n" - "Your goal is to take in code and an error and repair the code to make sure the same error does not repeat itself, and also to remove any other potential errors from the code without affecting the code output.\n" - "Your output should match the original code as closely as possible.\n" - - "============= CODE EDITING TOOL =============\n" - "You have access to a code editing tool. \n" - "This tool allows you to replace lines indexed n through m (n:m) of the current code with as many lines of new code as you want to add. This removal is inclusive meaning that line n and m and everything between n and m is removed. This will be the primary way that you interact with code. \n" - "You can edit code using the following command: ```EDIT N M\n\n``` EDIT is the word EDIT, N is the first line index you want to replace and M the the last line index you want to replace (everything inbetween will also be removed), and will be the new code that is replacing the old code. Before changing the existing code to be your new code, your new code will be tested and if it returns an error it will not replace the existing code.\n" - "Please use the code editing tool to fix this code." - "Do not forget the opening ```EDIT N M and the closing ```." - "Your output should look like the following\n\n```EDIT N M\n\n```" - ) - model_resp = query_model( - openai_api_key=openai_api_key, - model_str=f"{REPAIR_LLM}", - system_prompt=repair_sys, - prompt=f"Provided here is the error: {error}\n\nProvided below is the code:\n\n{code}", temp=0.2) - return model_resp - - class MLESolver: def __init__(self, dataset_code, openai_api_key=None, notes=None, max_steps=10, insights=None, plan=None, llm_str=None): if notes is None: self.notes = [] @@ -231,14 +155,15 @@ def __init__(self, dataset_code, openai_api_key=None, notes=None, max_steps=10, self.should_execute_code = True self.openai_api_key = openai_api_key + # Initialize visualization manager + self.viz_manager = VisualizationManager() + self.logger = self.viz_manager.get_logger("mle_solver") + def initial_solve(self): """ Initialize the solver and get an initial set of code and a return @return: None """ - # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - # @@ Initial CodeGen Commands @@ - # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ self.best_score = None self.commands = [Replace()] self.model = f"{self.llm_str}" @@ -276,6 +201,16 @@ def gen_initial_code(self): prompt=f"{err_hist}\nYou should now use ```REPLACE to create initial code to solve the challenge. Now please enter the ```REPLACE command below:\n ", temp=1.0) model_resp = self.clean_text(model_resp) cmd_str, code_lines, prev_code_ret, should_execute_code, score = self.process_command(model_resp) + + # Log initial solve attempt + self.logger.log_step({ + 'phase': 'initial_solve', + 'attempt': num_attempts, + 'command': cmd_str, + 'code_lines': code_lines, + 'score': score + }) + print(f"@@@ INIT ATTEMPT: Command Exec // Attempt {num_attempts}: ", str(cmd_str).replace("\n", " | ")) print(f"$$$ Score: {score}") if score is not None: break @@ -288,6 +223,7 @@ def solve(self): top_score = None self.prev_code_ret = None self.should_execute_code = False + while True: if len(self.commands) == 2: cmd_app_str = "You must output either the ```EDIT or ```REPLACE command immediately. " else: cmd_app_str = "" @@ -299,6 +235,17 @@ def solve(self): model_resp = self.clean_text(model_resp) self.code_lines = copy(random.choice(self.best_codes)[0]) cmd_str, code_lines, prev_code_ret, should_execute_code, score = self.process_command(model_resp) + + # Log solve attempt + self.logger.log_step({ + 'phase': 'solve', + 'attempt': num_attempts, + 'command': cmd_str, + 'code_lines': code_lines, + 'score': score, + 'model_response': model_resp + }) + self.st_history.append([model_resp, prev_code_ret, code_lines, cmd_str]) if len(self.st_history) > self.st_hist_len: self.st_history.pop(0) if score is not None: @@ -312,7 +259,20 @@ def solve(self): print(f"$$$ Score: {score}") if num_attempts >= self.min_gen_trials and top_score is not None: break num_attempts += 1 + self.code_lines, self.prev_code_ret, self.should_execute_code, model_resp, cmd_str = best_pkg + + # Log best result + self.logger.log_step({ + 'phase': 'solve_complete', + 'final_score': top_score, + 'best_code_lines': self.code_lines, + 'total_attempts': num_attempts + }) + + # Save logs + self.logger.save_logs() + # add top scoring code that was successful to the best codes if top_score > self.best_codes[-1][1]: # replace the lowest scoring one @@ -325,10 +285,6 @@ def solve(self): return model_resp, cmd_str def reflect_code(self): - """ - Provide a reflection on produced behavior for next execution - @return: (str) language model-produced reflection - """ code_strs = ("$"*40 + "\n\n").join([self.generate_code_lines(_code[0]) + f"\nCode Return {_code[1]}" for _code in self.best_codes]) code_strs = f"Please reflect on the following sets of code: {code_strs} and come up with generalizable insights that will help you improve your performance on this benchmark." syst = self.system_prompt(commands=False) + code_strs @@ -358,7 +314,7 @@ def process_command(self, model_resp): failed = True code_err = str() for _tries in range(GLOBAL_REPAIR_ATTEMPTS): - success, args = cmd.parse_command(model_resp, copy(self.code_lines), self.dataset_code) + success, args = cmd.parse_command(model_resp,copy(self.code_lines), self.dataset_code) if success: cmd_return = cmd.execute_command(args) code_err = f"Return from executing code: {cmd_return[2]}" @@ -447,7 +403,7 @@ def system_prompt(self, commands=True): f"The following are notes, instructions, and general tips for you: {self.notes}" # PLAN DESCRIPTION f"You are given a machine learning research task described, where the plan is described as follows: {self.plan}\n" - # DATASET DESCRIPTION + # DATASET DESCRIPTION f"{self.generate_dataset_descr_prompt()}" # Create Figures f"You should also try generating at least two figures to showcase the results, titled Figure_1.png and Figure_2.png\n" @@ -568,8 +524,4 @@ def run_code(self): return self.prev_code_ret elif self.should_execute_code: return execute_code("\n".join(self.code_lines)) - return "Changes have not yet been made to the code." - - - - + return "Changes have not yet been made to the code." \ No newline at end of file diff --git a/test-visualization.py b/test-visualization.py new file mode 100644 index 0000000..ee4ff8d --- /dev/null +++ b/test-visualization.py @@ -0,0 +1,201 @@ +import os +import json +import random +from datetime import datetime + +def generate_test_logs(num_entries=50): + """Generate test log entries""" + logs = [] + start_time = datetime.now() + + for i in range(num_entries): + log_entry = { + "timestamp": start_time.isoformat(), + "step": i, + "command": "EDIT" if i % 2 == 0 else "REPLACE", + "code_lines": [f"print('Step {i}')"], + "score": 0.4 + random.random() * 0.2, + "model_response": f"Test response for step {i}" + } + logs.append(log_entry) + + return logs + +def save_logs(logs, directory="agent_logs"): + """Save logs to a JSON file""" + if not os.path.exists(directory): + os.makedirs(directory) + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"mle_solver_{timestamp}.json" + filepath = os.path.join(directory, filename) + + with open(filepath, 'w') as f: + json.dump(logs, f, indent=2) + + # Create or update latest symlink + latest_link = os.path.join(directory, "mle_solver_latest.json") + if os.path.exists(latest_link): + os.remove(latest_link) + with open(latest_link, 'w') as f: + json.dump(logs, f, indent=2) + + return filepath + +def create_visualization_html(log_file_path): + """Create the visualization HTML file""" + if not os.path.exists("research_dir"): + os.makedirs("research_dir") + + html_content = """ + + + Agent Lab Visualization + + + + + + + + + +
+ + +""" + + output_path = os.path.join("research_dir", "visualization.html") + with open(output_path, 'w') as f: + f.write(html_content) + + return output_path + +def main(): + """Main function to run the visualization test""" + print("Starting visualization test...") + + # Generate and save test logs + logs = generate_test_logs() + log_file_path = save_logs(logs) + print(f"Generated test logs at: {log_file_path}") + + # Create visualization HTML + viz_path = create_visualization_html(log_file_path) + print(f"Created visualization interface at: {viz_path}") + + # Print some information about the logs + print(f"Log file contains {len(logs)} entries") + print("Sample log entry:") + print(json.dumps(logs[0], indent=2)) + + print("\nVisualization test complete!") + print("\nTo view the visualization:") + print("1. Start a local server: python -m http.server") + print("2. Open http://localhost:8000/research_dir/visualization.html in your browser") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/visualization.py b/visualization.py new file mode 100644 index 0000000..676b5a4 --- /dev/null +++ b/visualization.py @@ -0,0 +1,56 @@ +import json +import os +from datetime import datetime + +class AgentLogger: + def __init__(self, agent_name, log_dir="agent_logs"): + self.agent_name = agent_name + self.log_dir = log_dir + self.current_session = datetime.now().strftime("%Y%m%d_%H%M%S") + self.logs = [] + + # Create log directory if it doesn't exist + if not os.path.exists(log_dir): + os.makedirs(log_dir) + + def log_step(self, step_data): + """Log a single step of agent behavior""" + log_entry = { + 'timestamp': datetime.now().isoformat(), + 'step': len(self.logs), + **step_data + } + self.logs.append(log_entry) + + def save_logs(self): + """Save current logs to file""" + filename = f"{self.log_dir}/{self.agent_name}_{self.current_session}.json" + with open(filename, 'w') as f: + json.dump(self.logs, f, indent=2) + return filename + + def get_logs(self): + """Return current logs""" + return self.logs + + def clear_logs(self): + """Clear current logs""" + self.logs = [] + +class VisualizationManager: + def __init__(self): + self.loggers = {} + + def get_logger(self, agent_name): + """Get or create a logger for an agent""" + if agent_name not in self.loggers: + self.loggers[agent_name] = AgentLogger(agent_name) + return self.loggers[agent_name] + + def get_all_logs(self): + """Get logs from all agents""" + return {name: logger.get_logs() for name, logger in self.loggers.items()} + + def save_all_logs(self): + """Save logs from all agents""" + return {name: logger.save_logs() for name, logger in self.loggers.items()} \ No newline at end of file