diff --git a/DGM_outer.py b/DGM_outer.py
index 2c87f4b..082925c 100644
--- a/DGM_outer.py
+++ b/DGM_outer.py
@@ -47,13 +47,12 @@ def any_exceeding_context_length(output_dir, commit_id, instance_ids):
             return True
     return False
 
-def choose_selfimproves(output_dir, archive, selfimprove_size, method='random', run_baseline=None, polyglot=False):
+def choose_selfimproves(output_dir, archive, selfimprove_size, fusion_probability, method='random', run_baseline=None, polyglot=False):
     """
     Choose self-improve attempts for the current generation.
+    May include single-parent entries or two-parent fusion entries.
     """
     selfimprove_entries = []
-
-    # Get parent candidates
     candidates = {}
     for commit in archive:
         try:
@@ -75,77 +74,101 @@ def choose_selfimproves(output_dir, archive, selfimprove_size, method='random',
             print(f"{commit} not eligible for being a parent: {e}")
             continue
 
-    # Choose parents based on method and baseline
-    if run_baseline == 'no_darwin':
-        # Always take the last commit
-        commits = list(candidates.keys())
-        parent_commits = commits[-1:]
-    elif method == 'score_prop':
-        # Choose parents based on score
-        commits = list(candidates.keys())
-        scores = [candidates[commit]['accuracy_score'] for commit in commits]
-        scores = [1 / (1 + math.exp(-10*(score-0.5))) for score in scores]
-        probabilities = [score / sum(scores) for score in scores]
-        print(commits)
-        parent_commits = random.choices(commits, probabilities, k=selfimprove_size)
-    elif method == 'score_child_prop':
-        # Choose parents based on score and the number of children
-        commits = list(candidates.keys())
-        scores = [candidates[commit]['accuracy_score'] for commit in commits]
-        scores = [1 / (1 + math.exp(-10*(score-0.5))) for score in scores]
-        children_counts = [candidates[commit]['children_count'] for commit in commits]
-        children_counts = [1 / (1 + count) for count in children_counts]
-        probabilities = [score * count for score, count in zip(scores, children_counts)]
-        probabilities = [prob / sum(probabilities) for prob in probabilities]
-        parent_commits = random.choices(commits, probabilities, k=selfimprove_size)
-    elif method == 'best':
-        # Choose parents with the best score
-        sorted_commits = sorted(candidates, key=lambda x: candidates[x]['accuracy_score'])
-        parent_commits = sorted_commits[:min(selfimprove_size, len(sorted_commits))]
-        if len(parent_commits) < selfimprove_size:
-            parent_commits.extend(random.choices(parent_commits, k=selfimprove_size - len(parent_commits)))
-    else:
-        # Choose parents randomly
-        parent_commits = random.choices(list(candidates.keys()), k=selfimprove_size)
-
-    # Choose entries for each parent
-    for parent_commit in parent_commits:
-        empty_ids = candidates[parent_commit]['total_emptypatch_ids']
-        resolved_ids = candidates[parent_commit]['total_resolved_ids']
-        unresolved_ids = candidates[parent_commit]['total_unresolved_ids']
+    candidate_commits_list = list(candidates.keys())
+    if not candidate_commits_list:
+        # No eligible parents, return empty list
+        return []
+
+    for _ in range(selfimprove_size):
+        attempt_fusion = random.random() < fusion_probability
 
-        if polyglot:
-            entry_ids = empty_ids + unresolved_ids
-            if not entry_ids:
-                entry_ids = resolved_ids + empty_ids + unresolved_ids
+        if attempt_fusion and len(candidate_commits_list) >= 2:
+            parent1, parent2 = random.sample(candidate_commits_list, 2)
+            selfimprove_entries.append(((parent1, parent2), "fuse_parents"))
+            continue  # Move to next attempt
         else:
-            num_total_ids = len(empty_ids) + len(resolved_ids) + len(unresolved_ids)
-
-            # Solve empty patches
-            if len(empty_ids) >= 0.1 * num_total_ids and random.random() < 0.25:
-                entry = 'solve_empty_patches'
-                selfimprove_entries.append((parent_commit, entry))
+            # Single-parent selection (or fallback if len(candidate_commits_list) < 2)
+            if not candidate_commits_list:
                 continue
-            # Solve stochasticity
-            if random.random() < 0.25:
-                entry = 'solve_stochasticity'
-                selfimprove_entries.append((parent_commit, entry))
+            parent_commit = None
+            if run_baseline == 'no_darwin':
+                parent_commit = candidate_commits_list[-1]  # Always take the last commit
+            elif method == 'score_prop':
+                scores = [candidates[c]['accuracy_score'] for c in candidate_commits_list]
+                # Sigmoid scaling for scores
+                scaled_scores = [1 / (1 + math.exp(-10*(score-0.5))) for score in scores]
+                if sum(scaled_scores) == 0:  # Avoid division by zero if all scaled scores are 0
+                    parent_commit = random.choice(candidate_commits_list)
+                else:
+                    probabilities = [s / sum(scaled_scores) for s in scaled_scores]
+                    parent_commit = random.choices(candidate_commits_list, probabilities, k=1)[0]
+            elif method == 'score_child_prop':
+                scores = [candidates[c]['accuracy_score'] for c in candidate_commits_list]
+                scaled_scores = [1 / (1 + math.exp(-10*(score-0.5))) for score in scores]
+                children_counts = [candidates[c]['children_count'] for c in candidate_commits_list]
+                # Inverse of children count (add 1 to avoid division by zero and to down-weight parents with more children)
+                child_weights = [1 / (1 + count) for count in children_counts]
+
+                combined_weights = [s * cw for s, cw in zip(scaled_scores, child_weights)]
+                if sum(combined_weights) == 0:
+                    parent_commit = random.choice(candidate_commits_list)
+                else:
+                    probabilities = [w / sum(combined_weights) for w in combined_weights]
+                    parent_commit = random.choices(candidate_commits_list, probabilities, k=1)[0]
+            elif method == 'best':
+                # Sort by accuracy_score descending and pick the top one.
+                # This means that if multiple single-parent entries are made, they might all use the same best parent.
+                parent_commit = sorted(candidate_commits_list, key=lambda c: candidates[c]['accuracy_score'], reverse=True)[0]
+            else:  # 'random' or default
+                parent_commit = random.choice(candidate_commits_list)
+
+            if parent_commit is None:  # Should ideally not happen if candidate_commits_list is not empty
                 continue
-            # Solve context length
-            if any_exceeding_context_length(output_dir, parent_commit, empty_ids + unresolved_ids) and \
-                random.random() < 0.25:
-                entry = 'solve_contextlength'
-                selfimprove_entries.append((parent_commit, entry))
+            empty_ids = candidates[parent_commit]['total_emptypatch_ids']
+            resolved_ids = candidates[parent_commit]['total_resolved_ids']
+            unresolved_ids = candidates[parent_commit]['total_unresolved_ids']
+
+            if polyglot:
+                entry_ids = empty_ids + unresolved_ids
+                if not entry_ids:
+                    entry_ids = resolved_ids + empty_ids + unresolved_ids
+            else:
+                num_total_ids = len(empty_ids) + len(resolved_ids) + len(unresolved_ids)
+                if len(empty_ids) >= 0.1 * num_total_ids and random.random() < 0.25:
+                    entry = 'solve_empty_patches'
+                    selfimprove_entries.append((parent_commit, entry))
+                    continue
+                if random.random() < 0.25:
+                    entry = 'solve_stochasticity'
+                    selfimprove_entries.append((parent_commit, entry))
+                    continue
+                if any_exceeding_context_length(output_dir, parent_commit, empty_ids + unresolved_ids) and \
+                        random.random() < 0.25:
+                    entry = 'solve_contextlength'
+                    selfimprove_entries.append((parent_commit, entry))
+                    continue
+                if not unresolved_ids:  # Replaces the original `unresolved_ids == 0` check, for clarity
+                    # If no specific entry type was chosen and there are no unresolved tasks, what should happen?
+                    # Maybe pick from resolved_ids, or skip this parent for this iteration?
+                    # For now, if there are no unresolved tasks, we might not add an entry, leading to fewer than selfimprove_size entries.
+                    # This needs to be handled: either ensure an entry or adjust the loop.
+                    # The original code also hit `continue` when unresolved_ids == 0, which means
+                    # it could likewise produce fewer than selfimprove_size entries.
+                    # Keep that behavior for now.
+                    if not (empty_ids + resolved_ids + unresolved_ids):  # if there are truly no tasks at all
+                        continue  # skip this parent
+                entry_ids = unresolved_ids if unresolved_ids else (empty_ids + resolved_ids)
+
+
+            if not entry_ids:  # If, after all the logic above, entry_ids is still empty (e.g. the polyglot case with no tasks)
+                # This case should ideally be prevented by ensuring 'candidates' only includes commits with tasks,
+                # or by having a default task. For now, skip adding an entry.
                 continue
-            # Choose a random unresolved entry
-            if unresolved_ids == 0:
-                continue
-            entry_ids = unresolved_ids
-        entry = random.choice(entry_ids)
-        selfimprove_entries.append((parent_commit, entry))
+            entry = random.choice(entry_ids)
+            selfimprove_entries.append((parent_commit, entry))
 
     return selfimprove_entries
@@ -237,8 +260,9 @@ def main():
     parser.add_argument("--polyglot", default=False, action='store_true', help="Run single shallow evaluation for self-improvement on swe.")
     parser.add_argument("--eval_noise", type=float, default=0.1, help="Noise leeway for evaluation.")
     parser.add_argument("--no_full_eval", default=False, action='store_true', help="Do not run full evaluation on swe if a node is the top N highest performing.")
-    # baselines
+    # baselines
     parser.add_argument("--run_baseline", type=str, default=None, choices=['no_selfimprove', 'no_darwin'], help="Baseline to run.")
+    parser.add_argument("--fusion_probability", type=float, default=0.25, help="Probability of attempting a two-parent fusion.")
     args = parser.parse_args()
 
     # Variables for this DGM run
@@ -271,6 +295,7 @@ def main():
         # Choose self-improve attempts
         selfimprove_entries = choose_selfimproves(
             output_dir, archive, args.selfimprove_size,
+            args.fusion_probability,  # Pass the new argument
             method=args.choose_selfimproves_method,
             run_baseline=args.run_baseline,
             polyglot=args.polyglot,
diff --git a/coding_agent.py b/coding_agent.py
index c32dfa3..464f5e8 100644
--- a/coding_agent.py
+++ b/coding_agent.py
@@ -74,14 +74,27 @@ def __init__(
         test_description=None,
         self_improve=False,
         instance_id=None,
+        # Fusion task specific arguments
+        is_fusion_task=False,
+        parent1_patch_file=None,
+        parent2_patch_file=None,
+        parent1_commit_id=None,
+        parent2_commit_id=None
     ):
         self.problem_statement = problem_statement
         self.git_tempdir = git_tempdir
-        self.base_commit = base_commit
+        self.base_commit = base_commit  # This is the SHA of the common ancestor for fusion
        self.chat_history_file = chat_history_file
         self.test_description = test_description
         self.self_improve = self_improve
         self.instance_id = instance_id if not self_improve else 'dgm'
+
+        self.is_fusion_task = is_fusion_task
+        self.parent1_patch_file = parent1_patch_file
+        self.parent2_patch_file = parent2_patch_file
+        self.parent1_commit_id = parent1_commit_id
+        self.parent2_commit_id = parent2_commit_id
+
         self.code_model = CLAUDE_MODEL
 
         # Initialize logger and store it in thread-local storage
@@ -154,7 +167,34 @@ def forward(self):
         """
         The forward function for the AgenticSystem.
         """
-        instruction = f"""I have uploaded a Python code repository in the directory {self.git_tempdir}. Help solve the following problem.
+        if self.is_fusion_task:
+            self.logger.info("Fusion task detected.")
+            try:
+                with open(self.parent1_patch_file, 'r') as f:
+                    parent1_patch_content = f.read()
+                with open(self.parent2_patch_file, 'r') as f:
+                    parent2_patch_content = f.read()
+            except FileNotFoundError as e:
+                self.logger.error(f"Error: Parent patch file not found: {e}. This will result in an empty patch.")
+                # Exit early; with no chat_with_agent call this run yields an empty diff.
+                return  # No instruction to run
+
+            # Dynamically import here to avoid issues if this file is imported elsewhere
+            # where prompts.fusion_prompt might not be immediately available or needed.
+            from prompts.fusion_prompt import get_fusion_prompt
+
+            instruction = get_fusion_prompt(
+                base_commit_id=self.base_commit,  # base_commit is the SHA of the common ancestor
+                parent1_commit_id=self.parent1_commit_id,
+                parent1_patch_content=parent1_patch_content,
+                parent2_commit_id=self.parent2_commit_id,
+                parent2_patch_content=parent2_patch_content,
+                existing_problem_statement=self.problem_statement  # Original problem statement for context
+            )
+            self.logger.info(f"Fusion instruction generated for base {self.base_commit}, P1 {self.parent1_commit_id}, P2 {self.parent2_commit_id}")
+        else:
+            self.logger.info("Standard task detected.")
+            instruction = f"""I have uploaded a Python code repository in the directory {self.git_tempdir}. Help solve the following problem.
 
 {self.problem_statement}
 
@@ -166,6 +206,7 @@ def forward(self):
 Your task is to make changes to the files in the {self.git_tempdir} directory to address the . I have already taken care of the required dependencies.
 """
+        self.logger.info(f"Instruction for chat_with_agent (first 200 chars):\n{instruction[:200]}...")
         new_msg_history = chat_with_agent(instruction, model=self.code_model, msg_history=[], logging=safe_log)
 
 def main():
@@ -178,8 +219,19 @@ def main():
     parser.add_argument('--test_description', default=None, required=False, help='Description of how to test the repository')
     parser.add_argument('--self_improve', default=False, action='store_true', help='Whether to self-improve the repository or solving swe')
     parser.add_argument('--instance_id', default=None, help='Instance ID for SWE issue')
+
+    # Arguments for fusion task
+    parser.add_argument("--is_fusion_task", default=False, action="store_true", help="Indicates if the task is a fusion of two parents.")
+    parser.add_argument("--parent1_patch_file", type=str, default=None, help="Path to the diff file for Parent 1 (changes from base to P1). Required if is_fusion_task is True.")
+    parser.add_argument("--parent2_patch_file", type=str, default=None, help="Path to the diff file for Parent 2 (changes from base to P2). Required if is_fusion_task is True.")
+    parser.add_argument("--parent1_commit_id", type=str, default="Parent1", help="Commit ID for Parent 1 (for prompt context).")
+    parser.add_argument("--parent2_commit_id", type=str, default="Parent2", help="Commit ID for Parent 2 (for prompt context).")
+
     args = parser.parse_args()
 
+    if args.is_fusion_task and (not args.parent1_patch_file or not args.parent2_patch_file):
+        parser.error("--parent1_patch_file and --parent2_patch_file are required when --is_fusion_task is True.")
+
     # Process the repository
     agentic_system = AgenticSystem(
         problem_statement=args.problem_statement,
@@ -189,6 +241,12 @@ def main():
         test_description=args.test_description,
         self_improve=args.self_improve,
         instance_id=args.instance_id,
+        # Fusion arguments
+        is_fusion_task=args.is_fusion_task,
+        parent1_patch_file=args.parent1_patch_file,
+        parent2_patch_file=args.parent2_patch_file,
+        parent1_commit_id=args.parent1_commit_id,
+        parent2_commit_id=args.parent2_commit_id
     )
 
     # Run the agentic system to try to solve the problem
diff --git a/prompts/fusion_prompt.py b/prompts/fusion_prompt.py
new file mode 100644
index 0000000..d13cb32
--- /dev/null
+++ b/prompts/fusion_prompt.py
@@ -0,0 +1,49 @@
+def get_fusion_prompt(
+    base_commit_id: str,
+    parent1_commit_id: str,
+    parent1_patch_content: str,
+    parent2_commit_id: str,
+    parent2_patch_content: str,
+    existing_problem_statement: str = None
+) -> str:
+    """
+    Generates a prompt for the LLM to perform a fusion of two parent diffs.
+    """
+
+    prompt = f"""You are an expert AI programmer. Your task is to perform a 'fusion' of code changes from two parent versions (Parent1 and Parent2) which both evolved from a common `base_commit`. The goal is to create a new child version that intelligently combines the beneficial features and improvements from both parents, applied to the `base_commit`.
+
+The codebase is currently in the state of `base_commit` ('{base_commit_id}').
+
+Here are the changes from `base_commit` to Parent1 ('{parent1_commit_id}'):
+```diff
+{parent1_patch_content}
+```
+
+Here are the changes from `base_commit` to Parent2 ('{parent2_commit_id}'):
+```diff
+{parent2_patch_content}
+```
+
+Your instructions for fusion are as follows:
+1. **Analyze Both Diffs**: Carefully study both diffs to understand the specific changes, enhancements, and fixes each parent introduced relative to the `base_commit`.
+2. **Primary Objective - Merge**: Your main goal is to apply the changes from *both* Parent1 and Parent2 onto the `base_commit` state. Try to incorporate the intent of both sets of changes.
+3. **Conflict Resolution**:
+    * If Parent1 and Parent2 modify the exact same lines of code in conflicting ways (e.g., changing a line to two different things), **prioritize the changes from Parent1.**
+    * If Parent1 deletes lines that Parent2 modifies, **the deletion by Parent1 takes precedence.**
+    * If Parent2 deletes lines that Parent1 modifies, **the deletion by Parent2 takes precedence.**
+    * If both parents delete the same set of lines, this is not a conflict; the lines are simply deleted.
+    * If one parent modifies a section of code and the other parent deletes the entire section, **the deletion takes precedence.**
+4. **Redundancy**: If both parents introduce identical changes (e.g., adding the exact same lines of code at the same location, or making the exact same modification), include these changes only once in the final merged code.
+5. **Coherence and Correctness**: Ensure the resulting code is coherent, syntactically correct, and logically sound. The combined changes should work together harmoniously.
+6. **Focus on Provided Diffs**: Do not introduce entirely new features, refactorings, or code changes that are not suggested by the content of Parent1's diff or Parent2's diff. Your task is to combine the given changes, not to add your own unrelated improvements.
+"""
+
+    if existing_problem_statement:
+        prompt += f"\n7. **Original Problem Context**: For your awareness, the original problem statement this lineage was trying to solve was: \"{existing_problem_statement}\". While fusing, ensure the fused result is still relevant to this problem, but prioritize the fusion instructions above.\n"
+    else:
+        prompt += "\n"
+
+    prompt += """
+Please use the available file editing tools to apply the combined and resolved changes directly to the codebase. Remember, the codebase is currently in the state of `base_commit`. Your edits will be captured as a new patch file representing the fused child.
+"""
+    return prompt
diff --git a/self_improve_step.py b/self_improve_step.py
index c8e41b5..dbdfcc0 100644
--- a/self_improve_step.py
+++ b/self_improve_step.py
@@ -254,7 +254,25 @@ def self_improve(
     output_dir = os.path.join(root_dir, f"{output_dir}/{run_id}/")
     os.makedirs(output_dir, exist_ok=True)
     metadata['run_id'] = run_id
-    metadata['parent_commit'] = parent_commit
+
+    is_fusion = isinstance(parent_commit, tuple)
+    base_commit = 'initial'  # Hardcoded for now as per requirements
+
+    if is_fusion:
+        if entry != "fuse_parents":
+            # This case should ideally not happen if DGM_outer.py is correct
+            safe_log("Warning: parent_commit is a tuple, but entry is not 'fuse_parents'. Proceeding as fusion.")
+        parent1_commit, parent2_commit = parent_commit
+        metadata['parent_commits'] = [parent1_commit, parent2_commit]
+        metadata['base_commit'] = base_commit
+        metadata['parent_commit'] = None  # Nullify to avoid confusion
+        primary_parent_commit_for_patches = base_commit  # Patches for the agent are taken from base_commit
+        safe_log(f"Fusion mode: Parent1={parent1_commit}, Parent2={parent2_commit}, Base={base_commit}")
+    else:
+        metadata['parent_commit'] = parent_commit
+        primary_parent_commit_for_patches = parent_commit  # Patches from this parent
+        safe_log(f"Single parent mode: Parent={parent_commit}")
+
     test_task_list_big = load_json_file("./swe_bench/subsets/big.json")
 
     # Set up logger
@@ -290,9 +308,24 @@ def self_improve(
         exec_result = container.exec_run("rm /dgm/coding_agent_polyglot.py", workdir='/')
 
     # Find all parent patches and apply them
-    patch_files = get_model_patch_paths(root_dir, os.path.join(output_dir, '../'), parent_commit)
+    # If fusion, the agent works from base_commit, so no prior patches are applied to bring it to parent1 or parent2.
+    # The agent will be *aware* of parent1's and parent2's patches.
+    # If not fusion, patches are applied to bring the container to parent_commit's state.
+    if is_fusion:
+        patch_files = []  # Container starts from the base_commit state for the fusion agent
+        # However, we need to know the patches for Parent1 and Parent2 so the agent can see them.
+        # get_model_patch_paths always traces back to 'initial', which is our current base_commit.
+        patch_files_parent1 = get_model_patch_paths(root_dir, os.path.join(output_dir, '../'), parent1_commit)
+        patch_files_parent2 = get_model_patch_paths(root_dir, os.path.join(output_dir, '../'), parent2_commit)
+        metadata['patch_files_parent1'] = patch_files_parent1
+        metadata['patch_files_parent2'] = patch_files_parent2
+        safe_log(f"Patch files for Parent1 (from {base_commit}): {patch_files_parent1}")
+        safe_log(f"Patch files for Parent2 (from {base_commit}): {patch_files_parent2}")
+    else:
+        patch_files = get_model_patch_paths(root_dir, os.path.join(output_dir, '../'), primary_parent_commit_for_patches)
+
     if run_baseline not in ['no_selfimprove']:
-        for patch_file in patch_files:
+        for patch_file in patch_files:  # These are only applied when not doing fusion.
             copy_to_container(container, patch_file, '/dgm/parent_patch.txt')
             exec_result = container.exec_run("/bin/sh -c 'patch -p1 < /dgm/parent_patch.txt'", workdir='/dgm')
             log_container_output(exec_result)
@@ -300,6 +333,8 @@ def self_improve(
             log_container_output(exec_result)
 
     # Commit this version of dgm, so that irrelevant changes are not included in the patch
+    # This commit represents the state the agent will start working from.
+    # For fusion, this is base_commit. For a single parent, this is parent_commit.
     exec_result = container.exec_run("git add --all", workdir='/dgm/')
     log_container_output(exec_result)
     exec_result = container.exec_run("git -c user.name='user' -c user.email='you@example.com' commit -m 'a nonsense commit message'", workdir='/dgm/')
@@ -313,9 +348,20 @@ def self_improve(
     log_container_output(exec_result)
 
     # Get tasks to improve
-    if entry:
+    if is_fusion and entry == "fuse_parents":
+        problem_statement = (
+            f"Fuse features from Parent1 ({parent1_commit}) and Parent2 ({parent2_commit}) "
+            f"based on the common ancestor ({base_commit})."
+        )
+        metadata['problem_statement_type'] = "fusion_predefined"
+        safe_log(f"Fusion task problem statement: {problem_statement}")
+    elif entry:
         safe_log(f"Task to improve: {entry}")
-        problem_statement = diagnose_problem(entry, parent_commit, root_dir, out_dir_base, patch_files=patch_files, polyglot=polyglot)
+        # For diagnose_problem, use the actual single parent commit if not fusion,
+        # or parent1 if fusion (though the problem statement is predefined for fusion, so this call might be skipped or adapted).
+        diagnose_parent_ref = parent1_commit if is_fusion else parent_commit
+        problem_statement = diagnose_problem(entry, diagnose_parent_ref, root_dir, out_dir_base, patch_files=patch_files, polyglot=polyglot)
+        metadata['problem_statement_type'] = "diagnosed"
         safe_log(f"problem_statement: {problem_statement}")
     else:
         safe_log("No entry provided. Exiting.")
@@ -333,58 +379,156 @@ def self_improve(
         return metadata
 
     # Run self-improvement
-    safe_log("Running self-improvement")
-    chat_history_file_container = "/dgm/self_evo.md"
-    test_description = get_test_description(swerepo=False)
-    env_vars = {
-        "ANTHROPIC_API_KEY": os.getenv('ANTHROPIC_API_KEY'),
-        "AWS_REGION": os.getenv('AWS_REGION'),
-        "AWS_REGION_NAME": os.getenv('AWS_REGION_NAME'),
-        "AWS_ACCESS_KEY_ID": os.getenv('AWS_ACCESS_KEY_ID'),
-        "AWS_SECRET_ACCESS_KEY": os.getenv('AWS_SECRET_ACCESS_KEY'),
-        "OPENAI_API_KEY": os.getenv('OPENAI_API_KEY'),
-    }
-    cmd = [
-        "timeout", "1800",  # 30min timeout
-        "python", "/dgm/coding_agent.py",
-        "--problem_statement", problem_statement,
-        "--git_dir", "/dgm/",
-        "--chat_history_file", chat_history_file_container,
-        "--base_commit", commit_hash,
-        "--outdir", "/dgm/",
-        "--test_description", test_description,
-        "--self_improve",
-    ]
-    exec_result = container.exec_run(cmd, environment=env_vars, workdir='/')
-    log_container_output(exec_result)
+    model_patch_file = os.path.join(output_dir, "model_patch.diff")  # Define early for cleanup
+    model_patch_generated = False
+
+    if is_fusion and entry == "fuse_parents":
+        safe_log(f"Fusion mode: Preparing to call coding_agent.py for fusion. Base commit in container: {commit_hash}")
+
+        # 1. Create concatenated diff files for parent1 and parent2
+        host_p1_diff_path = os.path.join(output_dir, "parent1_for_fusion.diff")
+        p1_content = []
+        for p_file in metadata.get('patch_files_parent1', []):
+            try:
+                with open(p_file, 'r') as f:
+                    p1_content.append(f.read())
+            except FileNotFoundError:
+                safe_log(f"Warning: Patch file {p_file} for parent1 not found. Skipping.")
+        with open(host_p1_diff_path, 'w') as f:
+            f.write("\n".join(p1_content))
+
+        host_p2_diff_path = os.path.join(output_dir, "parent2_for_fusion.diff")
+        p2_content = []
+        for p_file in metadata.get('patch_files_parent2', []):
+            try:
+                with open(p_file, 'r') as f:
+                    p2_content.append(f.read())
+            except FileNotFoundError:
+                safe_log(f"Warning: Patch file {p_file} for parent2 not found. Skipping.")
+        with open(host_p2_diff_path, 'w') as f:
+            f.write("\n".join(p2_content))
+
+        # 2. Define container paths and copy these diff files to the container
+        container_p1_diff_path = "/tmp/parent1_for_fusion.diff"
+        container_p2_diff_path = "/tmp/parent2_for_fusion.diff"
+        copy_to_container(container, host_p1_diff_path, container_p1_diff_path)
+        copy_to_container(container, host_p2_diff_path, container_p2_diff_path)
+
+        # 3. Construct the command for coding_agent.py
+        chat_history_file_container = "/dgm/self_evo_fusion.md"  # Specific to fusion
+        env_vars = {
+            "ANTHROPIC_API_KEY": os.getenv('ANTHROPIC_API_KEY'),
+            "AWS_REGION": os.getenv('AWS_REGION'),
+            "AWS_REGION_NAME": os.getenv('AWS_REGION_NAME'),
+            "AWS_ACCESS_KEY_ID": os.getenv('AWS_ACCESS_KEY_ID'),
+            "AWS_SECRET_ACCESS_KEY": os.getenv('AWS_SECRET_ACCESS_KEY'),
+            "OPENAI_API_KEY": os.getenv('OPENAI_API_KEY'),
+        }
+        cmd = [
+            "timeout", "3600",  # Increased timeout for a potentially complex fusion task (1 hr)
+            "python", "/dgm/coding_agent.py",
+            "--problem_statement", problem_statement,  # Predefined fusion problem statement
+            "--git_dir", "/dgm/",
+            "--chat_history_file", chat_history_file_container,
+            "--base_commit", commit_hash,  # SHA of the 'initial' state in the container
+            "--outdir", "/dgm/",
+            # Fusion specific arguments
+            "--is_fusion_task",
+            "--parent1_commit_id", parent1_commit,
+            "--parent2_commit_id", parent2_commit,
+            "--parent1_patch_file", container_p1_diff_path,
+            "--parent2_patch_file", container_p2_diff_path,
+            # --test_description may not be directly relevant to the fusion prompt, but it is passed anyway
+            "--test_description", get_test_description(swerepo=False),
+            "--self_improve"  # Keep the self_improve flag in case it influences agent behavior generally
+        ]
+
+        safe_log(f"Executing fusion command: {' '.join(cmd)}")
+        exec_result = container.exec_run(cmd, environment=env_vars, workdir='/')
+        log_container_output(exec_result)
 
-    # Copy output files back to host
-    chat_history_file = os.path.join(output_dir, "self_evo.md")
-    copy_from_container(container, chat_history_file_container, chat_history_file)
-    model_patch_file = os.path.join(output_dir, "model_patch.diff")
-    copy_from_container(container, "/dgm/model_patch.diff", model_patch_file)
+        # Copy output files (model_patch.diff, chat history) back to the host
+        chat_history_file = os.path.join(output_dir, "self_evo_fusion.md")
+        copy_from_container(container, chat_history_file_container, chat_history_file)
+        copy_from_container(container, "/dgm/model_patch.diff", model_patch_file)
+        model_patch_generated = True
+
+    elif problem_statement:  # Proceed only if we have a problem statement (and not fusion)
+        safe_log("Running self-improvement (single parent mode)")
+        chat_history_file_container = "/dgm/self_evo.md"  # Standard chat history file
+        test_description = get_test_description(swerepo=False)
+        env_vars = {
+            "ANTHROPIC_API_KEY": os.getenv('ANTHROPIC_API_KEY'),
+            "AWS_REGION": os.getenv('AWS_REGION'),
+            "AWS_REGION_NAME": os.getenv('AWS_REGION_NAME'),
+            "AWS_ACCESS_KEY_ID": os.getenv('AWS_ACCESS_KEY_ID'),
+            "AWS_SECRET_ACCESS_KEY": os.getenv('AWS_SECRET_ACCESS_KEY'),
+            "OPENAI_API_KEY": os.getenv('OPENAI_API_KEY'),
+        }
+        # TODO: For fusion, coding_agent.py will need different/additional arguments
+        # like parent1_commit, parent2_commit, base_commit, and paths to their respective patches/code.
+        cmd = [
+            "timeout", "1800",  # 30min timeout
+            "python", "/dgm/coding_agent.py",
+            "--problem_statement", problem_statement,
+            "--git_dir", "/dgm/",
+            "--chat_history_file", chat_history_file_container,
+            "--base_commit", commit_hash,  # This is the hash of the (base_commit + patch_files) state
+            "--outdir", "/dgm/",
+            "--test_description", test_description,
+            "--self_improve",
+        ]
+        exec_result = container.exec_run(cmd, environment=env_vars, workdir='/')
+        log_container_output(exec_result)
 
-    # Try reading the patch file to validate it
-    try:
-        # Check if patch file exists and is not empty
-        if not os.path.exists(model_patch_file):
-            raise Exception("Model patch file is empty or does not exist")
-        with open(model_patch_file, 'r') as f:
-            patch_content = f.read()
-        if not patch_content.strip():
-            raise Exception("Model patch file is empty")
-    except Exception as e:
-        safe_log(f"Failed to read model patch file: {str(e)}")
-        save_metadata(metadata, output_dir)
-        return metadata
+        # Copy output files back to the host
+        chat_history_file = os.path.join(output_dir, "self_evo.md")
+        copy_from_container(container, chat_history_file_container, chat_history_file)
+        copy_from_container(container, "/dgm/model_patch.diff", model_patch_file)
+        model_patch_generated = True
+    else:
+        safe_log("Skipping self-improvement agent call due to missing problem statement or other reasons.")
+
+    if model_patch_generated:
+        try:
+            # Check if the patch file exists and is not empty
+            if not os.path.exists(model_patch_file):
+                raise Exception("Model patch file is empty or does not exist")
+            with open(model_patch_file, 'r') as f:
+                patch_content = f.read()
+            if not patch_content.strip():
+                raise Exception("Model patch file is empty")
+            patch_files.append(model_patch_file)  # Add the successfully generated patch to the list for evaluation
+        except Exception as e:
+            safe_log(f"Failed to read or validate model patch file: {str(e)}")
+            model_patch_generated = False  # Mark as not generated if the checks fail
+            # Do not return yet; proceed to cleanup and save metadata
+
+    # Patch files for the evaluation harness:
+    # For fusion, it's just the new model_patch.diff (applied to 'initial').
+    # For non-fusion, it's the chain of parent patches + the new model_patch.diff.
+    eval_patch_files = []
+    if is_fusion:
+        if model_patch_generated and os.path.exists(model_patch_file) and os.path.getsize(model_patch_file) > 0:
+            eval_patch_files = [model_patch_file]
+            safe_log(f"Fusion successful: using {model_patch_file} for evaluation.")
+        else:
+            safe_log("Fusion generated an empty or missing patch. Evaluation will run on the base_commit state.")
+            # eval_patch_files remains empty, so the harness runs on 'initial'
+    else:  # Non-fusion
+        eval_patch_files = patch_files  # These are the patches leading up to the current parent
+        if model_patch_generated and os.path.exists(model_patch_file) and os.path.getsize(model_patch_file) > 0:
+            eval_patch_files.append(model_patch_file)
+            safe_log(f"Self-improve (single parent) successful: using {model_patch_file} on top of parent patches for evaluation.")
+        else:
+            safe_log("Self-improve (single parent) generated an empty or missing patch. Evaluation will run on the parent state.")
 
-    patch_files.append(model_patch_file)
     # Stop and remove the container
     cleanup_container(container)
 
     # Evaluate the performance of the self-improvement
-    model_patch_exists = os.path.exists(model_patch_file)
+    model_patch_exists = os.path.exists(model_patch_file) and model_patch_generated
     metadata['model_patch_exists'] = model_patch_exists
     model_patch_notempty = os.path.getsize(model_patch_file) > 0
     metadata['model_patch_notempty'] = model_patch_notempty
@@ -401,16 +545,29 @@
     # Post-self-improvement diagnosis
     if post_improve_diagnose:
         safe_log("Diagnosing the self-improvement")
-        metadata['is_compiled'] = is_compiled_self_improve(metadata)
-        if metadata['is_compiled']:
-            safe_log("The self-improvement succeed to be complied")
+        # is_compiled_self_improve uses metadata['overall_performance'], which is populated by run_harness_*.
+        # This check should occur *after* evaluation.
+        # metadata['is_compiled'] = is_compiled_self_improve(metadata)  # This seems premature here.
+
+        # For diagnose_improvement, use parent1_commit if fusion, else the original parent_commit
+        diagnose_parent_ref_for_improvement = parent1_commit if is_fusion else parent_commit
+
+        # Only diagnose if a patch was generated and exists, and if it compiled (checked after eval)
+        if model_patch_exists:
+            # We'll update metadata['is_compiled'] after evaluation, at which point this diagnosis can be more robust.
+            # For now, we proceed assuming that if the patch exists, it might be diagnosable.
+            # The actual diagnosis might need to happen after evaluation results are in metadata.
+            # The `patch_files` argument to diagnose_improvement should be `eval_patch_files`.
             improvement_diagnosis = diagnose_improvement(
-                entry, parent_commit, root_dir,
-                model_patch_file, out_dir_base, run_id,
-                patch_files=patch_files,
+                entry, diagnose_parent_ref_for_improvement, root_dir,
+                model_patch_file, out_dir_base, run_id,  # model_patch_file is the one generated by the agent
+                patch_files=eval_patch_files,  # These are the patches applied for the eval run
             )
             metadata['improvement_diagnosis'] = improvement_diagnosis
             safe_log(f"Improvement diagnosis: {improvement_diagnosis}")
+        else:
+            safe_log("Skipping improvement diagnosis as no valid model patch was generated.")
+            metadata['improvement_diagnosis'] = "No model patch generated."
         else:
             safe_log("The self-improvement fail to be complied")
             metadata['improvement_diagnosis'] = "Fail to complied. Ignore this."
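
Usage note (illustrative, not part of the patch): the sketch below shows how the new pieces are expected to fit together — `choose_selfimproves` now emits `((parent1, parent2), "fuse_parents")` tuples with probability `fusion_probability`, and `get_fusion_prompt` turns the two parent diffs into the fusion instruction for the coding agent. Only the signatures come from the diff above; the commit IDs and diff-file paths are placeholders, and the concatenated parent diffs are assumed to have been written out as `self_improve_step.py` does.

```python
# Minimal sketch, assuming the two concatenated parent diffs already exist on disk
# (as self_improve_step.py writes parent1_for_fusion.diff / parent2_for_fusion.diff).
from prompts.fusion_prompt import get_fusion_prompt

entry_tuple = (("abc123", "def456"), "fuse_parents")  # placeholder, as emitted by choose_selfimproves
(parent1, parent2), entry = entry_tuple

if entry == "fuse_parents":
    with open("parent1_for_fusion.diff") as f1, open("parent2_for_fusion.diff") as f2:
        prompt = get_fusion_prompt(
            base_commit_id="initial",          # the hard-coded common ancestor
            parent1_commit_id=parent1,
            parent1_patch_content=f1.read(),
            parent2_commit_id=parent2,
            parent2_patch_content=f2.read(),
            existing_problem_statement=None,   # optional extra context
        )
    print(prompt[:200])  # this instruction is what the fusion coding agent receives
```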