diff --git a/.github/workflows/sync-docs-cn-to-en.yml b/.github/workflows/sync-docs-cn-to-en.yml new file mode 100644 index 0000000000000..5fe0aa9e3913b --- /dev/null +++ b/.github/workflows/sync-docs-cn-to-en.yml @@ -0,0 +1,134 @@ +name: Sync Docs Changes from ZH PR to EN PR + +on: + workflow_dispatch: + inputs: + source_pr_url: + description: 'Source PR URL (Chinese docs repository)' + required: true + type: string + default: '' + target_pr_url: + description: 'Target PR URL (English docs repository)' + required: true + type: string + default: '' + ai_provider: + description: 'AI Provider to use for translation' + required: false + type: choice + options: + - deepseek + - gemini + default: 'gemini' + +jobs: + sync-docs: + runs-on: ubuntu-latest + + steps: + - name: Checkout current repository + uses: actions/checkout@v4 + with: + token: ${{ secrets.GITHUB_TOKEN }} + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r scripts/translate_doc_pr/requirements.txt + + - name: Extract PR information + id: extract_info + run: | + # Extract source repo info + SOURCE_URL="${{ github.event.inputs.source_pr_url }}" + SOURCE_OWNER=$(echo $SOURCE_URL | cut -d'/' -f4) + SOURCE_REPO=$(echo $SOURCE_URL | cut -d'/' -f5) + SOURCE_PR=$(echo $SOURCE_URL | cut -d'/' -f7) + + # Extract target repo info + TARGET_URL="${{ github.event.inputs.target_pr_url }}" + TARGET_OWNER=$(echo $TARGET_URL | cut -d'/' -f4) + TARGET_REPO=$(echo $TARGET_URL | cut -d'/' -f5) + TARGET_PR=$(echo $TARGET_URL | cut -d'/' -f7) + + echo "source_owner=${SOURCE_OWNER}" >> $GITHUB_OUTPUT + echo "source_repo=${SOURCE_REPO}" >> $GITHUB_OUTPUT + echo "source_pr=${SOURCE_PR}" >> $GITHUB_OUTPUT + echo "target_owner=${TARGET_OWNER}" >> $GITHUB_OUTPUT + echo "target_repo=${TARGET_REPO}" >> $GITHUB_OUTPUT + echo "target_pr=${TARGET_PR}" >> $GITHUB_OUTPUT + + echo "Source: ${SOURCE_OWNER}/${SOURCE_REPO}#${SOURCE_PR}" + echo "Target: ${TARGET_OWNER}/${TARGET_REPO}#${TARGET_PR}" + + - name: Get target PR branch info + id: target_branch + run: | + # Get target PR branch name + TARGET_BRANCH=$(curl -s \ + -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ + -H "Accept: application/vnd.github.v3+json" \ + "https://api.github.com/repos/${{ steps.extract_info.outputs.target_owner }}/${{ steps.extract_info.outputs.target_repo }}/pulls/${{ steps.extract_info.outputs.target_pr }}" \ + | jq -r '.head.ref') + + echo "target_branch=${TARGET_BRANCH}" >> $GITHUB_OUTPUT + echo "Target branch: ${TARGET_BRANCH}" + + - name: Clone target repository + run: | + # Clone target repository with the PR branch + git clone https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ steps.extract_info.outputs.target_owner }}/${{ steps.extract_info.outputs.target_repo }}.git target_repo + cd target_repo + git checkout ${{ steps.target_branch.outputs.target_branch }} + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + - name: Run sync script + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + DEEPSEEK_API_TOKEN: ${{ secrets.DEEPSEEK_API_TOKEN }} + GEMINI_API_TOKEN: ${{ secrets.GEMINI_API_TOKEN }} + SOURCE_PR_URL: ${{ github.event.inputs.source_pr_url }} + TARGET_PR_URL: ${{ github.event.inputs.target_pr_url }} + AI_PROVIDER: ${{ github.event.inputs.ai_provider }} + TARGET_REPO_PATH: ${{ github.workspace }}/target_repo + run: | + cd 
scripts/translate_doc_pr + python main_workflow.py + + - name: Commit and push changes + run: | + cd target_repo + git add . + if git diff --staged --quiet; then + echo "No changes to commit" + else + git commit -m "Auto-sync: Update English docs from Chinese PR ${{ github.event.inputs.source_pr_url }} + + Synced from: ${{ github.event.inputs.source_pr_url }} + Target PR: ${{ github.event.inputs.target_pr_url }} + AI Provider: ${{ github.event.inputs.ai_provider }} + + Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>" + + git push origin ${{ steps.target_branch.outputs.target_branch }} + echo "Changes pushed to target PR branch: ${{ steps.target_branch.outputs.target_branch }}" + fi + + - name: Add comment to target PR + run: | + # Add a comment to the target PR about the sync + curl -X POST \ + -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ + -H "Accept: application/vnd.github.v3+json" \ + "https://api.github.com/repos/${{ steps.extract_info.outputs.target_owner }}/${{ steps.extract_info.outputs.target_repo }}/issues/${{ steps.extract_info.outputs.target_pr }}/comments" \ + -d "{ + \"body\": \"šŸ¤– **Auto-sync completed**\\n\\nšŸ“„ **Source PR**: ${{ github.event.inputs.source_pr_url }}\\nšŸŽÆ **Target PR**: ${{ github.event.inputs.target_pr_url }}\\nāœ… English documentation has been updated based on Chinese documentation changes.\\n\\n_This comment was generated automatically by the sync workflow._\" + }" diff --git a/scripts/translate_doc_pr/__init__.py b/scripts/translate_doc_pr/__init__.py new file mode 100644 index 0000000000000..b272696e2e394 --- /dev/null +++ b/scripts/translate_doc_pr/__init__.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 +""" +Auto-Sync PR Changes - Refactored Modular Version + +This package contains the refactored version of the auto-sync-pr-changes script, +split into logical modules for better maintainability and testing.
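+ +Typical usage (a sketch; assumes this directory is the working directory, as in the sync +workflow above, or that the package is importable from the repository root): run +`python main_workflow.py`, or use the package-level `main` entry point re-exported below.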
+ +Modules: +- pr_analyzer: PR analysis, diff parsing, content getting, hierarchy building +- section_matcher: Section matching (direct matching + AI matching) +- file_adder: New file processing and translation +- file_deleter: Deleted file processing +- file_updater: Updated file processing and translation +- toc_processor: TOC file special processing +- main: Main orchestration function +""" + +# Import main functionality for easy access (relative import so the package works when imported) +from .main import main + +# Make main function available at package level +__all__ = ["main"] diff --git a/scripts/translate_doc_pr/file_adder.py b/scripts/translate_doc_pr/file_adder.py new file mode 100644 index 0000000000000..57e93b2fb1c63 --- --- /dev/null +++ b/scripts/translate_doc_pr/file_adder.py @@ -0,0 +1,193 @@ +""" +File Adder Module +Handles processing and translation of newly added files +""" + +import os +import re +import json +import threading +from github import Github +from openai import OpenAI + +# Thread-safe printing +print_lock = threading.Lock() + +def thread_safe_print(*args, **kwargs): + with print_lock: + print(*args, **kwargs) + +def create_section_batches(file_content, max_lines_per_batch=200): + """Create batches of file content for translation, respecting section boundaries""" + lines = file_content.split('\n') + + # Find all section headers + section_starts = [] + for i, line in enumerate(lines): + line = line.strip() + if line.startswith('#'): + match = re.match(r'^(#{1,10})\s+(.+)', line) + if match: + section_starts.append(i + 1) # 1-based line numbers + + # If no sections found, just batch by line count + if not section_starts: + batches = [] + for i in range(0, len(lines), max_lines_per_batch): + batch_lines = lines[i:i + max_lines_per_batch] + batches.append('\n'.join(batch_lines)) + return batches + + # Create batches respecting section boundaries + batches = [] + current_batch_start = 0 + + for i, section_start in enumerate(section_starts): + section_start_idx = section_start - 1 # Convert to 0-based + + # Check if adding this section would exceed the line limit + if (section_start_idx - current_batch_start) > max_lines_per_batch: + # Close current batch at the previous section boundary + if current_batch_start < section_start_idx: + batch_lines = lines[current_batch_start:section_start_idx] + batches.append('\n'.join(batch_lines)) + current_batch_start = section_start_idx + + # If this is the last section, or the next section would create a batch too large + if i == len(section_starts) - 1: + # Add remaining content as final batch + batch_lines = lines[current_batch_start:] + batches.append('\n'.join(batch_lines)) + else: + next_section_start = section_starts[i + 1] - 1 # 0-based + if (next_section_start - current_batch_start) > max_lines_per_batch: + # Close current batch at current section boundary + batch_lines = lines[current_batch_start:section_start_idx] + if batch_lines: # Only add non-empty batches + batches.append('\n'.join(batch_lines)) + current_batch_start = section_start_idx + + # Clean up any empty batches + batches = [batch for batch in batches if batch.strip()] + + return batches + +def translate_file_batch(batch_content, ai_client, source_language="English", target_language="Chinese"): + """Translate a single batch of file content using AI""" + if not batch_content.strip(): + return batch_content + + thread_safe_print(f" šŸ¤– Translating batch ({len(batch_content.split())} words)...") + + prompt = f"""You are a professional technical writer.
Please translate the following {source_language} content to {target_language}. + +IMPORTANT INSTRUCTIONS: +1. Preserve ALL Markdown formatting (headers, links, code blocks, tables, etc.) +2. Do NOT translate: + - Code examples, SQL queries, configuration values + - Technical terms like "TiDB", "TiKV", "PD", API names, etc. + - File paths, URLs, and command line examples + - Variable names and system configuration parameters +3. Translate only the descriptive text and explanations +4. Maintain the exact structure and indentation +5. Keep all special characters and formatting intact + +Content to translate: +{batch_content} + +Please provide the translated content maintaining all formatting and structure.""" + + # Add token estimation + try: + from main import print_token_estimation + print_token_estimation(prompt, "File addition translation") + except ImportError: + # Fallback if import fails - use tiktoken + try: + import tiktoken + enc = tiktoken.get_encoding("cl100k_base") + tokens = enc.encode(prompt) + actual_tokens = len(tokens) + char_count = len(prompt) + print(f" šŸ’° File addition translation") + print(f" šŸ“ Input: {char_count:,} characters") + print(f" šŸ”¢ Actual tokens: {actual_tokens:,} (using tiktoken cl100k_base)") + except Exception: + # Final fallback to character approximation + estimated_tokens = len(prompt) // 4 + char_count = len(prompt) + print(f" šŸ’° File addition translation") + print(f" šŸ“ Input: {char_count:,} characters") + print(f" šŸ”¢ Estimated tokens: ~{estimated_tokens:,} (fallback: 4 chars/token approximation)") + + try: + translated_content = ai_client.chat_completion( + messages=[{"role": "user", "content": prompt}], + temperature=0.1 + ) + thread_safe_print(f" āœ… Batch translation completed") + return translated_content + + except Exception as e: + thread_safe_print(f" āŒ Batch translation failed: {e}") + return batch_content # Return original content if translation fails + +def process_added_files(added_files, pr_url, github_client, ai_client, repo_config): + """Process newly added files by translating and creating them in target repository""" + if not added_files: + thread_safe_print("\nšŸ“„ No new files to process") + return + + thread_safe_print(f"\nšŸ“„ Processing {len(added_files)} newly added files...") + + target_local_path = repo_config['target_local_path'] + source_language = repo_config['source_language'] + target_language = repo_config['target_language'] + + for file_path, file_content in added_files.items(): + thread_safe_print(f"\nšŸ“ Processing new file: {file_path}") + + # Create target file path + target_file_path = os.path.join(target_local_path, file_path) + target_dir = os.path.dirname(target_file_path) + + # Create directory if it doesn't exist + if not os.path.exists(target_dir): + os.makedirs(target_dir, exist_ok=True) + thread_safe_print(f" šŸ“ Created directory: {target_dir}") + + # Check if file already exists + if os.path.exists(target_file_path): + thread_safe_print(f" āš ļø Target file already exists: {target_file_path}") + continue + + # Create section batches for translation + batches = create_section_batches(file_content, max_lines_per_batch=200) + thread_safe_print(f" šŸ“¦ Created {len(batches)} batches for translation") + + # Translate each batch + translated_batches = [] + for i, batch in enumerate(batches): + thread_safe_print(f" šŸ”„ Processing batch {i+1}/{len(batches)}") + translated_batch = translate_file_batch( + batch, + ai_client, + source_language, + target_language + ) + 
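# Note: translate_file_batch returns the original batch content when a translation call fails + # (see its except handler above), so the join below never drops a batch. +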
translated_batches.append(translated_batch) + + # Combine translated batches + translated_content = '\n'.join(translated_batches) + + # Write translated content to target file + try: + with open(target_file_path, 'w', encoding='utf-8') as f: + f.write(translated_content) + + thread_safe_print(f" āœ… Created translated file: {target_file_path}") + + except Exception as e: + thread_safe_print(f" āŒ Error creating file {target_file_path}: {e}") + + thread_safe_print(f"\nāœ… Completed processing all new files") diff --git a/scripts/translate_doc_pr/file_deleter.py b/scripts/translate_doc_pr/file_deleter.py new file mode 100644 index 0000000000000..c2064fe568cf3 --- /dev/null +++ b/scripts/translate_doc_pr/file_deleter.py @@ -0,0 +1,45 @@ +""" +File Deleter Module +Handles processing of deleted files and deleted sections +""" + +import os +import threading +from github import Github + +# Thread-safe printing +print_lock = threading.Lock() + +def thread_safe_print(*args, **kwargs): + with print_lock: + print(*args, **kwargs) + +def process_deleted_files(deleted_files, github_client, repo_config): + """Process deleted files by removing them from target repository""" + if not deleted_files: + thread_safe_print("\nšŸ—‘ļø No files to delete") + return + + thread_safe_print(f"\nšŸ—‘ļø Processing {len(deleted_files)} deleted files...") + + target_local_path = repo_config['target_local_path'] + + for file_path in deleted_files: + thread_safe_print(f"\nšŸ—‘ļø Processing deleted file: {file_path}") + + # Create target file path + target_file_path = os.path.join(target_local_path, file_path) + + # Check if file exists in target + if os.path.exists(target_file_path): + try: + os.remove(target_file_path) + thread_safe_print(f" āœ… Deleted file: {target_file_path}") + except Exception as e: + thread_safe_print(f" āŒ Error deleting file {target_file_path}: {e}") + else: + thread_safe_print(f" āš ļø Target file not found: {target_file_path}") + + thread_safe_print(f"\nāœ… Completed processing deleted files") + +# Section deletion logic moved to file_updater.py diff --git a/scripts/translate_doc_pr/file_updater.py b/scripts/translate_doc_pr/file_updater.py new file mode 100644 index 0000000000000..82addd7cc6881 --- /dev/null +++ b/scripts/translate_doc_pr/file_updater.py @@ -0,0 +1,1692 @@ +""" +File Updater Module +Handles processing and translation of updated files and sections +""" + +import os +import re +import json +import threading +from concurrent.futures import ThreadPoolExecutor +from github import Github +from openai import OpenAI + +# Thread-safe printing +print_lock = threading.Lock() + +def thread_safe_print(*args, **kwargs): + with print_lock: + print(*args, **kwargs) + +def get_updated_sections_from_ai(pr_diff, target_sections, source_old_content_dict, ai_client, source_language, target_language, target_file_name=None): + """Use AI to update target sections based on source old content, PR diff, and target sections""" + if not source_old_content_dict or not target_sections: + return {} + + # Filter out deleted sections and prepare source sections from old content + source_sections = {} + for key, old_content in source_old_content_dict.items(): + # Skip deleted sections + if 'deleted' in key: + continue + + # Handle null values by using empty string + content = old_content if old_content is not None else "" + source_sections[key] = content + + # Keep the original order from match_source_diff_to_target.json (no sorting needed) + formatted_source_sections = json.dumps(source_sections, 
ensure_ascii=False, indent=2) + formatted_target_sections = json.dumps(target_sections, ensure_ascii=False, indent=2) + + thread_safe_print(f" šŸ“Š Source sections: {len(source_sections)} sections") + thread_safe_print(f" šŸ“Š Target sections: {len(target_sections)} sections") + + # Calculate total content size + total_source_chars = sum(len(str(content)) for content in source_sections.values()) + total_target_chars = sum(len(str(content)) for content in target_sections.values()) + thread_safe_print(f" šŸ“ Content size: Source={total_source_chars:,} chars, Target={total_target_chars:,} chars") + + thread_safe_print(f" šŸ¤– Getting AI translation for {len(source_sections)} sections...") + + diff_content = source_sections + + prompt = f"""You are a professional technical writer in the Database domain. I will provide you with: + +1. Source sections in {source_language}: +{formatted_source_sections} + +2. GitHub PR changes (Diff): +{pr_diff} + +3. Current target sections in {target_language}: +{formatted_target_sections} + +Task: Update the target sections in {target_language} according to the diff in {source_language}. + +Instructions: +1. Carefully analyze the PR diff to understand what changes were made (additions, deletions, modifications) +2. Find the corresponding positions in the {target_language} sections and make the same changes. Do not change any content that is not modified in the diff, especially the format. +3. Keep the JSON structure unchanged, only modify the section content +4. Ensure the updated {target_language} content is logically consistent with the {source_language} changes +5. Maintain proper technical writing style and terminology in {target_language}. If a sentence in the diff is unchanged in content but only reordered in {source_language}, reuse its existing translation in {target_language}. 
+ +Please return the complete updated JSON in the same format as target sections, without any additional explanatory text.""" + + # Save prompt to file for reference with target file prefix + target_file_prefix = "unknown" + if target_file_name: + # Use provided target file name + target_file_prefix = target_file_name.replace('/', '_').replace('.md', '') + elif target_sections: + # Try to extract filename from the first section key or content + first_key = next(iter(target_sections.keys()), "") + if "_" in first_key: + # If key contains underscore, it might have target file info + parts = first_key.split("_") + if len(parts) > 1: + target_file_prefix = parts[0] + + # Ensure temp_output directory exists + script_dir = os.path.dirname(os.path.abspath(__file__)) + temp_dir = os.path.join(script_dir, "temp_output") + os.makedirs(temp_dir, exist_ok=True) + + prompt_file = os.path.join(temp_dir, f"{target_file_prefix}_prompt-for-ai-translation.txt") + with open(prompt_file, 'w', encoding='utf-8') as f: + f.write(prompt) + + thread_safe_print(f"\nšŸ’¾ Prompt saved to {prompt_file}") + thread_safe_print(f"šŸ“ Prompt length: {len(prompt)} characters") + thread_safe_print(f"šŸ“Š Source sections: {len(source_sections)}") + thread_safe_print(f"šŸ“Š Target sections: {len(target_sections)}") + thread_safe_print(f"šŸ¤– Sending prompt to AI...") + + thread_safe_print(f"\n šŸ“¤ AI Update Prompt ({source_language} → {target_language}):") + thread_safe_print(f" " + "="*80) + thread_safe_print(f" Source Sections: {formatted_source_sections[:500]}...") + thread_safe_print(f" PR Diff (first 500 chars): {pr_diff[:500]}...") + thread_safe_print(f" Target Sections: {formatted_target_sections[:500]}...") + thread_safe_print(f" " + "="*80) + + try: + from main import print_token_estimation + print_token_estimation(prompt, f"Document translation ({source_language} → {target_language})") + except ImportError: + # Fallback if import fails - use tiktoken + try: + import tiktoken + enc = tiktoken.get_encoding("cl100k_base") + tokens = enc.encode(prompt) + actual_tokens = len(tokens) + char_count = len(prompt) + thread_safe_print(f" šŸ’° Document translation ({source_language} → {target_language})") + thread_safe_print(f" šŸ“ Input: {char_count:,} characters") + thread_safe_print(f" šŸ”¢ Actual tokens: {actual_tokens:,} (using tiktoken cl100k_base)") + except Exception: + # Final fallback to character approximation + estimated_tokens = len(prompt) // 4 + char_count = len(prompt) + thread_safe_print(f" šŸ’° Document translation ({source_language} → {target_language})") + thread_safe_print(f" šŸ“ Input: {char_count:,} characters") + thread_safe_print(f" šŸ”¢ Estimated tokens: ~{estimated_tokens:,} (fallback: 4 chars/token approximation)") + + try: + ai_response = ai_client.chat_completion( + messages=[{"role": "user", "content": prompt}], + temperature=0.1 + ) + thread_safe_print(f" šŸ“ AI translation response received") + thread_safe_print(f" šŸ“‹ AI response (first 500 chars): {ai_response[:500]}...") + + result = parse_updated_sections(ai_response) + thread_safe_print(f" šŸ“Š Parsed {len(result)} sections from AI response") + + # Save AI results to file with target file prefix + ai_results_file = os.path.join(temp_dir, f"{target_file_prefix}_updated_sections_from_ai.json") + with open(ai_results_file, 'w', encoding='utf-8') as f: + json.dump(result, f, ensure_ascii=False, indent=2) + + thread_safe_print(f" šŸ’¾ AI results saved to {ai_results_file}") + return result + + except Exception as e: + thread_safe_print(f" 
āŒ AI translation failed: {e}") + return {} + +def parse_updated_sections(ai_response): + """Parse AI response and extract JSON (from get-updated-target-sections.py)""" + # Ensure temp_output directory exists for debug files + script_dir = os.path.dirname(os.path.abspath(__file__)) + temp_dir = os.path.join(script_dir, "temp_output") + os.makedirs(temp_dir, exist_ok=True) + + try: + print(f"\n šŸ”§ Parsing AI response...") + print(f" Raw response length: {len(ai_response)} characters") + + # Try to extract JSON from AI response + cleaned_response = ai_response.strip() + + # Remove markdown code blocks if present + if cleaned_response.startswith('```json'): + cleaned_response = cleaned_response[7:] + print(f" šŸ“ Removed '```json' prefix") + elif cleaned_response.startswith('```'): + cleaned_response = cleaned_response[3:] + print(f" šŸ“ Removed '```' prefix") + + if cleaned_response.endswith('```'): + cleaned_response = cleaned_response[:-3] + print(f" šŸ“ Removed '```' suffix") + + cleaned_response = cleaned_response.strip() + + print(f" šŸ“ Cleaned response length: {len(cleaned_response)} characters") + print(f" šŸ“ First 200 chars: {cleaned_response[:200]}...") + print(f" šŸ“ Last 200 chars: ...{cleaned_response[-200:]}") + + # Try to find JSON content between curly braces + start_idx = cleaned_response.find('{') + end_idx = cleaned_response.rfind('}') + + if start_idx != -1 and end_idx != -1 and end_idx > start_idx: + json_content = cleaned_response[start_idx:end_idx+1] + print(f" šŸ“ Extracted JSON content length: {len(json_content)} characters") + + try: + # Parse JSON + updated_sections = json.loads(json_content) + print(f" āœ… Successfully parsed JSON with {len(updated_sections)} sections") + return updated_sections + except json.JSONDecodeError as e: + print(f" āš ļø JSON seems incomplete, trying to fix...") + + # Try to fix incomplete JSON by finding the last complete entry + lines = json_content.split('\n') + fixed_lines = [] + in_value = False + quote_count = 0 + + for line in lines: + if '"' in line: + quote_count += line.count('"') + + fixed_lines.append(line) + + # If we have an even number of quotes, we might have a complete entry + if quote_count % 2 == 0 and (line.strip().endswith(',') or line.strip().endswith('"')): + # Try to parse up to this point + potential_json = '\n'.join(fixed_lines) + if not potential_json.rstrip().endswith('}'): + # Remove trailing comma and add closing brace + if potential_json.rstrip().endswith(','): + potential_json = potential_json.rstrip()[:-1] + '\n}' + else: + potential_json += '\n}' + + try: + partial_sections = json.loads(potential_json) + print(f" šŸ”§ Fixed JSON with {len(partial_sections)} sections") + return partial_sections + except: + continue + + # If all else fails, return the original error + raise e + else: + print(f" āŒ Could not find valid JSON structure in response") + return None + + except json.JSONDecodeError as e: + print(f" āŒ Error parsing AI response as JSON: {e}") + print(f" šŸ“ Error at position: {e.pos if hasattr(e, 'pos') else 'unknown'}") + + # Save debug info + debug_file = os.path.join(temp_dir, f"ai_response_debug_{os.getpid()}.txt") + with open(debug_file, 'w', encoding='utf-8') as f: + f.write("Original AI Response:\n") + f.write("="*80 + "\n") + f.write(ai_response) + f.write("\n" + "="*80 + "\n") + f.write("Cleaned Response:\n") + f.write("-"*80 + "\n") + f.write(cleaned_response if 'cleaned_response' in locals() else "Not available") + + print(f" šŸ“ Debug info saved to: {debug_file}") + 
return None + except Exception as e: + print(f" āŒ Unexpected error parsing AI response: {e}") + return None + + +def replace_frontmatter_content(lines, new_content): + """Replace content from beginning of file to first top-level header""" + # Find the first top-level header + first_header_idx = None + for i, line in enumerate(lines): + if line.strip().startswith('# '): + first_header_idx = i + break + + if first_header_idx is None: + # No top-level header found, replace entire content + return new_content.split('\n') + + # Replace content from start to before first header + new_lines = new_content.split('\n') + return new_lines + lines[first_header_idx:] + + +def replace_toplevel_section_content(lines, target_line_num, new_content): + """Replace content from top-level header to first next-level header""" + start_idx = target_line_num - 1 # Convert to 0-based index + + # Find the end of top-level section (before first ## header) + end_idx = len(lines) + for i in range(start_idx + 1, len(lines)): + line = lines[i].strip() + if line.startswith('##'): # Found first next-level header + end_idx = i + break + + # Replace the top-level section content (from start_idx to end_idx) + new_lines = new_content.split('\n') + return lines[:start_idx] + new_lines + lines[end_idx:] + + +def update_local_document(file_path, updated_sections, hierarchy_dict, target_local_path): + """Update local document using hierarchy-based section identification (from update-target-doc-v2.py)""" + local_path = os.path.join(target_local_path, file_path) + + if not os.path.exists(local_path): + print(f" āŒ Local file not found: {local_path}") + return False + + try: + # Read document content + with open(local_path, 'r', encoding='utf-8') as f: + document_content = f.read() + + lines = document_content.split('\n') + + replacements_made = [] + + # Use a unified approach: build a complete replacement plan first, then execute it + # This avoids line number shifts during the replacement process + + # Find section boundaries for ALL sections + section_boundaries = find_section_boundaries(lines, hierarchy_dict) + + # Create a comprehensive replacement plan + replacement_plan = [] + + for line_num, new_content in updated_sections.items(): + if line_num == "0": + # Special handling for frontmatter + first_header_idx = None + for i, line in enumerate(lines): + if line.strip().startswith('# '): + first_header_idx = i + break + + replacement_plan.append({ + 'type': 'frontmatter', + 'start': 0, + 'end': first_header_idx if first_header_idx is not None else len(lines), + 'new_content': new_content, + 'line_num': line_num + }) + + elif line_num in hierarchy_dict: + hierarchy = hierarchy_dict[line_num] + if ' > ' not in hierarchy: # Top-level section + # Special handling for top-level sections + start_idx = int(line_num) - 1 + end_idx = len(lines) + for i in range(start_idx + 1, len(lines)): + line = lines[i].strip() + if line.startswith('##'): + end_idx = i + break + + replacement_plan.append({ + 'type': 'toplevel', + 'start': start_idx, + 'end': end_idx, + 'new_content': new_content, + 'line_num': line_num + }) + else: + # Regular section + if line_num in section_boundaries: + boundary = section_boundaries[line_num] + replacement_plan.append({ + 'type': 'regular', + 'start': boundary['start'], + 'end': boundary['end'], + 'new_content': new_content, + 'line_num': line_num, + 'hierarchy': boundary['hierarchy'] + }) + else: + print(f" āš ļø Section at line {line_num} not found in hierarchy") + + # Sort replacement plan: process from bottom to top of
the document to avoid line shifts + # Sort by start line in reverse order (highest line number first) + replacement_plan.sort(key=lambda x: -x['start']) + + # Execute replacements in the planned order (from bottom to top) + print(f" šŸ“‹ Executing {len(replacement_plan)} replacements from bottom to top:") + for i, replacement in enumerate(replacement_plan): + print(f" {i+1}. {replacement['type']} (line {replacement.get('line_num', '0')}, start: {replacement['start']})") + + for replacement in replacement_plan: + start = replacement['start'] + end = replacement['end'] + new_content = replacement['new_content'] + new_lines = new_content.split('\n') + + # Replace the content + lines = lines[:start] + new_lines + lines[end:] + + # Record the replacement + original_line_count = end - start + line_diff = len(new_lines) - original_line_count + + replacements_made.append({ + 'type': replacement['type'], + 'line_num': replacement.get('line_num', 'N/A'), + 'hierarchy': replacement.get('hierarchy', 'N/A'), + 'start': start, + 'end': end, + 'original_lines': original_line_count, + 'new_lines': len(new_lines), + 'line_diff': line_diff + }) + + print(f" āœ… Updated {replacement['type']} section: {replacement.get('line_num', 'frontmatter')}") + + # Save updated document + with open(local_path, 'w', encoding='utf-8') as f: + f.write('\n'.join(lines)) + + print(f" āœ… Updated {len(replacements_made)} sections") + for replacement in replacements_made: + print(f" šŸ“ Line {replacement['line_num']}: {replacement['hierarchy']}") + + return True + + except Exception as e: + thread_safe_print(f" āŒ Error updating file: {e}") + return False + +def find_section_boundaries(lines, hierarchy_dict): + """Find the start and end line for each section based on hierarchy (from update-target-doc-v2.py)""" + section_boundaries = {} + + # Sort sections by line number + sorted_sections = sorted(hierarchy_dict.items(), key=lambda x: int(x[0])) + + for i, (line_num, hierarchy) in enumerate(sorted_sections): + start_line = int(line_num) - 1 # Convert to 0-based index + + # Find end line (start of next section at same or higher level) + end_line = len(lines) # Default to end of document + + if start_line >= len(lines): + continue + + # Get current section level + current_line = lines[start_line].strip() + if not current_line.startswith('#'): + continue + + current_level = len(current_line.split()[0]) # Count # characters + + # Look for next section at same or higher level + for j in range(start_line + 1, len(lines)): + line = lines[j].strip() + if line.startswith('#'): + line_level = len(line.split()[0]) if line.split() else 0 + if line_level <= current_level: + end_line = j + break + + section_boundaries[line_num] = { + 'start': start_line, + 'end': end_line, + 'hierarchy': hierarchy, + 'level': current_level + } + + return section_boundaries + +def insert_sections_into_document(file_path, translated_sections, target_insertion_points, target_local_path): + """Insert translated sections into the target document at specified points""" + + if not translated_sections or not target_insertion_points: + thread_safe_print(f" āš ļø No sections or insertion points provided") + return False + + local_path = os.path.join(target_local_path, file_path) + + if not os.path.exists(local_path): + thread_safe_print(f" āŒ Local file not found: {local_path}") + return False + + try: + # Read document content + with open(local_path, 'r', encoding='utf-8') as f: + document_content = f.read() + + lines = document_content.split('\n') + 
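# Worked example (illustrative): with insertion points after lines 40 and 251, applying 251 first + # leaves line 40 unshifted; ascending order would invalidate every later insertion point. +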
thread_safe_print(f" šŸ“„ Document has {len(lines)} lines") + + # Sort insertion points by line number in descending order to avoid position shifts + sorted_insertions = sorted( + target_insertion_points.items(), + key=lambda x: x[1]['insertion_after_line'], + reverse=True + ) + + insertions_made = [] + + for group_id, point_data in sorted_insertions: + insertion_after_line = point_data['insertion_after_line'] + new_sections = point_data['new_sections'] + insertion_type = point_data['insertion_type'] + + thread_safe_print(f" šŸ“Œ Inserting {len(new_sections)} sections after line {insertion_after_line}") + + # Convert 1-based line number to 0-based index for insertion point + # insertion_after_line is 1-based, so insertion_index should be insertion_after_line - 1 + insertion_index = insertion_after_line - 1 + + # Prepare new content to insert + new_content_lines = [] + + # Add an empty line before the new sections if not already present + if insertion_index < len(lines) and lines[insertion_index].strip(): + new_content_lines.append("") + + # Add each translated section + for section_line_num in new_sections: + # Find the corresponding translated content + section_hierarchy = None + section_content = None + + # Search for the section in translated_sections by line number or hierarchy + for hierarchy, content in translated_sections.items(): + # Try to match by hierarchy or find the content + if str(section_line_num) in hierarchy or content: # This is a simplified matching + section_hierarchy = hierarchy + section_content = content + break + + if section_content: + # Split content into lines and add to insertion + content_lines = section_content.split('\n') + new_content_lines.extend(content_lines) + + # Add spacing between sections + if section_line_num != new_sections[-1]: # Not the last section + new_content_lines.append("") + + thread_safe_print(f" āœ… Added section: {section_hierarchy}") + else: + thread_safe_print(f" āš ļø Could not find translated content for section at line {section_line_num}") + + # Add an empty line after the new sections if not already present + # Check if the new content already ends with an empty line + if new_content_lines and not new_content_lines[-1].strip(): + # Content already ends with empty line, don't add another + pass + elif insertion_index + 1 < len(lines) and lines[insertion_index + 1].strip(): + # Next line has content and our content doesn't end with empty line, add one + new_content_lines.append("") + + # Insert the new content (insert after insertion_index line, before the next line) + # If insertion_after_line is 251, we want to insert at position 252 (0-based index 251) + lines = lines[:insertion_index + 1] + new_content_lines + lines[insertion_index + 1:] + + insertions_made.append({ + 'group_id': group_id, + 'insertion_after_line': insertion_after_line, + 'sections_count': len(new_sections), + 'lines_added': len(new_content_lines), + 'insertion_type': insertion_type + }) + + # Save updated document + with open(local_path, 'w', encoding='utf-8') as f: + f.write('\n'.join(lines)) + + thread_safe_print(f" āœ… Successfully inserted {len(insertions_made)} section groups") + for insertion in insertions_made: + thread_safe_print(f" šŸ“ {insertion['group_id']}: {insertion['sections_count']} sections, {insertion['lines_added']} lines after line {insertion['insertion_after_line']}") + + return True + + except Exception as e: + thread_safe_print(f" āŒ Error inserting sections: {e}") + return False + +def process_modified_sections(modified_sections, 
pr_diff, pr_url, github_client, ai_client, repo_config, max_non_system_sections=120): + """Process modified sections with full data structure support""" + results = [] + + for file_path, file_data in modified_sections.items(): + thread_safe_print(f"\nšŸ“„ Processing {file_path}") + + try: + # Call process_single_file with the complete data structure + success, message = process_single_file( + file_path, + file_data, # Pass the complete data structure (includes 'sections', 'original_hierarchy', etc.) + pr_diff, + pr_url, + github_client, + ai_client, + repo_config, + max_non_system_sections + ) + + if success: + thread_safe_print(f" āœ… Successfully processed {file_path}") + results.append((file_path, True, message)) + else: + thread_safe_print(f" āŒ Failed to process {file_path}: {message}") + results.append((file_path, False, message)) + + except Exception as e: + thread_safe_print(f" āŒ Error processing {file_path}: {e}") + results.append((file_path, False, f"Error processing {file_path}: {e}")) + + return results + +def process_deleted_sections(deleted_sections, pr_url, github_client, ai_client, repo_config, max_non_system_sections=120): + """Process deleted sections with full data structure support""" + results = [] + + for file_path, source_sections in deleted_sections.items(): + thread_safe_print(f"\nšŸ—‘ļø Processing deleted sections in {file_path}") + + try: + # Call process_single_file_deletion with the complete data structure + success, message = process_single_file_deletion( + file_path, + source_sections, + pr_url, + github_client, + ai_client, + repo_config, + max_non_system_sections + ) + + if success: + thread_safe_print(f" āœ… Successfully processed deletions in {file_path}") + results.append((file_path, True, message)) + else: + thread_safe_print(f" āŒ Failed to process deletions in {file_path}: {message}") + results.append((file_path, False, message)) + + except Exception as e: + thread_safe_print(f" āŒ Error processing deletions in {file_path}: {e}") + results.append((file_path, False, f"Error processing deletions in {file_path}: {e}")) + + return results + +def process_single_file_deletion(file_path, source_sections, pr_url, github_client, ai_client, repo_config, max_non_system_sections=120): + """Process deletion of sections in a single file""" + + # Import needed functions + from pr_analyzer import get_target_hierarchy_and_content + from section_matcher import ( + find_direct_matches_for_special_files, + filter_non_system_sections, + get_corresponding_sections, + is_system_variable_or_config, + clean_title_for_matching, + parse_ai_response, + find_matching_line_numbers + ) + + # Get target file hierarchy and content + target_hierarchy, target_lines = get_target_hierarchy_and_content( + file_path, github_client, repo_config['target_repo'] + ) + + if not target_hierarchy: + return False, f"Could not get target hierarchy for {file_path}" + + # Separate system variables from regular sections for hybrid mapping + system_sections = {} + regular_sections = {} + + for line_num, hierarchy in source_sections.items(): + # Extract title for checking + if ' > ' in hierarchy: + title = hierarchy.split(' > ')[-1] + else: + title = hierarchy + + cleaned_title = clean_title_for_matching(title) + if is_system_variable_or_config(cleaned_title): + system_sections[line_num] = hierarchy + else: + regular_sections[line_num] = hierarchy + + sections_to_delete = [] + + # Process system variables with direct matching + if system_sections: + thread_safe_print(f" šŸŽÆ Direct matching for 
{len(system_sections)} system sections...") + matched_dict, failed_matches, skipped_sections = find_direct_matches_for_special_files( + system_sections, target_hierarchy, target_lines + ) + + for target_line_num, hierarchy_string in matched_dict.items(): + sections_to_delete.append(int(target_line_num)) + thread_safe_print(f" āœ… Marked system section for deletion: line {target_line_num}") + + if failed_matches: + thread_safe_print(f" āŒ Failed to match {len(failed_matches)} system sections") + for failed_line in failed_matches: + thread_safe_print(f" - Line {failed_line}: {system_sections[failed_line]}") + + # Process regular sections with AI matching + if regular_sections: + thread_safe_print(f" šŸ¤– AI matching for {len(regular_sections)} regular sections...") + + # Filter target hierarchy for AI + filtered_target_hierarchy = filter_non_system_sections(target_hierarchy) + + # Check if filtered hierarchy is reasonable for AI + if len(filtered_target_hierarchy) > max_non_system_sections: + thread_safe_print(f" āŒ Target hierarchy too large for AI: {len(filtered_target_hierarchy)} > {max_non_system_sections}") + else: + # Get AI mapping (convert dict values to lists as expected by the function) + source_list = list(regular_sections.values()) + target_list = list(filtered_target_hierarchy.values()) + + ai_mapping = get_corresponding_sections( + source_list, + target_list, + ai_client, + repo_config['source_language'], + repo_config['target_language'], + max_tokens=20000 # Use default value for now, can be made configurable later + ) + + if ai_mapping: + # Parse AI response and find matching line numbers + ai_sections = parse_ai_response(ai_mapping) + ai_matched = find_matching_line_numbers(ai_sections, target_hierarchy) + + for source_line, target_line in ai_matched.items(): + try: + sections_to_delete.append(int(target_line)) + thread_safe_print(f" āœ… Marked regular section for deletion: line {target_line}") + except ValueError as e: + thread_safe_print(f" āŒ Error converting target_line to int: {target_line}, error: {e}") + # If target_line is not a number, try to find it in target_hierarchy + for line_num, hierarchy in target_hierarchy.items(): + if target_line in hierarchy or hierarchy in target_line: + sections_to_delete.append(int(line_num)) + thread_safe_print(f" āœ… Found matching section at line {line_num}: {hierarchy}") + break + + # Delete the sections from local document + if sections_to_delete: + success = delete_sections_from_document(file_path, sections_to_delete, repo_config['target_local_path']) + if success: + return True, f"Successfully deleted {len(sections_to_delete)} sections from {file_path}" + else: + return False, f"Failed to delete sections from {file_path}" + else: + return False, f"No sections to delete in {file_path}" + +def delete_sections_from_document(file_path, sections_to_delete, target_local_path): + """Delete specified sections from the local document""" + target_file_path = os.path.join(target_local_path, file_path) + + if not os.path.exists(target_file_path): + thread_safe_print(f" āŒ Target file not found: {target_file_path}") + return False + + try: + # Read current file content + with open(target_file_path, 'r', encoding='utf-8') as f: + content = f.read() + + lines = content.split('\n') + + # Import needed function + from pr_analyzer import build_hierarchy_dict + + # Build hierarchy to understand section boundaries + target_hierarchy = build_hierarchy_dict(content) + + # Sort sections to delete in reverse order to maintain line numbers + 
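# Deleting bottom-up keeps the remaining line numbers valid; each deletion below spans from the + # section header to the line before the next header at the same or a higher level (or to end of file). +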
sections_to_delete.sort(reverse=True) + + thread_safe_print(f" šŸ—‘ļø Deleting {len(sections_to_delete)} sections from {file_path}") + + for section_line in sections_to_delete: + section_start = section_line - 1 # Convert to 0-based index + + if section_start < 0 or section_start >= len(lines): + thread_safe_print(f" āŒ Invalid section line: {section_line}") + continue + + # Find section end + section_end = len(lines) - 1 # Default to end of file + + # Look for next header at same or higher level + current_line = lines[section_start].strip() + if current_line.startswith('#'): + current_level = len(current_line) - len(current_line.lstrip('#')) # Count leading '#' characters + + for i in range(section_start + 1, len(lines)): + line = lines[i].strip() + if line.startswith('#'): + line_level = len(line) - len(line.lstrip('#')) + if line_level <= current_level: + section_end = i - 1 + break + + # Delete section (from section_start to section_end inclusive) + thread_safe_print(f" šŸ—‘ļø Deleting lines {section_start + 1} to {section_end + 1}") + del lines[section_start:section_end + 1] + + # Write updated content back to file + updated_content = '\n'.join(lines) + with open(target_file_path, 'w', encoding='utf-8') as f: + f.write(updated_content) + + thread_safe_print(f" āœ… Updated file: {target_file_path}") + return True + + except Exception as e: + thread_safe_print(f" āŒ Error deleting sections from {target_file_path}: {e}") + return False + +def process_single_file(file_path, source_sections, pr_diff, pr_url, github_client, ai_client, repo_config, max_non_system_sections=120): + """Process a single file - thread-safe function for parallel processing""" + thread_id = threading.current_thread().name + thread_safe_print(f"\nšŸ“„ [{thread_id}] Processing {file_path}") + + try: + # Check if this is a TOC file with special operations + if isinstance(source_sections, dict) and 'type' in source_sections and source_sections['type'] == 'toc': + from toc_processor import process_toc_file + return process_toc_file(file_path, source_sections, pr_url, github_client, ai_client, repo_config) + + # Check if this is enhanced sections + if isinstance(source_sections, dict) and 'sections' in source_sections: + if source_sections.get('type') == 'enhanced_sections': + # Skip all the matching logic and directly extract data + thread_safe_print(f" [{thread_id}] šŸš€ Using enhanced sections data, skipping matching logic") + enhanced_sections = source_sections['sections'] + + # Extract target sections and source old content from enhanced sections + # Maintain the exact order from match_source_diff_to_target.json + from collections import OrderedDict + target_sections = OrderedDict() + source_old_content_dict = OrderedDict() + + # Process in the exact order they appear in enhanced_sections (which comes from match_source_diff_to_target.json) + for key, section_info in enhanced_sections.items(): + if isinstance(section_info, dict): + operation = section_info.get('source_operation', '') + + # Skip deleted sections - they shouldn't be in the enhanced_sections anyway + if operation == 'deleted': + continue + + # For source sections: use old_content for modified, new_content for added + if operation == 'added': + source_content = section_info.get('source_new_content', '') + else: # modified + source_content = section_info.get('source_old_content', '') + + # For target sections: use target_content for modified, empty string for added + if operation == 'added': + target_content = "" # Added sections have no existing target content + else: # modified +
target_content = section_info.get('target_content', '') + + # Add to both dictionaries using the same key from match_source_diff_to_target.json + if source_content is not None: + source_old_content_dict[key] = source_content + target_sections[key] = target_content + + thread_safe_print(f" [{thread_id}] šŸ“Š Extracted: {len(target_sections)} target sections, {len(source_old_content_dict)} source old content entries") + + # Update sections with AI (get-updated-target-sections.py logic) + thread_safe_print(f" [{thread_id}] šŸ¤– Getting updated sections from AI...") + updated_sections = get_updated_sections_from_ai(pr_diff, target_sections, source_old_content_dict, ai_client, repo_config['source_language'], repo_config['target_language'], file_path) + if not updated_sections: + thread_safe_print(f" [{thread_id}] āš ļø Could not get AI update") + return False, f"Could not get AI update for {file_path}" + + # Return the AI results for further processing + thread_safe_print(f" [{thread_id}] āœ… Successfully got AI translation results for {file_path}") + return True, updated_sections # Return the actual AI results + + else: + # New format: complete data structure + actual_sections = source_sections['sections'] + + # Regular (non-enhanced) file processing continues here + # Get target hierarchy and content (get-target-affected-hierarchy.py logic) + from pr_analyzer import get_target_hierarchy_and_content + target_hierarchy, target_lines = get_target_hierarchy_and_content(file_path, github_client, repo_config['target_repo']) + if not target_hierarchy: + thread_safe_print(f" [{thread_id}] āš ļø Could not get target content") + return False, f"Could not get target content for {file_path}" + else: + # Old format: direct dict + actual_sections = source_sections + + # The old format also needs the target hierarchy and content for the matching below + from pr_analyzer import get_target_hierarchy_and_content + target_hierarchy, target_lines = get_target_hierarchy_and_content(file_path, github_client, repo_config['target_repo']) + if not target_hierarchy: + thread_safe_print(f" [{thread_id}] āš ļø Could not get target content") + return False, f"Could not get target content for {file_path}" + + # Only do mapping if we don't have enhanced sections + if 'enhanced_sections' not in locals() or not enhanced_sections: + # Separate different types of sections + from section_matcher import is_system_variable_or_config + system_var_sections = {} + toplevel_sections = {} + frontmatter_sections = {} + regular_sections = {} + + for line_num, hierarchy in actual_sections.items(): + if line_num == "0" and hierarchy == "frontmatter": + # Special handling for frontmatter + frontmatter_sections[line_num] = hierarchy + else: + # Extract the leaf title from hierarchy + leaf_title = hierarchy.split(' > ')[-1] if ' > ' in hierarchy else hierarchy + + if is_system_variable_or_config(leaf_title): + system_var_sections[line_num] = hierarchy + elif leaf_title.startswith('# '): + # Top-level titles need special handling + toplevel_sections[line_num] = hierarchy + else: + regular_sections[line_num] = hierarchy + + thread_safe_print(f" [{thread_id}] šŸ“Š Found {len(system_var_sections)} system variable/config, {len(toplevel_sections)} top-level, {len(frontmatter_sections)} frontmatter, and {len(regular_sections)} regular sections") + + target_affected = {} + + # Process frontmatter sections with special handling + if frontmatter_sections: + thread_safe_print(f" [{thread_id}] šŸ“„ Processing frontmatter section...") + # For frontmatter, we simply map it to line 0 in target + for line_num, hierarchy in frontmatter_sections.items(): + target_affected[line_num] = hierarchy + thread_safe_print(f" [{thread_id}] āœ… Mapped {len(frontmatter_sections)} frontmatter section") + + # Process top-level titles with special matching + if toplevel_sections: + thread_safe_print(f" [{thread_id}] šŸ” Top-level title matching for {len(toplevel_sections)} sections...") + from
section_matcher import find_toplevel_title_matches + toplevel_matched, toplevel_failed, toplevel_skipped = find_toplevel_title_matches(toplevel_sections, target_lines) + + if toplevel_matched: + target_affected.update(toplevel_matched) + thread_safe_print(f" [{thread_id}] āœ… Top-level matched {len(toplevel_matched)} sections") + + if toplevel_failed: + thread_safe_print(f" [{thread_id}] āš ļø {len(toplevel_failed)} top-level sections failed matching") + for failed in toplevel_failed: + thread_safe_print(f" āŒ {failed['hierarchy']}: {failed['reason']}") + + # Process system variables/config sections with direct matching + if system_var_sections: + thread_safe_print(f" [{thread_id}] šŸŽÆ Direct matching {len(system_var_sections)} system variable/config sections...") + from section_matcher import find_direct_matches_for_special_files + direct_matched, failed_matches, skipped_sections = find_direct_matches_for_special_files(system_var_sections, target_hierarchy, target_lines) + + if direct_matched: + target_affected.update(direct_matched) + thread_safe_print(f" [{thread_id}] āœ… Direct matched {len(direct_matched)} system variable/config sections") + + if failed_matches: + thread_safe_print(f" [{thread_id}] āš ļø {len(failed_matches)} system variable/config sections failed direct matching") + for failed in failed_matches: + thread_safe_print(f" āŒ {failed['hierarchy']}: {failed['reason']}") + + # Process regular sections with AI mapping using filtered target hierarchy + if regular_sections: + thread_safe_print(f" [{thread_id}] šŸ¤– AI mapping {len(regular_sections)} regular sections...") + + # Filter target hierarchy to only include non-system sections for AI mapping + from section_matcher import filter_non_system_sections + filtered_target_hierarchy = filter_non_system_sections(target_hierarchy) + + # Check if filtered target hierarchy exceeds the maximum allowed for AI mapping + MAX_NON_SYSTEM_SECTIONS_FOR_AI = 120 + if len(filtered_target_hierarchy) > MAX_NON_SYSTEM_SECTIONS_FOR_AI: + thread_safe_print(f" [{thread_id}] āŒ Too many non-system sections ({len(filtered_target_hierarchy)} > {MAX_NON_SYSTEM_SECTIONS_FOR_AI})") + thread_safe_print(f" [{thread_id}] āš ļø Skipping AI mapping for regular sections to avoid complexity") + + # If no system sections were matched either, return error + if not target_affected: + error_message = f"File {file_path} has too many non-system sections ({len(filtered_target_hierarchy)} > {MAX_NON_SYSTEM_SECTIONS_FOR_AI}) and no system variable sections were matched" + return False, error_message + + # Continue with only system variable matches if available + thread_safe_print(f" [{thread_id}] āœ… Proceeding with {len(target_affected)} system variable/config sections only") + else: + # Proceed with AI mapping using filtered hierarchy + source_list = list(regular_sections.values()) + target_list = list(filtered_target_hierarchy.values()) + + from section_matcher import get_corresponding_sections + ai_response = get_corresponding_sections(source_list, target_list, ai_client, repo_config['source_language'], repo_config['target_language'], max_tokens=20000) + if ai_response: + # Parse AI response and find matching line numbers in the original (unfiltered) hierarchy + from section_matcher import parse_ai_response, find_matching_line_numbers + ai_sections = parse_ai_response(ai_response) + ai_matched = find_matching_line_numbers(ai_sections, target_hierarchy) # Use original hierarchy for line number lookup + + if ai_matched: + target_affected.update(ai_matched) + 
thread_safe_print(f" [{thread_id}] āœ… AI mapped {len(ai_matched)} regular sections") + else: + thread_safe_print(f" [{thread_id}] āš ļø AI mapping failed for regular sections") + else: + thread_safe_print(f" [{thread_id}] āš ļø Could not get AI response for regular sections") + + # Summary of mapping results + thread_safe_print(f" [{thread_id}] šŸ“Š Total mapped: {len(target_affected)} out of {len(actual_sections)} sections") + + if not target_affected: + thread_safe_print(f" [{thread_id}] āš ļø Could not map sections") + return False, f"Could not map sections for {file_path}" + + thread_safe_print(f" [{thread_id}] āœ… Mapped {len(target_affected)} sections") + + # Extract target sections (get-target-affected-sections.py logic) + thread_safe_print(f" [{thread_id}] šŸ“ Extracting target sections...") + from pr_analyzer import extract_affected_sections + target_sections = extract_affected_sections(target_affected, target_lines) + + # Extract source old content from the enhanced data structure + thread_safe_print(f" [{thread_id}] šŸ“– Extracting source old content...") + source_old_content_dict = {} + + # Handle different data structures for source_sections + if isinstance(source_sections, dict) and 'sections' in source_sections: + # New format: complete data structure with enhanced matching info + for key, section_info in source_sections.items(): + if isinstance(section_info, dict) and 'source_old_content' in section_info: + source_old_content_dict[key] = section_info['source_old_content'] + else: + # Fallback: if we don't have the enhanced structure, we need to get it differently + thread_safe_print(f" [{thread_id}] āš ļø Source sections missing enhanced structure, using fallback") + # For now, create empty dict to avoid errors - this should be addressed in the calling code + source_old_content_dict = {} + + # Update sections with AI (get-updated-target-sections.py logic) + thread_safe_print(f" [{thread_id}] šŸ¤– Getting updated sections from AI...") + updated_sections = get_updated_sections_from_ai(pr_diff, target_sections, source_old_content_dict, ai_client, repo_config['source_language'], repo_config['target_language'], file_path) + if not updated_sections: + thread_safe_print(f" [{thread_id}] āš ļø Could not get AI update") + return False, f"Could not get AI update for {file_path}" + + # Update local document (update-target-doc-v2.py logic) + thread_safe_print(f" [{thread_id}] šŸ’¾ Updating local document...") + success = update_local_document(file_path, updated_sections, target_affected, repo_config['target_local_path']) + + if success: + thread_safe_print(f" [{thread_id}] šŸŽ‰ Successfully updated {file_path}") + return True, f"Successfully updated {file_path}" + else: + thread_safe_print(f" [{thread_id}] āŒ Failed to update {file_path}") + return False, f"Failed to update {file_path}" + + except Exception as e: + thread_safe_print(f" [{thread_id}] āŒ Error processing {file_path}: {e}") + return False, f"Error processing {file_path}: {e}" + +def process_added_sections(added_sections, pr_diff, pr_url, github_client, ai_client, repo_config, max_non_system_sections=120): + """Process added sections by translating and inserting them""" + if not added_sections: + thread_safe_print("\nāž• No added sections to process") + return + + thread_safe_print(f"\nāž• Processing added sections from {len(added_sections)} files...") + + # Import needed functions + from section_matcher import map_insertion_points_to_target + from pr_analyzer import get_target_hierarchy_and_content + + for 
file_path, section_data in added_sections.items(): + thread_safe_print(f"\nāž• Processing added sections in {file_path}") + + source_sections = section_data['sections'] + insertion_points = section_data['insertion_points'] + + # Get target file hierarchy and content + target_hierarchy, target_lines = get_target_hierarchy_and_content( + file_path, github_client, repo_config['target_repo'] + ) + + if not target_hierarchy: + thread_safe_print(f" āŒ Could not get target hierarchy for {file_path}") + continue + + # Map insertion points to target language + target_insertion_points = map_insertion_points_to_target( + insertion_points, target_hierarchy, target_lines, file_path, pr_url, github_client, ai_client, repo_config, max_non_system_sections + ) + + if not target_insertion_points: + thread_safe_print(f" āŒ No insertion points mapped for {file_path}") + continue + + # Use AI to translate/update new sections (similar to modified sections) + # Since we're now using source_old_content, we need to extract it from the added sections + source_old_content_dict = {} + for key, content in source_sections.items(): + # For added sections, source_old_content is typically None or empty + # We use the new content (from the source file) as the content to translate + source_old_content_dict[key] = content if content is not None else "" + + # Get target sections (empty for new sections, but we need the structure) + target_sections = {} # New sections don't have existing target content + + # Use the same AI function to translate the new sections + translated_sections = get_updated_sections_from_ai( + pr_diff, + target_sections, + source_old_content_dict, + ai_client, + repo_config['source_language'], + repo_config['target_language'], + file_path + ) + + if translated_sections: + # Insert translated sections into document + insert_sections_into_document(file_path, translated_sections, target_insertion_points, repo_config['target_local_path']) + thread_safe_print(f" āœ… Successfully inserted {len(translated_sections)} sections in {file_path}") + else: + thread_safe_print(f" āš ļø No sections were translated for {file_path}") + +def process_files_in_batches(source_changes, pr_diff, pr_url, github_client, ai_client, repo_config, operation_type="modified", batch_size=5, max_non_system_sections=120): + """Process files in parallel batches""" + # Handle different data formats + if isinstance(source_changes, dict): + files = [] + for path, data in source_changes.items(): + if isinstance(data, dict): + if 'type' in data and data['type'] == 'toc': + # TOC file with special operations + files.append((path, data)) + elif 'sections' in data: + # New format: extract sections for processing + files.append((path, data['sections'])) + else: + # Old format: direct dict + files.append((path, data)) + else: + # Old format: direct dict + files.append((path, data)) + else: + files = list(source_changes.items()) + + total_files = len(files) + + if total_files == 0: + return [] + + thread_safe_print(f"\nšŸ”„ Processing {total_files} files in batches of {batch_size}") + + results = [] + + # Process files in batches + for i in range(0, total_files, batch_size): + batch = files[i:i + batch_size] + batch_num = (i // batch_size) + 1 + total_batches = (total_files + batch_size - 1) // batch_size + + thread_safe_print(f"\nšŸ“¦ Batch {batch_num}/{total_batches}: Processing {len(batch)} files") + + # Process current batch in parallel + with ThreadPoolExecutor(max_workers=len(batch), thread_name_prefix=f"Batch{batch_num}") as executor: + # 
Submit all files in current batch + future_to_file = {} + for file_path, source_sections in batch: + future = executor.submit( + process_single_file, + file_path, + source_sections, + pr_diff, + pr_url, + github_client, + ai_client, + repo_config, + max_non_system_sections + ) + future_to_file[future] = file_path + + # Collect results as they complete + from concurrent.futures import as_completed + batch_results = [] + for future in as_completed(future_to_file): + file_path = future_to_file[future] + try: + success, message = future.result() + batch_results.append((file_path, success, message)) + except Exception as e: + batch_results.append((file_path, False, f"Exception in thread: {e}")) + + results.extend(batch_results) + + # Brief pause between batches to avoid overwhelming the APIs + if i + batch_size < total_files: + thread_safe_print(f" āøļø Waiting 2 seconds before next batch...") + import time + time.sleep(2) + + return results + +def update_target_document_from_match_data(match_file_path, target_local_path, target_file_name=None): + """ + Update target document using data from match_source_diff_to_target.json + This integrates the logic from test_target_update.py + + Args: + match_file_path: Path to the match_source_diff_to_target.json file + target_local_path: Local path to the target repository + target_file_name: Optional target file name (if not provided, will be extracted from match_file_path) + """ + import json + import os + from pathlib import Path + + # Load match data + if not os.path.exists(match_file_path): + thread_safe_print(f"āŒ {match_file_path} file does not exist") + return False + + with open(match_file_path, 'r', encoding='utf-8') as f: + match_data = json.load(f) + + thread_safe_print(f"āœ… Loaded {len(match_data)} section matching data from {match_file_path}") + thread_safe_print(f" Reading translation results directly from target_new_content field") + + if not match_data: + thread_safe_print("āŒ No matching data found") + return False + + # Sort sections by target_line from large to small (modify from back to front) + sections_with_line = [] + + for key, section_data in match_data.items(): + operation = section_data.get('source_operation', '') + target_new_content = section_data.get('target_new_content') + + # For deleted sections, target_new_content should be null + if operation == 'deleted': + if target_new_content is not None: + thread_safe_print(f" āš ļø Deleted section {key} has non-null target_new_content, should be fixed") + thread_safe_print(f" šŸ—‘ļø Including deleted section: {key}") + elif not target_new_content: + thread_safe_print(f" āš ļø Skipping section without target_new_content: {key}") + continue + + target_line = section_data.get('target_line') + if target_line and target_line != 'unknown': + try: + # Handle special case for bottom sections + if target_line == "-1": + line_num = -1 # Special marker for bottom sections + else: + line_num = int(target_line) + sections_with_line.append((key, section_data, line_num)) + except ValueError: + thread_safe_print(f"āš ļø Skipping invalid target_line: {target_line} for {key}") + + # Separate sections into different processing groups + bottom_modified_sections = [] # Process first: modify existing content at document end + regular_sections = [] # Process second: normal operations from back to front + bottom_added_sections = [] # Process last: append new content to document end + + for key, section_data, line_num in sections_with_line: + target_hierarchy = section_data.get('target_hierarchy', 
'') + + if target_hierarchy.startswith('bottom-modified-'): + bottom_modified_sections.append((key, section_data, line_num)) + elif target_hierarchy.startswith('bottom-added-'): + bottom_added_sections.append((key, section_data, line_num)) + else: + regular_sections.append((key, section_data, line_num)) + + # Sort each group appropriately + def get_source_line_num(item): + key, section_data, line_num = item + if '_' in key and key.split('_')[1].isdigit(): + return int(key.split('_')[1]) + return 0 + + # Bottom modified: sort by source line number (large to small) + bottom_modified_sections.sort(key=lambda x: -get_source_line_num(x)) + + # Regular sections: sort by target_line (large to small), then by source line number + regular_sections.sort(key=lambda x: (-x[2], -get_source_line_num(x))) + + # Bottom added: sort by source line number (small to large) for proper document order + bottom_added_sections.sort(key=lambda x: get_source_line_num(x)) + + # Combine all sections in processing order + all_sections = bottom_modified_sections + regular_sections + bottom_added_sections + + thread_safe_print(f"\nšŸ“Š Processing order: bottom-modified -> regular -> bottom-added") + thread_safe_print(f" šŸ“‹ Bottom modified sections: {len(bottom_modified_sections)}") + thread_safe_print(f" šŸ“‹ Regular sections: {len(regular_sections)}") + thread_safe_print(f" šŸ“‹ Bottom added sections: {len(bottom_added_sections)}") + + if not all_sections: + thread_safe_print("āŒ No valid sections found for update") + return False + + thread_safe_print(f"\nšŸ“Š Detailed processing order:") + for i, (key, section_data, line_num) in enumerate(all_sections, 1): + operation = section_data.get('source_operation', '') + hierarchy = section_data.get('target_hierarchy', '') + insertion_type = section_data.get('insertion_type', '') + + # Extract source line number for display + source_line_num = int(key.split('_')[1]) if '_' in key and key.split('_')[1].isdigit() else 'N/A' + + # Display target_line with special handling for bottom sections + target_display = "END" if line_num == -1 else str(line_num) + + # Determine section group + if hierarchy.startswith('bottom-modified-'): + group = "BotMod" + elif hierarchy.startswith('bottom-added-'): + group = "BotAdd" + else: + group = "Regular" + + if operation == 'deleted': + action = "delete" + elif insertion_type == "before_reference": + action = "insert" + elif line_num == -1: + action = "append" + else: + action = "replace" + + thread_safe_print(f" {i:2}. 
[{group:7}] Target:{target_display:>3} Src:{source_line_num:3} | {key:15} ({operation:8}) | {action:7} | {hierarchy}") + + # Determine target file name + if target_file_name is None: + # Extract target file name from match file path + # e.g., "tikv-configuration-file-match_source_diff_to_target.json" -> "tikv-configuration-file.md" + match_filename = os.path.basename(match_file_path) + if match_filename.endswith('-match_source_diff_to_target.json'): + extracted_name = match_filename[:-len('-match_source_diff_to_target.json')] + '.md' + target_file_name = extracted_name + thread_safe_print(f" šŸ“‚ Extracted target file name from match file: {target_file_name}") + else: + # Fallback: try to determine from source hierarchy + first_entry = next(iter(match_data.values())) + source_hierarchy = first_entry.get('source_original_hierarchy', '') + + if 'TiFlash' in source_hierarchy or 'tiflash' in source_hierarchy.lower(): + target_file_name = "tiflash/tiflash-configuration.md" + else: + # Default to command-line flags for other cases + target_file_name = "command-line-flags-for-tidb-configuration.md" + thread_safe_print(f" šŸ“‚ Determined target file name from hierarchy: {target_file_name}") + else: + thread_safe_print(f" šŸ“‚ Using provided target file name: {target_file_name}") + + target_file_path = os.path.join(target_local_path, target_file_name) + thread_safe_print(f"\nšŸ“„ Target file path: {target_file_path}") + + # Update target document + thread_safe_print(f"\nšŸš€ Starting target document update, will modify {len(all_sections)} sections...") + success = update_target_document_sections(all_sections, target_file_path) + + return success + +def update_target_document_sections(all_sections, target_file_path): + """ + Update target document sections - integrated from test_target_update.py + """ + thread_safe_print(f"\nšŸš€ Starting target document update: {target_file_path}") + + # Read target document + if not os.path.exists(target_file_path): + thread_safe_print(f"āŒ Target file does not exist: {target_file_path}") + return False + + with open(target_file_path, 'r', encoding='utf-8') as f: + target_lines = f.readlines() + + thread_safe_print(f"šŸ“„ Target document total lines: {len(target_lines)}") + + # Process modifications in order (bottom-modified -> regular -> bottom-added) + for i, (key, section_data, target_line_num) in enumerate(all_sections, 1): + operation = section_data.get('source_operation', '') + insertion_type = section_data.get('insertion_type', '') + target_hierarchy = section_data.get('target_hierarchy', '') + target_new_content = section_data.get('target_new_content') + + thread_safe_print(f"\nšŸ“ {i}/{len(all_sections)} Processing {key} (Line {target_line_num})") + thread_safe_print(f" Operation type: {operation}") + thread_safe_print(f" Target section: {target_hierarchy}") + + if operation == 'deleted': + # Delete logic: remove the specified section + if target_line_num == -1: + thread_safe_print(f" āŒ Invalid delete operation for bottom section") + continue + + thread_safe_print(f" šŸ—‘ļø Delete mode: removing section starting at line {target_line_num}") + + # Find section end position + start_line = target_line_num - 1 # Convert to 0-based index + + if start_line >= len(target_lines): + thread_safe_print(f" āŒ Line number out of range: {target_line_num} > {len(target_lines)}") + continue + + # Find section end position + end_line = find_section_end_for_update(target_lines, start_line, target_hierarchy) + + thread_safe_print(f" šŸ“ Delete range: line {start_line 
+ 1} to {end_line}") + thread_safe_print(f" šŸ“„ Delete content: {target_lines[start_line].strip()[:50]}...") + + # Delete content + deleted_lines = target_lines[start_line:end_line] + target_lines[start_line:end_line] = [] + + thread_safe_print(f" āœ… Deleted {len(deleted_lines)} lines of content") + + elif target_new_content is None: + thread_safe_print(f" āš ļø Skipping: target_new_content is null") + continue + + elif not target_new_content: + thread_safe_print(f" āš ļø Skipping: target_new_content is empty") + continue + + else: + # Handle content format + thread_safe_print(f" šŸ“„ Content preview: {repr(target_new_content[:80])}...") + + if target_hierarchy.startswith('bottom-'): + # Bottom section special handling + if target_hierarchy.startswith('bottom-modified-'): + # Bottom modified: find and replace existing content at document end + thread_safe_print(f" šŸ”„ Bottom modified section: replacing existing content at document end") + + # Get the old content to search for + source_operation_data = section_data.get('source_operation_data', {}) + old_content = source_operation_data.get('old_content', '').strip() + + if old_content: + # Search backwards from end to find the matching section + found_line = None + for idx in range(len(target_lines) - 1, -1, -1): + line_content = target_lines[idx].strip() + if line_content == old_content: + found_line = idx + thread_safe_print(f" šŸ“ Found target section at line {found_line + 1}: {line_content[:50]}...") + break + + if found_line is not None: + # Find section end + end_line = find_section_end_for_update(target_lines, found_line, target_hierarchy) + + # Ensure content format is correct + if not target_new_content.endswith('\n'): + target_new_content += '\n' + + # Split content by lines + new_lines = target_new_content.splitlines(keepends=True) + + # Replace content + target_lines[found_line:end_line] = new_lines + + thread_safe_print(f" āœ… Replaced {end_line - found_line} lines with {len(new_lines)} lines") + else: + thread_safe_print(f" āš ļø Could not find target section, appending to end instead") + # Fallback: append to end + if not target_new_content.endswith('\n'): + target_new_content += '\n' + if target_lines and target_lines[-1].strip(): + target_new_content = '\n' + target_new_content + new_lines = target_new_content.splitlines(keepends=True) + target_lines.extend(new_lines) + thread_safe_print(f" āœ… Appended {len(new_lines)} lines to end of document") + else: + thread_safe_print(f" āš ļø No old_content found, appending to end instead") + # Fallback: append to end + if not target_new_content.endswith('\n'): + target_new_content += '\n' + if target_lines and target_lines[-1].strip(): + target_new_content = '\n' + target_new_content + new_lines = target_new_content.splitlines(keepends=True) + target_lines.extend(new_lines) + thread_safe_print(f" āœ… Appended {len(new_lines)} lines to end of document") + + elif target_hierarchy.startswith('bottom-added-'): + # Bottom added: append new content to end of document + thread_safe_print(f" šŸ”š Bottom added section: appending new content to end") + + # Ensure content format is correct + if not target_new_content.endswith('\n'): + target_new_content += '\n' + + # Add spacing before new section if needed + if target_lines and target_lines[-1].strip(): + target_new_content = '\n' + target_new_content + + # Split content by lines + new_lines = target_new_content.splitlines(keepends=True) + + # Append to end of document + target_lines.extend(new_lines) + + thread_safe_print(f" āœ… 
Appended {len(new_lines)} lines to end of document")
+                else:
+                    # Other bottom sections: append to end
+                    thread_safe_print(f"   šŸ”š Other bottom section: appending to end of document")
+
+                    # Ensure content format is correct
+                    if not target_new_content.endswith('\n'):
+                        target_new_content += '\n'
+
+                    # Add spacing before new section if needed
+                    if target_lines and target_lines[-1].strip():
+                        target_new_content = '\n' + target_new_content
+
+                    # Split content by lines
+                    new_lines = target_new_content.splitlines(keepends=True)
+
+                    # Append to end of document
+                    target_lines.extend(new_lines)
+
+                    thread_safe_print(f"   āœ… Appended {len(new_lines)} lines to end of document")
+
+            elif target_hierarchy == "frontmatter":
+                # Frontmatter special handling: directly replace front lines
+                thread_safe_print(f"   šŸ“„ Frontmatter mode: directly replacing document beginning")
+
+                # Find the first top-level heading position
+                first_header_line = 0
+                for line_idx, line in enumerate(target_lines):  # distinct name to avoid shadowing the outer section counter
+                    if line.strip().startswith('# '):
+                        first_header_line = line_idx
+                        break
+
+                thread_safe_print(f"   šŸ“ Frontmatter range: line 1 to {first_header_line}")
+
+                # Split new content by lines, preserving original structure including trailing empty lines
+                new_lines = target_new_content.splitlines(keepends=True)
+
+                # If the original content ends with \n, there should be an empty line after the last content line.
+                # splitlines() doesn't create this empty line, so we need to add it manually.
+                if target_new_content.endswith('\n'):
+                    new_lines.append('\n')
+                elif target_new_content:
+                    # If content doesn't end with a newline, ensure the last line has one
+                    if not new_lines[-1].endswith('\n'):
+                        new_lines[-1] += '\n'
+
+                # Replace frontmatter
+                target_lines[0:first_header_line] = new_lines
+
+                thread_safe_print(f"   āœ… Replaced {first_header_line} lines of frontmatter with {len(new_lines)} lines")
+
+            elif insertion_type == "before_reference":
+                # Insert logic: insert before specified line
+                if target_line_num == -1:
+                    thread_safe_print(f"   āŒ Invalid insert operation for bottom section")
+                    continue
+
+                thread_safe_print(f"   šŸ“ Insert mode: inserting before line {target_line_num}")
+
+                # Ensure content format is correct
+                if not target_new_content.endswith('\n'):
+                    target_new_content += '\n'
+
+                # Ensure spacing between sections
+                if not target_new_content.endswith('\n\n'):
+                    target_new_content += '\n'
+
+                # Split content by lines
+                new_lines = target_new_content.splitlines(keepends=True)
+
+                # Insert at specified position
+                insert_position = target_line_num - 1  # Convert to 0-based index
+                if insert_position < 0:
+                    insert_position = 0
+                elif insert_position > len(target_lines):
+                    insert_position = len(target_lines)
+
+                # Execute insertion
+                for j, line in enumerate(new_lines):
+                    target_lines.insert(insert_position + j, line)
+
+                thread_safe_print(f"   āœ… Inserted {len(new_lines)} lines of content")
+
+            else:
+                # Replace logic: find target section and replace
+                if target_line_num == -1:
+                    thread_safe_print(f"   āŒ Invalid replace operation for bottom section")
+                    continue
+
+                thread_safe_print(f"   šŸ”„ Replace mode: replacing section starting at line {target_line_num}")
+
+                # Ensure content format is correct
+                if not target_new_content.endswith('\n'):
+                    target_new_content += '\n'
+
+                # Ensure spacing between sections
+                if not target_new_content.endswith('\n\n'):
+                    target_new_content += '\n'
+
+                # Find section end position
+                start_line = target_line_num - 1  # Convert to 0-based index
+
+                if start_line >= len(target_lines):
+                    thread_safe_print(f"   āŒ Line number out of range: {target_line_num} > {len(target_lines)}")
+                    continue
+
+                # Find section end position
+                end_line = find_section_end_for_update(target_lines, start_line, target_hierarchy)
+
+                thread_safe_print(f"   šŸ“ Replace range: line {start_line + 1} to {end_line}")
+
+                # Split new content by lines
+                new_lines = target_new_content.splitlines(keepends=True)
+
+                # Replace content
+                target_lines[start_line:end_line] = new_lines
+
+                thread_safe_print(f"   āœ… Replaced {end_line - start_line} lines with {len(new_lines)} lines")
+
+    with open(target_file_path, 'w', encoding='utf-8') as f:
+        f.writelines(target_lines)
+
+    thread_safe_print(f"\nāœ… Target document update completed!")
+    thread_safe_print(f"šŸ“„ Updated file: {target_file_path}")
+
+    return True
+
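+# Behaviour sketch for find_section_end_for_update below: given, for example,
+# lines = ["## A\n", "text\n", "### B\n"] and start_line = 0, the function
+# returns 2, so a replace touches only "## A" and its direct text and leaves
+# the "### B" subsection intact.
+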
thread_safe_print(f" āŒ Line number out of range: {target_line_num} > {len(target_lines)}") + continue + + # Find section end position + end_line = find_section_end_for_update(target_lines, start_line, target_hierarchy) + + thread_safe_print(f" šŸ“ Replace range: line {start_line + 1} to {end_line}") + + # Split new content by lines + new_lines = target_new_content.splitlines(keepends=True) + + # Replace content + target_lines[start_line:end_line] = new_lines + + thread_safe_print(f" āœ… Replaced {end_line - start_line} lines with {len(new_lines)} lines") + + + with open(target_file_path, 'w', encoding='utf-8') as f: + f.writelines(target_lines) + + thread_safe_print(f"\nāœ… Target document update completed!") + thread_safe_print(f"šŸ“„ Updated file: {target_file_path}") + + return True + +def find_section_end_for_update(lines, start_line, target_hierarchy): + """Find section end position - based on test_target_update.py logic""" + current_line = lines[start_line].strip() + + if target_hierarchy == "frontmatter": + # Frontmatter special handling: from --- to second ---, then to first top-level heading + if start_line == 0 and current_line.startswith('---'): + # Find second --- + for i in range(start_line + 1, len(lines)): + if lines[i].strip() == '---': + # Found frontmatter end, but need to include up to next content start + # Look for first non-empty line or first heading + for j in range(i + 1, len(lines)): + line = lines[j].strip() + if line and line.startswith('# '): + thread_safe_print(f" šŸ“ Frontmatter ends at line {j} (before first top-level heading)") + return j + elif line and not line.startswith('#'): + # If there's other content, end there + thread_safe_print(f" šŸ“ Frontmatter ends at line {j} (before other content)") + return j + # If no other content found, end after second --- + thread_safe_print(f" šŸ“ Frontmatter ends at line {i+1} (after second ---)") + return i + 1 + # If not standard frontmatter format, find first top-level heading + for i in range(start_line + 1, len(lines)): + if lines[i].strip().startswith('# '): + thread_safe_print(f" šŸ“ Frontmatter ends at line {i} (before first top-level heading)") + return i + # If no top-level heading found, process entire file + return len(lines) + + if current_line.startswith('#'): + # Use file_updater.py method to calculate heading level + current_level = len(current_line.split()[0]) if current_line.split() else 0 + thread_safe_print(f" šŸ” Current heading level: {current_level} (heading: {current_line[:50]}...)") + + # Special handling for top-level headings: only process until first second-level heading + if current_level == 1: + for i in range(start_line + 1, len(lines)): + line = lines[i].strip() + if line.startswith('##'): # Find first second-level heading + thread_safe_print(f" šŸ“ Top-level heading ends at line {i} (before first second-level heading)") + return i + # If no second-level heading found, look for next top-level heading + for i in range(start_line + 1, len(lines)): + line = lines[i].strip() + if line.startswith('#') and not line.startswith('##'): + thread_safe_print(f" šŸ“ Top-level heading ends at line {i} (before next top-level heading)") + return i + else: + # For other level headings, stop at ANY header to get only direct content + # This prevents including sub-sections in the update range + for i in range(start_line + 1, len(lines)): + line = lines[i].strip() + if line.startswith('#'): + # Stop at ANY header to get only direct content + thread_safe_print(f" šŸ“ Found header at line {i}: 
{line[:30]}... (stopping for direct content only)") + return i + + # If not found, return file end + thread_safe_print(f" šŸ“ No end position found, using file end") + return len(lines) + + # Non-heading line, only replace current line + return start_line + 1 diff --git a/scripts/translate_doc_pr/main_workflow.py b/scripts/translate_doc_pr/main_workflow.py new file mode 100644 index 0000000000000..12260334ec206 --- /dev/null +++ b/scripts/translate_doc_pr/main_workflow.py @@ -0,0 +1,691 @@ +""" +Main Entry Point for GitHub Workflow +Orchestrates the entire auto-sync workflow in GitHub Actions environment +""" + +import sys +import os +import json +import threading +import tiktoken +from github import Github, Auth + +# Conditional import for Gemini +try: + from google import genai + GEMINI_AVAILABLE = True +except ImportError: + GEMINI_AVAILABLE = False + +# Import all modules +from pr_analyzer import analyze_source_changes, get_repo_config, get_target_hierarchy_and_content, parse_pr_url +from file_adder import process_added_files +from file_deleter import process_deleted_files +from file_updater import process_files_in_batches, process_added_sections, process_modified_sections, process_deleted_sections +from toc_processor import process_toc_files +from section_matcher import match_source_diff_to_target + +# Configuration from environment variables +SOURCE_PR_URL = os.getenv("SOURCE_PR_URL") +TARGET_PR_URL = os.getenv("TARGET_PR_URL") +GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") +AI_PROVIDER = os.getenv("AI_PROVIDER", "deepseek") +TARGET_REPO_PATH = os.getenv("TARGET_REPO_PATH") + +# AI configuration +DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_TOKEN") +DEEPSEEK_BASE_URL = "https://api.deepseek.com" +GEMINI_API_KEY = os.getenv("GEMINI_API_TOKEN") +GEMINI_MODEL_NAME = "gemini-2.0-flash" + +# Processing limit configuration +MAX_NON_SYSTEM_SECTIONS_FOR_AI = 120 +SOURCE_TOKEN_LIMIT = 5000 # Maximum tokens for source new_content before skipping file processing + +# AI configuration +AI_MAX_TOKENS = 20000 # Maximum tokens for AI translation requests + +# Special file configuration +SPECIAL_FILES = ["TOC.md"] +IGNORE_FILES = ["faq/ddl-faq.md","command-line-flags-for-tidb-configuration.md","pd-configuration-file.md"] + +# Repository configuration for workflow +def get_workflow_repo_configs(): + """Get repository configuration based on environment variables""" + if not SOURCE_PR_URL or not TARGET_PR_URL: + raise ValueError("SOURCE_PR_URL and TARGET_PR_URL must be set") + + # Parse source and target repo info + source_parts = SOURCE_PR_URL.split('/') + target_parts = TARGET_PR_URL.split('/') + + source_owner, source_repo = source_parts[-4], source_parts[-3] + target_owner, target_repo = target_parts[-4], target_parts[-3] + + source_repo_key = f"{source_owner}/{source_repo}" + target_repo_key = f"{target_owner}/{target_repo}" + + # Determine language direction based on repo names + if source_repo.endswith('-cn') and not target_repo.endswith('-cn'): + # Chinese to English + source_language = "Chinese" + target_language = "English" + elif not source_repo.endswith('-cn') and target_repo.endswith('-cn'): + # English to Chinese + source_language = "English" + target_language = "Chinese" + else: + # Default fallback + source_language = "English" + target_language = "Chinese" + + return { + source_repo_key: { + "target_repo": target_repo_key, + "target_local_path": TARGET_REPO_PATH, + "source_language": source_language, + "target_language": target_language + } + } + +# Thread-safe printing function +print_lock 
= threading.Lock()
+
+def thread_safe_print(*args, **kwargs):
+    with print_lock:
+        print(*args, **kwargs)
+
+def ensure_temp_output_dir():
+    """Ensure the temp_output directory exists"""
+    # Get the directory of the current script
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    temp_dir = os.path.join(script_dir, "temp_output")
+    os.makedirs(temp_dir, exist_ok=True)
+    return temp_dir
+
+def clean_temp_output_dir():
+    """Clean the temp_output directory at the start of execution"""
+    import shutil
+    # Get the directory of the current script
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    temp_dir = os.path.join(script_dir, "temp_output")
+    if os.path.exists(temp_dir):
+        if os.path.isdir(temp_dir):
+            shutil.rmtree(temp_dir)
+            print(f"🧹 Cleaned existing temp_output directory")
+        else:
+            # Remove file if it exists
+            os.remove(temp_dir)
+            print(f"🧹 Removed existing temp_output file")
+    os.makedirs(temp_dir, exist_ok=True)
+    print(f"šŸ“ Created temp_output directory: {temp_dir}")
+    return temp_dir
+
+def estimate_tokens(text):
+    """Calculate accurate token count using tiktoken (GPT-4/3.5 encoding)"""
+    if not text:
+        return 0
+    try:
+        enc = tiktoken.get_encoding("cl100k_base")  # GPT-4/3.5 encoding
+        tokens = enc.encode(text)
+        return len(tokens)
+    except Exception as e:
+        # Fall back to character approximation if tiktoken fails
+        thread_safe_print(f"   āš ļø Tiktoken encoding failed: {e}, using character approximation")
+        return len(text) // 4
+
+def print_token_estimation(prompt_text, context="AI translation"):
+    """Print accurate token consumption for a request"""
+    actual_tokens = estimate_tokens(prompt_text)
+    char_count = len(prompt_text)
+    thread_safe_print(f"   šŸ’° {context}")
+    thread_safe_print(f"      šŸ“ Input: {char_count:,} characters")
+    thread_safe_print(f"      šŸ”¢ Actual tokens: {actual_tokens:,} (using tiktoken cl100k_base)")
+    return actual_tokens
+
+class UnifiedAIClient:
+    """Unified interface for different AI providers"""
+
+    def __init__(self, provider="deepseek"):
+        self.provider = provider
+        if provider == "deepseek":
+            from openai import OpenAI
+            self.client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url=DEEPSEEK_BASE_URL)
+            self.model = "deepseek-chat"
+        elif provider == "gemini":
+            if not GEMINI_AVAILABLE:
+                raise ImportError("google-genai package not installed. Run: pip install google-genai")
+            if not GEMINI_API_KEY:
+                raise ValueError("GEMINI_API_TOKEN environment variable must be set")
+            self.client = genai.Client(api_key=GEMINI_API_KEY)
+            self.model = GEMINI_MODEL_NAME
+        else:
+            raise ValueError(f"Unsupported AI provider: {provider}")
+
+    def chat_completion(self, messages, temperature=0.1, max_tokens=20000):
+        """Unified chat completion interface"""
+        if self.provider == "deepseek":
+            response = self.client.chat.completions.create(
+                model=self.model,
+                messages=messages,
+                temperature=temperature,
+                max_tokens=max_tokens
+            )
+            return response.choices[0].message.content.strip()
+        elif self.provider == "gemini":
+            try:
+                # Convert OpenAI-style messages to Gemini format
+                prompt = self._convert_messages_to_prompt(messages)
+                thread_safe_print(f"   šŸ”„ Calling Gemini API...")
+
+                # google-genai SDK call format: generate_content(model=..., contents=...)
+                response = self.client.models.generate_content(
+                    model=self.model,
+                    contents=prompt
+                )
+
+                if response and response.text:
+                    thread_safe_print(f"   āœ… Gemini response received")
+                    return response.text.strip()
+                else:
+                    thread_safe_print(f"   āš ļø Gemini response was empty or blocked")
+                    return "No response from Gemini"
+
+            except Exception as e:
+                thread_safe_print(f"   āŒ Gemini API error: {str(e)}")
+                # Fallback: suggest switching to DeepSeek
+                thread_safe_print(f"   šŸ’” Consider switching to DeepSeek by setting AI_PROVIDER=deepseek")
+                raise e
+
+    def _convert_messages_to_prompt(self, messages):
+        """Convert OpenAI-style messages to a single prompt for Gemini"""
+        prompt_parts = []
+        for message in messages:
+            role = message.get("role", "user")
+            content = message.get("content", "")
+            if role == "user":
+                prompt_parts.append(content)
+            elif role == "system":
+                prompt_parts.append(f"System: {content}")
+        return "\n\n".join(prompt_parts)
+
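+# Usage sketch for UnifiedAIClient (illustrative only; assumes the API keys
+# configured above are set in the environment):
+#
+#   client = UnifiedAIClient(provider="deepseek")
+#   reply = client.chat_completion(
+#       [{"role": "user", "content": "Translate to English: ä½ å„½"}],
+#       temperature=0.1,
+#       max_tokens=200,
+#   )
+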
+def check_source_token_limit(source_diff_dict_file, token_limit=SOURCE_TOKEN_LIMIT):
+    """Check if the total tokens of all new_content in source-diff-dict exceeds the limit"""
+    try:
+        with open(source_diff_dict_file, 'r', encoding='utf-8') as f:
+            source_diff_dict = json.load(f)
+
+        total_new_content = ""
+        section_count = 0
+
+        for key, section_data in source_diff_dict.items():
+            if isinstance(section_data, dict):
+                new_content = section_data.get('new_content', '')
+                if new_content:
+                    total_new_content += new_content + "\n"
+                    section_count += 1
+
+        if not total_new_content.strip():
+            thread_safe_print(f"   āš ļø No new_content found in {source_diff_dict_file}")
+            return True, 0, 0  # Allow processing if no content to check
+
+        total_tokens = estimate_tokens(total_new_content)
+        char_count = len(total_new_content)
+
+        thread_safe_print(f"   šŸ“Š Source token limit check:")
+        thread_safe_print(f"      šŸ“ Total new_content: {char_count:,} characters from {section_count} sections")
+        thread_safe_print(f"      šŸ”¢ Total tokens: {total_tokens:,}")
+        thread_safe_print(f"      🚧 Token limit: {token_limit:,}")
+
+        if total_tokens > token_limit:
+            thread_safe_print(f"      āŒ Token limit exceeded! 
({total_tokens:,} > {token_limit:,})") + return False, total_tokens, token_limit + else: + thread_safe_print(f" āœ… Within token limit ({total_tokens:,} ≤ {token_limit:,})") + return True, total_tokens, token_limit + + except Exception as e: + thread_safe_print(f" āŒ Error checking token limit for {source_diff_dict_file}: {e}") + return True, 0, 0 # Allow processing on error to avoid blocking + +def get_pr_diff(pr_url, github_client): + """Get the diff content from a GitHub PR (from auto-sync-pr-changes.py)""" + try: + from pr_analyzer import parse_pr_url + owner, repo, pr_number = parse_pr_url(pr_url) + repository = github_client.get_repo(f"{owner}/{repo}") + pr = repository.get_pull(pr_number) + + # Get files and their patches + files = pr.get_files() + diff_content = [] + + for file in files: + if file.filename.endswith('.md') and file.patch: + diff_content.append(f"File: {file.filename}") + diff_content.append(file.patch) + diff_content.append("-" * 80) + + return "\n".join(diff_content) + + except Exception as e: + thread_safe_print(f" āŒ Error getting PR diff: {e}") + return None + +def filter_diff_by_operation_type(pr_diff, operation_type, target_sections=None): + """Filter PR diff to only include changes relevant to specific operation type""" + + if not pr_diff: + return "" + + if operation_type == "modified": + # For modified sections, we want the full diff but focus on changed content + return pr_diff + elif operation_type == "added": + # For added sections, we want to show what was added + filtered_lines = [] + for line in pr_diff.split('\n'): + if line.startswith('+') and not line.startswith('+++'): + filtered_lines.append(line) + elif line.startswith('@@') or line.startswith('File:'): + filtered_lines.append(line) + return '\n'.join(filtered_lines) + elif operation_type == "deleted": + # For deleted sections, we want to show what was removed + filtered_lines = [] + for line in pr_diff.split('\n'): + if line.startswith('-') and not line.startswith('---'): + filtered_lines.append(line) + elif line.startswith('@@') or line.startswith('File:'): + filtered_lines.append(line) + return '\n'.join(filtered_lines) + + return pr_diff + +def filter_diff_for_target_file(pr_diff, target_file, source_diff_dict): + """Extract file-specific diff from the complete PR diff based on source files that map to the target file""" + if not pr_diff or not source_diff_dict: + return pr_diff + + # Extract source files that contribute to this target file + source_files = set() + for key, section_data in source_diff_dict.items(): + if isinstance(section_data, dict): + source_file = section_data.get('source_file', '') + if source_file: + source_files.add(source_file) + + if not source_files: + print(f" āš ļø No source files found in source_diff_dict, using complete PR diff") + return pr_diff + + print(f" šŸ“„ Source files contributing to {target_file}: {list(source_files)}") + + # Filter PR diff to only include changes from these source files + filtered_lines = [] + current_file = None + include_section = False + + for line in pr_diff.split('\n'): + if line.startswith('File: '): + current_file = line.replace('File: ', '').strip() + include_section = current_file in source_files + if include_section: + filtered_lines.append(line) + elif line.startswith('-' * 80): + if include_section: + filtered_lines.append(line) + elif include_section: + filtered_lines.append(line) + + file_specific_diff = '\n'.join(filtered_lines) + print(f" šŸ“Š Filtered diff: {len(file_specific_diff)} chars (from {len(pr_diff)} 
chars)") + + return file_specific_diff if file_specific_diff.strip() else pr_diff + +def extract_file_diff_from_pr(pr_diff, source_file_path): + """Extract diff content for a specific source file from the complete PR diff""" + if not pr_diff: + return "" + + filtered_lines = [] + current_file = None + include_section = False + + for line in pr_diff.split('\n'): + if line.startswith('File: '): + current_file = line.replace('File: ', '').strip() + include_section = (current_file == source_file_path) + if include_section: + filtered_lines.append(line) + elif line.startswith('-' * 80): + if include_section: + filtered_lines.append(line) + include_section = False # End of this file's section + elif include_section: + filtered_lines.append(line) + + return '\n'.join(filtered_lines) + +def determine_file_processing_type(source_file_path, file_sections, special_files=None): + """Determine how to process a file based on operation type and file characteristics""" + + # Check if this is a special file (like TOC.md) + if special_files and os.path.basename(source_file_path) in special_files: + return "special_file_toc" + + # For all other modified files, use regular processing + return "regular_modified" + +def process_regular_modified_file(source_file_path, file_sections, file_diff, pr_url, github_client, ai_client, repo_config, max_sections): + """Process a regular markdown file that has been modified""" + try: + print(f" šŸ“ Processing as regular modified file: {source_file_path}") + + # Extract the actual sections from the file_sections structure + # file_sections contains: {'sections': {...}, 'original_hierarchy': {...}, 'current_hierarchy': {...}} + if isinstance(file_sections, dict) and 'sections' in file_sections: + actual_sections = file_sections['sections'] + else: + # Fallback: assume file_sections is already the sections dict + actual_sections = file_sections + + print(f" šŸ“Š Extracted sections: {len(actual_sections)} sections") + + # CRITICAL: Load the source-diff-dict.json and perform matching + import json + import os + from section_matcher import match_source_diff_to_target + from pr_analyzer import get_target_hierarchy_and_content + + # Load source-diff-dict.json with file prefix + temp_dir = ensure_temp_output_dir() + file_prefix = source_file_path.replace('/', '-').replace('.md', '') + source_diff_dict_file = os.path.join(temp_dir, f"{file_prefix}-source-diff-dict.json") + if os.path.exists(source_diff_dict_file): + with open(source_diff_dict_file, 'r', encoding='utf-8') as f: + source_diff_dict = json.load(f) + print(f" šŸ“‚ Loaded source diff dict with {len(source_diff_dict)} sections from {source_diff_dict_file}") + + # Check source token limit before proceeding with processing + print(f" šŸ” Checking source token limit...") + within_limit, total_tokens, token_limit = check_source_token_limit(source_diff_dict_file) + if not within_limit: + print(f" 🚫 Skipping file processing: source content exceeds token limit") + print(f" šŸ“Š Total tokens: {total_tokens:,} > Limit: {token_limit:,}") + print(f" ā­ļø File {source_file_path} will not be processed") + return False + + else: + print(f" āŒ {source_diff_dict_file} not found") + return False + + # Get target file hierarchy and content + target_repo = repo_config['target_repo'] + target_hierarchy, target_lines = get_target_hierarchy_and_content(source_file_path, github_client, target_repo) + + if not target_hierarchy or not target_lines: + print(f" āŒ Could not get target file content for {source_file_path}") + return False + + 
print(f" šŸ“– Target file: {len(target_hierarchy)} sections, {len(target_lines)} lines") + + # Perform source diff to target matching + print(f" šŸ”— Matching source diff to target...") + enhanced_sections = match_source_diff_to_target( + source_diff_dict, + target_hierarchy, + target_lines, + ai_client, + repo_config, + max_sections, + AI_MAX_TOKENS + ) + + if not enhanced_sections: + print(f" āŒ No sections matched") + return False + + print(f" āœ… Matched {len(enhanced_sections)} sections") + + # Save the match result for reference + match_file = os.path.join(temp_dir, f"{source_file_path.replace('/', '-').replace('.md', '')}-match_source_diff_to_target.json") + with open(match_file, 'w', encoding='utf-8') as f: + json.dump(enhanced_sections, f, ensure_ascii=False, indent=2) + print(f" šŸ’¾ Saved match result to: {match_file}") + + # Step 2: Get AI translation for the matched sections + print(f" šŸ¤– Getting AI translation for matched sections...") + + # Create file data structure with enhanced matching info + # Wrap enhanced_sections in the expected format for process_single_file + file_data = { + source_file_path: { + 'type': 'enhanced_sections', + 'sections': enhanced_sections + } + } + + # Call the existing process_modified_sections function to get AI translation + results = process_modified_sections(file_data, file_diff, pr_url, github_client, ai_client, repo_config, max_sections) + + # Step 3: Update match_source_diff_to_target.json with AI results + if results and len(results) > 0: + file_path, success, ai_updated_sections = results[0] # Get first result + if success and isinstance(ai_updated_sections, dict): + print(f" šŸ“ Step 3: Updating {match_file} with AI results...") + + # Load current match_source_diff_to_target.json + with open(match_file, 'r', encoding='utf-8') as f: + match_data = json.load(f) + + # Add target_new_content field to each section based on AI results + updated_count = 0 + for key, section_data in match_data.items(): + operation = section_data.get('source_operation', '') + + if operation == 'deleted': + # For deleted sections, set target_new_content to null + section_data['target_new_content'] = None + elif key in ai_updated_sections: + # For modified/added sections with AI translation + section_data['target_new_content'] = ai_updated_sections[key] + updated_count += 1 + else: + # For sections not translated, keep original content + section_data['target_new_content'] = section_data.get('target_content', '') + + # Save updated match_source_diff_to_target.json + with open(match_file, 'w', encoding='utf-8') as f: + json.dump(match_data, f, ensure_ascii=False, indent=2) + + print(f" āœ… Updated {updated_count} sections with AI translations in {match_file}") + + # Step 4: Apply updates to target document using update_target_document_from_match_data + print(f" šŸ“ Step 4: Applying updates to target document...") + from file_updater import update_target_document_from_match_data + + success = update_target_document_from_match_data(match_file, repo_config['target_local_path'], source_file_path) + if success: + print(f" šŸŽ‰ Target document successfully updated!") + return True + else: + print(f" āŒ Failed to update target document") + return False + + else: + print(f" āš ļø AI translation failed or returned invalid results") + return False + else: + print(f" āš ļø No results from process_modified_sections") + return False + + except Exception as e: + print(f" āŒ Error processing regular modified file {source_file_path}: {e}") + return False + + +def 
get_workflow_repo_config(pr_url, repo_configs): + """Get repository configuration for workflow environment""" + from pr_analyzer import parse_pr_url + + owner, repo, pr_number = parse_pr_url(pr_url) + source_repo = f"{owner}/{repo}" + + if source_repo not in repo_configs: + raise ValueError(f"Unsupported source repository: {source_repo}. Supported: {list(repo_configs.keys())}") + + config = repo_configs[source_repo].copy() + config['source_repo'] = source_repo + config['pr_number'] = pr_number + + return config + +def main(): + """Main function - orchestrates the entire workflow for GitHub Actions""" + + # Validate environment variables + if not all([SOURCE_PR_URL, TARGET_PR_URL, GITHUB_TOKEN, TARGET_REPO_PATH]): + print("āŒ Missing required environment variables:") + print(f" SOURCE_PR_URL: {SOURCE_PR_URL}") + print(f" TARGET_PR_URL: {TARGET_PR_URL}") + print(f" GITHUB_TOKEN: {'Set' if GITHUB_TOKEN else 'Not set'}") + print(f" TARGET_REPO_PATH: {TARGET_REPO_PATH}") + return + + print(f"šŸ”§ Auto PR Sync Tool (GitHub Workflow Version)") + print(f"šŸ“ Source PR URL: {SOURCE_PR_URL}") + print(f"šŸ“ Target PR URL: {TARGET_PR_URL}") + print(f"šŸ¤– AI Provider: {AI_PROVIDER}") + print(f"šŸ“ Target Repo Path: {TARGET_REPO_PATH}") + + # Clean and prepare temp_output directory + clean_temp_output_dir() + + # Get repository configuration using workflow config + try: + repo_configs = get_workflow_repo_configs() + repo_config = get_workflow_repo_config(SOURCE_PR_URL, repo_configs) + print(f"šŸ“ Source Repo: {repo_config['source_repo']} ({repo_config['source_language']})") + print(f"šŸ“ Target Repo: {repo_config['target_repo']} ({repo_config['target_language']})") + print(f"šŸ“ Target Path: {repo_config['target_local_path']}") + except ValueError as e: + print(f"āŒ {e}") + return + + # Initialize clients + auth = Auth.Token(GITHUB_TOKEN) + github_client = Github(auth=auth) + + # Initialize unified AI client + try: + ai_client = UnifiedAIClient(provider=AI_PROVIDER) + thread_safe_print(f"šŸ¤– AI Provider: {AI_PROVIDER.upper()} ({ai_client.model})") + except Exception as e: + thread_safe_print(f"āŒ Failed to initialize AI client: {e}") + return + + print(f"\nšŸš€ Starting auto-sync for PR: {SOURCE_PR_URL}") + + # Step 1: Get PR diff + print(f"\nšŸ“‹ Step 1: Getting PR diff...") + pr_diff = get_pr_diff(SOURCE_PR_URL, github_client) + if not pr_diff: + print("āŒ Could not get PR diff") + return + print(f"āœ… Got PR diff: {len(pr_diff)} characters") + + # Step 2: Analyze source changes with operation categorization + print(f"\nšŸ“Š Step 2: Analyzing source changes...") + added_sections, modified_sections, deleted_sections, added_files, deleted_files, toc_files = analyze_source_changes( + SOURCE_PR_URL, github_client, + special_files=SPECIAL_FILES, + ignore_files=IGNORE_FILES, + repo_configs=repo_configs, + max_non_system_sections=MAX_NON_SYSTEM_SECTIONS_FOR_AI, + pr_diff=pr_diff # Pass the PR diff to avoid re-fetching + ) + + # Step 3: Process different types of files based on operation type + print(f"\nšŸ“‹ Step 3: Processing files based on operation type...") + + # Import necessary functions + from file_updater import process_modified_sections, update_target_document_from_match_data + from toc_processor import process_toc_files + + # Step 3.1: Process deleted files (file-level deletions) + if deleted_files: + print(f"\nšŸ—‘ļø Step 3.1: Processing {len(deleted_files)} deleted files...") + process_deleted_files(deleted_files, github_client, repo_config) + print(f" āœ… Deleted files processed") + 
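+    # Note: files are processed as deletions first, then additions, then TOC
+    # files, then section-level modifications, presumably so the later steps
+    # never operate on files that no longer exist in the target repository.
+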
+ # Step 3.2: Process added files (file-level additions) + if added_files: + print(f"\nšŸ“„ Step 3.2: Processing {len(added_files)} added files...") + process_added_files(added_files, SOURCE_PR_URL, github_client, ai_client, repo_config) + print(f" āœ… Added files processed") + + # Step 3.3: Process special files (TOC.md and similar) + if toc_files: + print(f"\nšŸ“‹ Step 3.3: Processing {len(toc_files)} special files (TOC)...") + process_toc_files(toc_files, SOURCE_PR_URL, github_client, ai_client, repo_config) + print(f" āœ… Special files processed") + + # Step 3.4: Process modified files (section-level modifications) + if modified_sections: + print(f"\nšŸ“ Step 3.4: Processing {len(modified_sections)} modified files...") + + # Process each modified file separately + for source_file_path, file_sections in modified_sections.items(): + print(f"\nšŸ“„ Processing modified file: {source_file_path}") + + # Extract file-specific diff from the complete PR diff + print(f" šŸ” Extracting file-specific diff for: {source_file_path}") + file_specific_diff = extract_file_diff_from_pr(pr_diff, source_file_path) + + if not file_specific_diff: + print(f" āš ļø No diff found for {source_file_path}, skipping...") + continue + + print(f" šŸ“Š File-specific diff: {len(file_specific_diff)} chars") + + # Determine file processing approach for modified files + file_type = determine_file_processing_type(source_file_path, file_sections, SPECIAL_FILES) + print(f" šŸ” File processing type: {file_type}") + + if file_type == "special_file_toc": + # Special files should have been processed in Step 3.3, skip here + print(f" ā­ļø Special file already processed in Step 3.3, skipping...") + continue + + elif file_type == "regular_modified": + # Regular markdown files with modifications + success = process_regular_modified_file( + source_file_path, + file_sections, + file_specific_diff, + SOURCE_PR_URL, + github_client, + ai_client, + repo_config, + MAX_NON_SYSTEM_SECTIONS_FOR_AI + ) + + if success: + print(f" āœ… Successfully processed {source_file_path}") + else: + print(f" āŒ Failed to process {source_file_path}") + + else: + print(f" āš ļø Unknown file processing type: {file_type} for {source_file_path}, skipping...") + + # Final summary + print(f"šŸ“Š Summary:") + print(f" šŸ“„ Added files: {len(added_files)} processed") + print(f" šŸ—‘ļø Deleted files: {len(deleted_files)} processed") + print(f" šŸ“‹ TOC files: {len(toc_files)} processed") + print(f" šŸ“ Modified files: {len(modified_sections)} processed") + print(f"šŸŽ‰ Workflow completed successfully!") + +if __name__ == "__main__": + main() diff --git a/scripts/translate_doc_pr/pr_analyzer.py b/scripts/translate_doc_pr/pr_analyzer.py new file mode 100644 index 0000000000000..c164da1520163 --- /dev/null +++ b/scripts/translate_doc_pr/pr_analyzer.py @@ -0,0 +1,1447 @@ +#!/usr/bin/env python3 +""" +PR Analyzer Module +Handles PR analysis, diff parsing, content getting, hierarchy building, and section getting +""" + +import json +import os +import re +import threading +from github import Github + +# Thread-safe printing +print_lock = threading.Lock() + +def thread_safe_print(*args, **kwargs): + """Thread-safe print function""" + with print_lock: + print(*args, **kwargs) + + +def parse_pr_url(pr_url): + """Parse PR URL to get repo info""" + parts = pr_url.split('/') + return parts[-4], parts[-3], int(parts[-1]) # owner, repo, pr_number + +def get_repo_config(pr_url, repo_configs): + """Get repository configuration based on source repo""" + owner, repo, 
pr_number = parse_pr_url(pr_url) + source_repo = f"{owner}/{repo}" + + if source_repo not in repo_configs: + raise ValueError(f"Unsupported source repository: {source_repo}. Supported: {list(repo_configs.keys())}") + + config = repo_configs[source_repo].copy() + config['source_repo'] = source_repo + config['pr_number'] = pr_number + + return config + +def get_pr_diff(pr_url, github_client): + """Get the diff content from a GitHub PR""" + try: + owner, repo, pr_number = parse_pr_url(pr_url) + repository = github_client.get_repo(f"{owner}/{repo}") + pr = repository.get_pull(pr_number) + + # Get files and their patches + files = pr.get_files() + diff_content = [] + + for file in files: + if file.filename.endswith('.md') and file.patch: + diff_content.append(f"File: {file.filename}") + diff_content.append(file.patch) + diff_content.append("-" * 80) + + return "\n".join(diff_content) + + except Exception as e: + print(f" āŒ Error getting PR diff: {e}") + return None + +def get_changed_line_ranges(file): + """Get the ranges of lines that were changed in the PR""" + changed_ranges = [] + patch = file.patch + if not patch: + return changed_ranges + + lines = patch.split('\n') + current_line = 0 + + for line in lines: + if line.startswith('@@'): + # Parse the hunk header to get line numbers + match = re.search(r'\+(\d+),?(\d+)?', line) + if match: + current_line = int(match.group(1)) + elif line.startswith('+') and not line.startswith('+++'): + # This is an added line + changed_ranges.append(current_line) + current_line += 1 + elif line.startswith('-') and not line.startswith('---'): + # This is a deleted line, also consider as changed + changed_ranges.append(current_line) + # Don't increment current_line for deleted lines + continue + elif line.startswith(' '): + # Context line + current_line += 1 + + return changed_ranges + +def analyze_diff_operations(file): + """Analyze diff to categorize operations as added, modified, or deleted (improved GitHub-like approach)""" + operations = { + 'added_lines': [], # Lines that were added + 'deleted_lines': [], # Lines that were deleted + 'modified_lines': [] # Lines that were modified (both added and deleted content) + } + + patch = file.patch + if not patch: + return operations + + lines = patch.split('\n') + current_line = 0 + deleted_line = 0 + + # Parse diff and keep track of sequence order for better modification detection + diff_sequence = [] # Track the order of operations in diff + + for i, line in enumerate(lines): + if line.startswith('@@'): + # Parse the hunk header to get line numbers + # Format: @@ -old_start,old_count +new_start,new_count @@ + match = re.search(r'-(\d+),?(\d+)?\s+\+(\d+),?(\d+)?', line) + if match: + deleted_line = int(match.group(1)) + current_line = int(match.group(3)) + elif line.startswith('+') and not line.startswith('+++'): + # This is an added line + added_entry = { + 'line_number': current_line, + 'content': line[1:], # Remove the '+' prefix + 'is_header': line[1:].strip().startswith('#'), + 'diff_index': i # Track position in diff + } + operations['added_lines'].append(added_entry) + diff_sequence.append(('added', added_entry)) + current_line += 1 + elif line.startswith('-') and not line.startswith('---'): + # This is a deleted line + deleted_entry = { + 'line_number': deleted_line, + 'content': line[1:], # Remove the '-' prefix + 'is_header': line[1:].strip().startswith('#'), + 'diff_index': i # Track position in diff + } + operations['deleted_lines'].append(deleted_entry) + diff_sequence.append(('deleted', 
deleted_entry))
+            deleted_line += 1
+        elif line.startswith(' '):
+            # Context line (unchanged)
+            current_line += 1
+            deleted_line += 1
+
+    # GitHub-like modification detection: based on diff sequence proximity
+    modified_pairs = []
+    deleted_headers = [d for d in operations['deleted_lines'] if d['is_header']]
+    added_headers = [a for a in operations['added_lines'] if a['is_header']]
+
+    used_added_indices = set()
+    used_deleted_indices = set()
+
+    # Helper function for semantic similarity
+    def are_headers_similar(old, new):
+        # Remove markdown markers
+        old_clean = old.replace('#', '').replace('`', '').strip()
+        new_clean = new.replace('#', '').replace('`', '').strip()
+
+        # Check if one is a substring/extension of the other
+        if old_clean in new_clean or new_clean in old_clean:
+            return True
+
+        # Check for similar patterns (like appending -pu, -new, etc.)
+        old_base = old_clean.split('-')[0]
+        new_base = new_clean.split('-')[0]
+        if old_base and new_base and old_base == new_base:
+            return True
+
+        return False
+
+    # GitHub-like approach: look for adjacent or close operations in the diff sequence
+    for i, deleted_header in enumerate(deleted_headers):
+        if i in used_deleted_indices:
+            continue
+
+        for j, added_header in enumerate(added_headers):
+            if j in used_added_indices:
+                continue
+
+            deleted_content = deleted_header['content'].strip()
+            added_content = added_header['content'].strip()
+
+            # Check if they are close in the diff sequence (GitHub's approach)
+            diff_distance = abs(added_header['diff_index'] - deleted_header['diff_index'])
+            is_close_in_diff = diff_distance <= 5  # Allow small gap for context lines
+
+            # Check semantic similarity
+            is_similar = are_headers_similar(deleted_content, added_content)
+
+            # GitHub-like logic: prioritize diff proximity + semantic similarity
+            if is_close_in_diff and is_similar:
+                modified_pairs.append({
+                    'deleted': deleted_header,
+                    'added': added_header,
+                    'original_content': deleted_header['content']
+                })
+                used_added_indices.add(j)
+                used_deleted_indices.add(i)
+                break
+            # Fallback: strong semantic similarity even if not adjacent
+            elif is_similar and abs(added_header['line_number'] - deleted_header['line_number']) <= 20:
+                modified_pairs.append({
+                    'deleted': deleted_header,
+                    'added': added_header,
+                    'original_content': deleted_header['content']
+                })
+                used_added_indices.add(j)
+                used_deleted_indices.add(i)
+                break
+
+    # Remove identified modifications from pure additions/deletions
+    for pair in modified_pairs:
+        if pair['deleted'] in operations['deleted_lines']:
+            operations['deleted_lines'].remove(pair['deleted'])
+        if pair['added'] in operations['added_lines']:
+            operations['added_lines'].remove(pair['added'])
+        # Store both new and original content for modified headers
+        modified_entry = pair['added'].copy()
+        modified_entry['original_content'] = pair['original_content']
+        operations['modified_lines'].append(modified_entry)
+
+    return operations
+
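+# Pairing sketch (hypothetical input): if a patch removes the header line
+# "## `tidb_enable_foo`" and adds "## `tidb_enable_foo-v2`" within five diff
+# lines of it, are_headers_similar() matches on the shared "tidb_enable_foo"
+# base, so the pair is recorded as one modified header rather than as a
+# separate deletion and addition.
+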
a code block + in_code_block = True + code_block_delimiter = line[:3] # Store the delimiter type + continue + elif line.startswith(code_block_delimiter): + # Exiting a code block + in_code_block = False + code_block_delimiter = None + continue + + # Skip processing if we're inside a code block + if in_code_block: + continue + + # Process headers only if not in code block + if line.startswith('#'): + match = re.match(r'^(#{1,10})\s+(.+)', line) + if match: + level = len(match.group(1)) + title = match.group(2).strip() + + # Remove items from stack that are at same or deeper level + while level_stack and level_stack[-1][0] >= level: + level_stack.pop() + + # Build hierarchy with special handling for top-level titles + if level == 1: + # Top-level titles are included directly without hierarchy path + hierarchy_line = line + elif level_stack: + # For other levels, build path but skip the top-level title (level 1) + path_parts = [item[1] for item in level_stack if item[0] > 1] # Skip level 1 items + path_parts.append(line) + hierarchy_line = " > ".join(path_parts) + else: + # Fallback for other cases + hierarchy_line = line + + if hierarchy_line: # Only add non-empty hierarchies + all_hierarchy_dict[line_num] = hierarchy_line + + level_stack.append((level, line)) + + return all_hierarchy_dict + +def build_hierarchy_path(lines, line_num, all_headers): + """Build the full hierarchy path for a header at given line""" + if line_num not in all_headers: + return [] + + current_header = all_headers[line_num] + current_level = current_header['level'] + hierarchy_path = [] + + # Find all parent headers + for check_line in sorted(all_headers.keys()): + if check_line >= line_num: + break + + header = all_headers[check_line] + if header['level'] < current_level: + # This is a potential parent + # Remove any headers at same or deeper level + while hierarchy_path and hierarchy_path[-1]['level'] >= header['level']: + hierarchy_path.pop() + hierarchy_path.append(header) + + # Add current header + hierarchy_path.append(current_header) + + return hierarchy_path + +def build_hierarchy_for_modified_section(file_content, target_line_num, original_line, base_hierarchy_dict): + """Build hierarchy path for a modified section using original content""" + lines = file_content.split('\n') + + # Get the level of the original header + original_match = re.match(r'^(#{1,10})\s+(.+)', original_line) + if not original_match: + return None + + original_level = len(original_match.group(1)) + original_title = original_match.group(2).strip() + + # Find parent sections by looking backwards from target line + level_stack = [] + + for line_num in range(1, target_line_num): + if line_num in base_hierarchy_dict: + # This is a header line + line_content = lines[line_num - 1].strip() + if line_content.startswith('#'): + match = re.match(r'^(#{1,10})\s+(.+)', line_content) + if match: + level = len(match.group(1)) + title = match.group(2).strip() + + # Remove items from stack that are at same or deeper level + while level_stack and level_stack[-1][0] >= level: + level_stack.pop() + + # Add this header to stack if it's a potential parent + if level < original_level: + level_stack.append((level, line_content)) + + # Build hierarchy path using original content + if level_stack: + path_parts = [item[1] for item in level_stack[1:]] # Skip first level + path_parts.append(original_line) + hierarchy_line = " > ".join(path_parts) + else: + hierarchy_line = original_line if original_level > 1 else "" + + return hierarchy_line if hierarchy_line else 
None + +def find_section_boundaries(lines, hierarchy_dict): + """Find the start and end line for each section based on hierarchy""" + section_boundaries = {} + + # Sort sections by line number + sorted_sections = sorted(hierarchy_dict.items(), key=lambda x: int(x[0])) + + for i, (line_num, hierarchy) in enumerate(sorted_sections): + start_line = int(line_num) - 1 # Convert to 0-based index + + # Find end line (start of next section at same or higher level) + end_line = len(lines) # Default to end of document + + if start_line >= len(lines): + continue + + # Get current section level + current_line = lines[start_line].strip() + if not current_line.startswith('#'): + continue + + current_level = len(current_line.split()[0]) # Count # characters + + # Look for next section at same or higher level + for j in range(start_line + 1, len(lines)): + line = lines[j].strip() + if line.startswith('#'): + line_level = len(line.split()[0]) if line.split() else 0 + if line_level <= current_level: + end_line = j + break + + section_boundaries[line_num] = { + 'start': start_line, + 'end': end_line, + 'hierarchy': hierarchy, + 'level': current_level + } + + return section_boundaries + +def extract_section_content(lines, start_line, hierarchy_dict): + """Extract the content of a section starting from start_line (includes sub-sections)""" + if not lines or start_line < 1 or start_line > len(lines): + return "" + + start_index = start_line - 1 # Convert to 0-based index + section_content = [] + + # Find the header at start_line + current_line = lines[start_index].strip() + if not current_line.startswith('#'): + return "" + + # Get the level of current header + current_level = len(current_line.split()[0]) # Count # characters + section_content.append(current_line) + + # Special handling for top-level titles (level 1) + if current_level == 1: + # For top-level titles, only extract content until the first next-level header (##) + for i in range(start_index + 1, len(lines)): + line = lines[i].strip() + + if line.startswith('#'): + # Check if this is a header of next level (##, ###, etc.) 
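+                # Worked example of the level-counting idiom used below
+                # (illustrative values): '### Usage notes'.split()[0] is '###',
+                # so len(...) gives level 3; a bare '#' line gives level 1. A
+                # header written without a space, e.g. '#Title', would count
+                # the whole token, a known limitation of this idiom.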
+ line_level = len(line.split()[0]) if line.split() else 0 + if line_level > current_level: + # Found first subsection, stop here for top-level titles + break + elif line_level <= current_level: + # Found same or higher level header, also stop + break + + section_content.append(lines[i].rstrip()) # Keep original line without trailing whitespace + else: + # For non-top-level titles, use the original logic + # Extract content until we hit the next header of same or higher level + for i in range(start_index + 1, len(lines)): + line = lines[i].strip() + + if line.startswith('#'): + # Check if this is a header of same or higher level + line_level = len(line.split()[0]) if line.split() else 0 + if line_level <= current_level: + # Found a header of same or higher level, stop here regardless + # Each section should be extracted individually + break + + section_content.append(lines[i].rstrip()) # Keep original line without trailing whitespace + + return '\n'.join(section_content) + +def extract_section_direct_content(lines, start_line): + """Extract ONLY the direct content of a section (excluding sub-sections) - for source diff dict""" + if not lines or start_line < 1 or start_line > len(lines): + return "" + + start_index = start_line - 1 # Convert to 0-based index + section_content = [] + + # Find the header at start_line + current_line = lines[start_index].strip() + if not current_line.startswith('#'): + return "" + + # Add the header line + section_content.append(current_line) + + # Only extract until the first header (any level) + # This means we stop at ANY header - whether it's a sub-section OR same/higher level + for i in range(start_index + 1, len(lines)): + line = lines[i].strip() + if line.startswith('#'): + # Stop at ANY header to get only direct content + break + section_content.append(lines[i].rstrip()) + + return '\n'.join(section_content) + +def extract_frontmatter_content(file_lines): + """Extract content from the beginning of file to the first top-level header""" + if not file_lines: + return "" + + frontmatter_lines = [] + for i, line in enumerate(file_lines): + line_stripped = line.strip() + # Stop when we hit the first top-level header + if line_stripped.startswith('# '): + break + frontmatter_lines.append(line.rstrip()) + + return '\n'.join(frontmatter_lines) + + +def extract_affected_sections(hierarchy_dict, file_lines): + """Extract all affected sections based on hierarchy dict""" + affected_sections = {} + + for line_num, hierarchy in hierarchy_dict.items(): + if line_num == "0" and hierarchy == "frontmatter": + # Special handling for frontmatter + frontmatter_content = extract_frontmatter_content(file_lines) + if frontmatter_content: + affected_sections[line_num] = frontmatter_content + else: + line_number = int(line_num) + section_content = extract_section_content(file_lines, line_number, hierarchy_dict) + + if section_content: + affected_sections[line_num] = section_content + + return affected_sections + +def find_containing_section(line_num, all_headers): + """Find which section a line belongs to""" + current_section = None + for header_line_num in sorted(all_headers.keys()): + if header_line_num <= line_num: + current_section = header_line_num + else: + break + return current_section + +def find_affected_sections(lines, changed_lines, all_headers): + """Find which sections are affected by the changes""" + affected_sections = set() + + for changed_line in changed_lines: + # Find the section this changed line belongs to + current_section = None + + # Find the most recent 
header before or at the changed line + for line_num in sorted(all_headers.keys()): + if line_num <= changed_line: + current_section = line_num + else: + break + + if current_section: + # Only add the directly affected section (the one that directly contains the change) + affected_sections.add(current_section) + + return affected_sections + +def find_sections_by_operation_type(lines, operations, all_headers, base_hierarchy_dict=None): + """Find sections affected by different types of operations""" + sections = { + 'added': set(), + 'modified': set(), + 'deleted': set() + } + + # Process added lines + for added_line in operations['added_lines']: + line_num = added_line['line_number'] + if added_line['is_header']: + # This is a new header - only mark the section as added if the header itself is new + sections['added'].add(line_num) + # Note: We don't mark sections as "added" just because they contain new non-header content + # That would be a "modified" section, not an "added" section + + # Process modified lines + for modified_line in operations['modified_lines']: + line_num = modified_line['line_number'] + if modified_line['is_header']: + sections['modified'].add(line_num) + else: + section = find_containing_section(line_num, all_headers) + if section: + sections['modified'].add(section) + + # Process deleted lines - use base hierarchy to find deleted sections + for deleted_line in operations['deleted_lines']: + if deleted_line['is_header']: + # Find this header in the base file hierarchy (before deletion) + deleted_title = clean_title_for_matching(deleted_line['content']) + # Use base hierarchy if available, otherwise fall back to current headers + search_hierarchy = base_hierarchy_dict if base_hierarchy_dict else all_headers + + found_deleted = False + for line_num, hierarchy_line in search_hierarchy.items(): + # Extract title from hierarchy line + if ' > ' in hierarchy_line: + original_title = clean_title_for_matching(hierarchy_line.split(' > ')[-1]) + else: + original_title = clean_title_for_matching(hierarchy_line) + + if deleted_title == original_title: + sections['deleted'].add(line_num) + print(f" šŸ—‘ļø Detected deleted section: {deleted_line['content']} (line {line_num})") + found_deleted = True + break + + if not found_deleted: + # If not found by exact match, try partial matching for renamed sections + print(f" āš ļø Could not find deleted section: {deleted_line['content']}") + + return sections + + +def get_target_hierarchy_and_content(file_path, github_client, target_repo): + """Get target hierarchy and content""" + try: + repository = github_client.get_repo(target_repo) + file_content = repository.get_contents(file_path, ref="master").decoded_content.decode('utf-8') + lines = file_content.split('\n') + + # Build hierarchy using same method + hierarchy = build_hierarchy_dict(file_content) + + return hierarchy, lines + except Exception as e: + print(f" āŒ Error getting target file: {e}") + return {}, [] + +def get_source_sections_content(pr_url, file_path, source_affected, github_client): + """Get the content of source sections for better context""" + try: + owner, repo, pr_number = parse_pr_url(pr_url) + repository = github_client.get_repo(f"{owner}/{repo}") + pr = repository.get_pull(pr_number) + + # Get the source file content + file_content = repository.get_contents(file_path, ref=pr.head.sha).decoded_content.decode('utf-8') + lines = file_content.split('\n') + + # Extract source sections + source_sections = {} + + for line_num, hierarchy in source_affected.items(): + if 
line_num == "0" and hierarchy == "frontmatter":
+                # Special handling for frontmatter
+                frontmatter_content = extract_frontmatter_content(lines)
+                if frontmatter_content:
+                    source_sections[line_num] = frontmatter_content
+            else:
+                line_number = int(line_num)
+                section_content = extract_section_content(lines, line_number, source_affected)
+                if section_content:
+                    source_sections[line_num] = section_content
+
+        return source_sections
+    except Exception as e:
+        thread_safe_print(f"   āš ļø Could not get source sections: {e}")
+        return {}
+
+def get_source_file_hierarchy(file_path, pr_url, github_client, get_base_version=False):
+    """Get source file hierarchy from PR head or base"""
+    try:
+        owner, repo, pr_number = parse_pr_url(pr_url)
+        repository = github_client.get_repo(f"{owner}/{repo}")
+        pr = repository.get_pull(pr_number)
+
+        if get_base_version:
+            # Get the source file content before PR changes (base version)
+            source_file_content = repository.get_contents(file_path, ref=pr.base.sha).decoded_content.decode('utf-8')
+        else:
+            # Get the source file content after PR changes (head version)
+            source_file_content = repository.get_contents(file_path, ref=pr.head.sha).decoded_content.decode('utf-8')
+
+        source_hierarchy = build_hierarchy_dict(source_file_content)
+
+        return source_hierarchy
+
+    except Exception as e:
+        thread_safe_print(f"   āŒ Error getting source file hierarchy: {e}")
+        return {}
+
+# Helper function needed for find_sections_by_operation_type
+def clean_title_for_matching(title):
+    """Clean title for matching by removing markdown formatting and span elements"""
+    if not title:
+        return ""
+
+    # Remove span elements like <span class="version-mark">New in v5.0</span>
+    title = re.sub(r'<span[^>]*>.*?</span>', '', title)
+
+    # Remove markdown header prefix (# ## ### etc.)
+    title = re.sub(r'^#{1,6}\s*', '', title.strip())
+
+    # Remove backticks
+    title = title.replace('`', '')
+
+    # Strip whitespace
+    title = title.strip()
+
+    return title
+
+def find_previous_section_for_added(added_sections, hierarchy_dict):
+    """Find the previous section hierarchy for each added section group"""
+    insertion_points = {}
+
+    if not added_sections:
+        return insertion_points
+
+    # Group consecutive added sections
+    added_list = sorted(list(added_sections))
+    groups = []
+    current_group = [added_list[0]]
+
+    for i in range(1, len(added_list)):
+        if added_list[i] - added_list[i-1] <= 10:  # Consider sections within 10 lines as consecutive
+            current_group.append(added_list[i])
+        else:
+            groups.append(current_group)
+            current_group = [added_list[i]]
+    groups.append(current_group)
+
+    # For each group, find the previous section hierarchy
+    for group in groups:
+        first_new_section = min(group)
+
+        # Find the section that comes before this group
+        previous_section_line = None
+        previous_section_hierarchy = None
+
+        for line_num_str in sorted(hierarchy_dict.keys(), key=int):
+            line_num = int(line_num_str)
+            if line_num < first_new_section:
+                previous_section_line = line_num
+                previous_section_hierarchy = hierarchy_dict[line_num_str]
+            else:
+                break
+
+        if previous_section_hierarchy:
+            insertion_points[f"group_{groups.index(group)}"] = {
+                'previous_section_hierarchy': previous_section_hierarchy,
+                'previous_section_line': previous_section_line,
+                'new_sections': group,
+                'insertion_type': 'multiple' if len(group) > 1 else 'single'
+            }
+            print(f"   šŸ“ Added section group: {len(group)} sections after '{previous_section_hierarchy}'")
+        else:
+            print(f"   āš ļø Could not find previous section for added sections starting at line {first_new_section}")
+
+    return
insertion_points + +def build_source_diff_dict(modified_sections, added_sections, deleted_sections, all_hierarchy_dict, base_hierarchy_dict, operations, file_content, base_file_content): + """Build source diff dictionary with correct structure for matching""" + from section_matcher import clean_title_for_matching + source_diff_dict = {} + + # Helper function to extract section content (only direct content, no sub-sections) + def extract_section_content_for_diff(line_num, hierarchy_dict): + if str(line_num) == "0": + # Handle frontmatter + return extract_frontmatter_content(file_content.split('\n')) + else: + return extract_section_direct_content(file_content.split('\n'), line_num) + + # Helper function to extract old content from base file (only direct content, no sub-sections) + def extract_old_content_for_diff(line_num, base_hierarchy_dict, base_file_content): + if str(line_num) == "0": + # Handle frontmatter from base file + return extract_frontmatter_content(base_file_content.split('\n')) + else: + return extract_section_direct_content(base_file_content.split('\n'), line_num) + + # Helper function to extract old content by hierarchy (for modified sections that may have moved) + def extract_old_content_by_hierarchy(original_hierarchy, base_hierarchy_dict, base_file_content): + """Extract old content by finding the section with matching hierarchy in base file (only direct content)""" + if original_hierarchy == "frontmatter": + return extract_frontmatter_content(base_file_content.split('\n')) + + # Find the line number in base file that matches the original hierarchy + for base_line_num_str, base_hierarchy in base_hierarchy_dict.items(): + if base_hierarchy == original_hierarchy: + base_line_num = int(base_line_num_str) if base_line_num_str != "0" else 0 + if base_line_num == 0: + return extract_frontmatter_content(base_file_content.split('\n')) + else: + return extract_section_direct_content(base_file_content.split('\n'), base_line_num) + + # If exact match not found, return empty string + print(f" āš ļø Could not find matching hierarchy in base file: {original_hierarchy}") + return "" + + # Helper function to build complete hierarchy for a section using base file info + def build_complete_original_hierarchy(line_num, current_hierarchy, base_hierarchy_dict, operations): + """Build complete hierarchy path for original section""" + line_num_str = str(line_num) + + # Special cases: frontmatter and top-level titles + if line_num_str == "0": + return "frontmatter" + + # Check if this line was modified and has original content + for modified_line in operations.get('modified_lines', []): + if (modified_line.get('is_header') and + modified_line.get('line_number') == line_num and + 'original_content' in modified_line): + original_line = modified_line['original_content'].strip() + + # For top-level titles, return the original content directly + if ' > ' not in current_hierarchy: + return original_line + + # For nested sections, build the complete hierarchy using original content + # Find the hierarchy path using base hierarchy dict and replace the leaf with original + if line_num_str in base_hierarchy_dict: + base_hierarchy = base_hierarchy_dict[line_num_str] + if ' > ' in base_hierarchy: + # Replace the leaf (last part) with original content + hierarchy_parts = base_hierarchy.split(' > ') + hierarchy_parts[-1] = original_line + return ' > '.join(hierarchy_parts) + else: + # Single level, return original content + return original_line + + # Fallback: return original content + return original_line 
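+        # Worked example (hypothetical data): if a PR renamed the header
+        # '### `--host`' to '### `--hosts`' at line 120 and the base hierarchy
+        # for that line is '## Options > ### `--host`', the branch above
+        # rebuilds '## Options > ### `--host`', so the pre-rename section can
+        # still be located in the base document.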
+ + # If not modified, use base hierarchy if available + if line_num_str in base_hierarchy_dict: + return base_hierarchy_dict[line_num_str] + + # If not found in base (new section), use current hierarchy + return current_hierarchy + + # Process modified sections + for line_num_str, hierarchy in modified_sections.items(): + line_num = int(line_num_str) if line_num_str != "0" else 0 + + # Build complete original hierarchy + original_hierarchy = build_complete_original_hierarchy(line_num, hierarchy, base_hierarchy_dict, operations) + + # Extract both old and new content + new_content = extract_section_content_for_diff(line_num, all_hierarchy_dict) + # Use hierarchy-based lookup for old content instead of line number + old_content = extract_old_content_by_hierarchy(original_hierarchy, base_hierarchy_dict, base_file_content) + + # Only include if content actually changed + if new_content != old_content: + # Check if this is a bottom modified section (no next section in base file) + is_bottom_modified = False + if line_num_str in base_hierarchy_dict: + # Get all sections in base file sorted by line number + base_sections = sorted([(int(ln), hier) for ln, hier in base_hierarchy_dict.items() if ln != "0"]) + + # Check if there's any section after this line in base file + has_next_section = any(base_line > line_num for base_line, _ in base_sections) + + if not has_next_section: + is_bottom_modified = True + print(f" āœ… Bottom modified section detected at line {line_num_str}: no next section in base file") + + # Use special marker for bottom modified sections + if is_bottom_modified: + final_original_hierarchy = f"bottom-modified-{line_num}" + else: + final_original_hierarchy = original_hierarchy + + source_diff_dict[f"modified_{line_num_str}"] = { + "new_line_number": line_num, + "original_hierarchy": final_original_hierarchy, + "operation": "modified", + "new_content": new_content, + "old_content": old_content + } + print(f" āœ… Real modification detected at line {line_num_str}: content changed") + else: + print(f" 🚫 Filtered out false positive at line {line_num_str}: content unchanged (likely line shift artifact)") + + # Process added sections - find next section from current document hierarchy + for line_num_str, hierarchy in added_sections.items(): + line_num = int(line_num_str) + + print(f" šŸ” Finding next section for added section at line {line_num}: {hierarchy}") + + # Strategy: Find the next section directly from the current document (post-PR) + # Get all current sections sorted by line number + current_sections = sorted([(int(ln), curr_hierarchy) for ln, curr_hierarchy in all_hierarchy_dict.items()]) + print(f" šŸ“‹ Current sections around line {line_num}: {[(ln, h.split(' > ')[-1] if ' > ' in h else h) for ln, h in current_sections if abs(ln - line_num) <= 15]}") + + next_section_original_hierarchy = None + + # Find the next section that comes after the added section in the current document + for curr_line_num, curr_hierarchy in current_sections: + if curr_line_num > line_num: + # Found the next section in current document + # Now find its original hierarchy in base document + curr_line_str = str(curr_line_num) + + # Get the original hierarchy for this next section + # Use the same logic as build_complete_original_hierarchy to get original content + if curr_line_str in base_hierarchy_dict: + # Check if this section was modified + was_modified = False + for modified_line in operations.get('modified_lines', []): + if (modified_line.get('is_header') and + modified_line.get('line_number') 
== curr_line_num and + 'original_content' in modified_line): + # This section was modified, use original content + original_line = modified_line['original_content'].strip() + base_hierarchy = base_hierarchy_dict[curr_line_str] + + if ' > ' in base_hierarchy: + # Replace the leaf with original content + hierarchy_parts = base_hierarchy.split(' > ') + hierarchy_parts[-1] = original_line + next_section_original_hierarchy = ' > '.join(hierarchy_parts) + else: + next_section_original_hierarchy = original_line + + print(f" āœ… Found next section (modified): line {curr_line_num} -> {next_section_original_hierarchy.split(' > ')[-1] if ' > ' in next_section_original_hierarchy else next_section_original_hierarchy}") + was_modified = True + break + + if not was_modified: + # Section was not modified, use base hierarchy directly + next_section_original_hierarchy = base_hierarchy_dict[curr_line_str] + print(f" āœ… Found next section (unchanged): line {curr_line_num} -> {next_section_original_hierarchy.split(' > ')[-1] if ' > ' in next_section_original_hierarchy else next_section_original_hierarchy}") + + break + else: + # This next section might also be new or modified + # Try to find it by content matching in base hierarchy + found_match = False + for base_line_str, base_hierarchy in base_hierarchy_dict.items(): + # Compare the leaf titles (last part of hierarchy) + curr_leaf = curr_hierarchy.split(' > ')[-1] if ' > ' in curr_hierarchy else curr_hierarchy + base_leaf = base_hierarchy.split(' > ')[-1] if ' > ' in base_hierarchy else base_hierarchy + + # Clean titles for comparison + curr_clean = clean_title_for_matching(curr_leaf) + base_clean = clean_title_for_matching(base_leaf) + + if curr_clean == base_clean: + next_section_original_hierarchy = base_hierarchy + print(f" āœ… Found next section (by content): {base_hierarchy.split(' > ')[-1] if ' > ' in base_hierarchy else base_hierarchy}") + found_match = True + break + + if found_match: + break + else: + print(f" āš ļø Next section at line {curr_line_num} not found in base, continuing search...") + + # If no next section found, this is being added at the end + if not next_section_original_hierarchy: + print(f" āœ… Bottom section detected: this section is added at the end of document") + # Use special marker for bottom added sections - no matching needed + next_section_original_hierarchy = f"bottom-added-{line_num}" + + source_diff_dict[f"added_{line_num_str}"] = { + "new_line_number": line_num, + "original_hierarchy": next_section_original_hierarchy, + "operation": "added", + "new_content": extract_section_content_for_diff(line_num, all_hierarchy_dict), + "old_content": None # Added sections have no old content + } + + # Process deleted sections - use original hierarchy from base file + for line_num_str, hierarchy in deleted_sections.items(): + line_num = int(line_num_str) + # Use complete hierarchy from base file + original_hierarchy = base_hierarchy_dict.get(line_num_str, hierarchy) + + # Extract old content for deleted sections + old_content = extract_old_content_for_diff(line_num, base_hierarchy_dict, base_file_content) + + source_diff_dict[f"deleted_{line_num_str}"] = { + "new_line_number": line_num, + "original_hierarchy": original_hierarchy, + "operation": "deleted", + "new_content": None, # No new content for deleted sections + "old_content": old_content # Show what was deleted + } + + # Sort the dictionary by new_line_number for better readability + sorted_items = sorted(source_diff_dict.items(), key=lambda x: x[1]['new_line_number']) + 
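+    # Note: dict preserves insertion order on Python 3.7+, so rebuilding the
+    # dictionary from the sorted pairs keeps downstream JSON dumps ordered by
+    # new_line_number.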
source_diff_dict = dict(sorted_items)
+
+    return source_diff_dict
+
+def analyze_source_changes(pr_url, github_client, special_files=None, ignore_files=None, repo_configs=None, max_non_system_sections=120, pr_diff=None):
+    """Analyze source language changes and categorize them as added, modified, or deleted"""
+    # Import modules needed in this function
+    import os
+    import json
+    from toc_processor import process_toc_operations
+
+    # Guard against None defaults so the membership tests below cannot raise TypeError
+    special_files = special_files or []
+    ignore_files = ignore_files or []
+
+    owner, repo, pr_number = parse_pr_url(pr_url)
+    repository = github_client.get_repo(f"{owner}/{repo}")
+    pr = repository.get_pull(pr_number)
+
+    # Get repository configuration for target repo info
+    repo_config = get_repo_config(pr_url, repo_configs)
+
+    print(f"šŸ“‹ Processing PR #{pr_number}: {pr.title}")
+
+    # Get markdown files
+    files = pr.get_files()
+    markdown_files = [f for f in files if f.filename.endswith('.md')]
+
+    print(f"šŸ“„ Found {len(markdown_files)} markdown files")
+
+    # Return dictionaries for different operation types
+    added_sections = {}     # New sections that were added
+    modified_sections = {}  # Existing sections that were modified
+    deleted_sections = {}   # Sections that were deleted
+    added_files = {}        # Completely new files that were added
+    deleted_files = []      # Completely deleted files
+    ignored_files = []      # Files that were ignored
+    toc_files = {}          # Special TOC files requiring special processing
+
+    for file in markdown_files:
+        print(f"\nšŸ” Analyzing {file.filename}")
+
+        # Check if this file should be ignored
+        if file.filename in ignore_files:
+            print(f"   ā­ļø Skipping ignored file: {file.filename}")
+            ignored_files.append(file.filename)
+            continue
+
+        # Check if this is a completely new file or deleted file
+        if file.status == 'added':
+            print(f"   āž• Detected new file: {file.filename}")
+            try:
+                file_content = repository.get_contents(file.filename, ref=pr.head.sha).decoded_content.decode('utf-8')
+                added_files[file.filename] = file_content
+                print(f"   āœ… Added complete file for translation")
+                continue
+            except Exception as e:
+                print(f"   āŒ Error getting new file content: {e}")
+                continue
+
+        elif file.status == 'removed':
+            print(f"   šŸ—‘ļø Detected deleted file: {file.filename}")
+            deleted_files.append(file.filename)
+            print(f"   āœ… Marked file for deletion")
+            continue
+
+        # For modified files, check if it's a special file like TOC.md
+        try:
+            file_content = repository.get_contents(file.filename, ref=pr.head.sha).decoded_content.decode('utf-8')
+        except Exception as e:
+            print(f"   āŒ Error getting content: {e}")
+            continue
+
+        # Check if this is a TOC.md file requiring special processing
+        if os.path.basename(file.filename) in special_files:
+            print(f"   šŸ“‹ Detected special file: {file.filename}")
+
+            # Get target file content for comparison
+            try:
+                target_repository = github_client.get_repo(repo_config['target_repo'])
+                target_file_content = target_repository.get_contents(file.filename, ref="master").decoded_content.decode('utf-8')
+                target_lines = target_file_content.split('\n')
+            except Exception as e:
+                print(f"   āš ļø Could not get target file content: {e}")
+                continue
+
+            # Analyze diff operations for TOC.md
+            operations = analyze_diff_operations(file)
+            source_lines = file_content.split('\n')
+
+            # Process with special TOC logic
+            toc_results = process_toc_operations(file.filename, operations, source_lines, target_lines, "")  # Local path will be determined later
+
+            # Store TOC operations for later processing
+            if any([toc_results['added'], toc_results['modified'], toc_results['deleted']]):
+                # Combine all operations
for processing + all_toc_operations = [] + all_toc_operations.extend(toc_results['added']) + all_toc_operations.extend(toc_results['modified']) + all_toc_operations.extend(toc_results['deleted']) + + # Add to special TOC processing queue (separate from regular sections) + toc_files[file.filename] = { + 'type': 'toc', + 'operations': all_toc_operations + } + + print(f" šŸ“‹ TOC operations queued for processing:") + if toc_results['added']: + print(f" āž• Added: {len(toc_results['added'])} entries") + if toc_results['modified']: + print(f" āœļø Modified: {len(toc_results['modified'])} entries") + if toc_results['deleted']: + print(f" āŒ Deleted: {len(toc_results['deleted'])} entries") + else: + print(f" ā„¹ļø No TOC operations found") + + continue # Skip regular processing for TOC files + + # Analyze diff operations + operations = analyze_diff_operations(file) + print(f" šŸ“ Diff analysis: {len(operations['added_lines'])} added, {len(operations['modified_lines'])} modified, {len(operations['deleted_lines'])} deleted lines") + + lines = file_content.split('\n') + all_headers = {} + + # Track code block state + in_code_block = False + code_block_delimiter = None + + # First pass: collect all headers (excluding those in code blocks) + for line_num, line in enumerate(lines, 1): + original_line = line + line = line.strip() + + # Check for code block delimiters + if line.startswith('```') or line.startswith('~~~'): + if not in_code_block: + # Entering a code block + in_code_block = True + code_block_delimiter = line[:3] + continue + elif line.startswith(code_block_delimiter): + # Exiting a code block + in_code_block = False + code_block_delimiter = None + continue + + # Skip processing if we're inside a code block + if in_code_block: + continue + + # Process headers only if not in code block + if line.startswith('#'): + match = re.match(r'^(#{1,10})\s+(.+)', line) + if match: + level = len(match.group(1)) + title = match.group(2).strip() + all_headers[line_num] = { + 'level': level, + 'title': title, + 'line': line + } + + # Build complete hierarchy from HEAD (after changes) + all_hierarchy_dict = build_hierarchy_dict(file_content) + + # For deletion detection, we also need the base file hierarchy + try: + base_file_content = repository.get_contents(file.filename, ref=f"{repository.default_branch}").decoded_content.decode('utf-8') + base_hierarchy_dict = build_hierarchy_dict(base_file_content) + except Exception as e: + print(f" āš ļø Could not get base file content: {e}") + base_hierarchy_dict = all_hierarchy_dict + base_file_content = file_content # Fallback to current content + + # Find sections by operation type with corrected logic + sections_by_type = find_sections_by_operation_type(lines, operations, all_headers, base_hierarchy_dict) + + # Prioritize modified headers over added ones (fix for header changes like --host -> --hosts) + modified_header_lines = set() + for modified_line in operations['modified_lines']: + if modified_line['is_header']: + modified_header_lines.add(modified_line['line_number']) + + # Remove modified header lines from added set + sections_by_type['added'] = sections_by_type['added'] - modified_header_lines + + # Enhanced logic: check for actual content changes within sections + # This helps detect changes in section content (not just headers) + print(f" šŸ” Enhanced detection: checking for actual section content changes...") + + # Get only lines that have actual content changes (exclude headers) + real_content_changes = set() + + # Added lines (new content, 
excluding headers) + for added_line in operations['added_lines']: + if not added_line['is_header']: + real_content_changes.add(added_line['line_number']) + + # Deleted lines (removed content, excluding headers) + for deleted_line in operations['deleted_lines']: + if not deleted_line['is_header']: + real_content_changes.add(deleted_line['line_number']) + + # Modified lines (changed content, excluding headers) + for modified_line in operations['modified_lines']: + if not modified_line['is_header']: + real_content_changes.add(modified_line['line_number']) + + print(f" šŸ“ Real content changes (non-header): {sorted(real_content_changes)}") + + # Find sections that contain actual content changes + content_affected_sections = set() + for changed_line in real_content_changes: + # Find which section this changed line belongs to + containing_section = None + for line_num in sorted(all_headers.keys()): + if line_num <= changed_line: + containing_section = line_num + else: + break + + if containing_section and containing_section not in sections_by_type['added']: + # Additional check: make sure this is not just a line number shift + # Only add if the change is within reasonable distance from the section header + # AND if the changed line is not part of a completely deleted section header + is_deleted_header = False + for deleted_line in operations['deleted_lines']: + if (deleted_line['is_header'] and + abs(changed_line - deleted_line['line_number']) <= 2): + is_deleted_header = True + print(f" āš ļø Skipping change at line {changed_line} (deleted header near line {deleted_line['line_number']})") + break + + # More precise filtering: check if this change is actually meaningful + # Skip changes that are part of deleted content or line shifts due to deletions + should_include = True + + # Skip exact deleted headers + for deleted_line in operations['deleted_lines']: + if (deleted_line['is_header'] and + changed_line == deleted_line['line_number']): + should_include = False + print(f" āš ļø Skipping change at line {changed_line} (exact deleted header)") + break + + # Skip changes that are very close to deleted content AND far from their containing section + # This helps filter out line shift artifacts while keeping real content changes + if should_include: + for deleted_line in operations['deleted_lines']: + # Only skip if both conditions are met: + # 1. Very close to deleted content (within 5 lines) + # 2. 
The change is far from its containing section (likely a shift artifact) + distance_to_deletion = abs(changed_line - deleted_line['line_number']) + distance_to_section = changed_line - containing_section + + if (distance_to_deletion <= 5 and distance_to_section > 100): + should_include = False + print(f" āš ļø Skipping change at line {changed_line} (likely line shift: {distance_to_deletion} lines from deletion, {distance_to_section} from section)") + break + + if should_include and changed_line - containing_section <= 30: + content_affected_sections.add(containing_section) + print(f" šŸ“ Content change at line {changed_line} affects section at line {containing_section}") + elif should_include: + print(f" āš ļø Skipping distant change at line {changed_line} from section {containing_section}") + + # Add content-modified sections to the modified set, but exclude sections that are already marked as added or deleted + for line_num in content_affected_sections: + if (line_num not in sections_by_type['modified'] and + line_num not in sections_by_type['added'] and + line_num not in sections_by_type['deleted']): # āœ… Critical fix: exclude deleted sections + sections_by_type['modified'].add(line_num) + print(f" šŸ“ Added content-modified section at line {line_num}") + elif line_num in sections_by_type['deleted']: + print(f" 🚫 Skipping content-modified section at line {line_num}: already marked as deleted") + + # Prepare sections data for source_diff_dict + file_modified = {} + file_added = {} + file_deleted = {} + + # Build modified sections + for line_num in sections_by_type['modified']: + if line_num in all_hierarchy_dict: + file_modified[str(line_num)] = all_hierarchy_dict[line_num] + + # Build added sections + for line_num in sections_by_type['added']: + if line_num in all_hierarchy_dict: + file_added[str(line_num)] = all_hierarchy_dict[line_num] + + # Build deleted sections + for line_num in sections_by_type['deleted']: + if line_num in base_hierarchy_dict: + file_deleted[str(line_num)] = base_hierarchy_dict[line_num] + + # Check for frontmatter changes (content before first top-level header) + print(f" šŸ” Checking for frontmatter changes...") + frontmatter_changed = False + + # Check if any changes occur before the first top-level header + first_header_line = None + for line_num in sorted(all_headers.keys()): + header_info = all_headers[line_num] + if header_info['level'] == 1: # First top-level header + first_header_line = line_num + break + + print(f" šŸ“Š First header line: {first_header_line}") + print(f" šŸ“Š Real content changes: {sorted(real_content_changes)}") + + if first_header_line: + # Check if any real content changes are before the first header + for line_num in real_content_changes: + #print(f" šŸ” Checking line {line_num} vs first header {first_header_line}") + if line_num < first_header_line: + frontmatter_changed = True + print(f" šŸ“„ Frontmatter change detected: line {line_num} < {first_header_line}") + break + + print(f" šŸ“Š Frontmatter changed: {frontmatter_changed}") + + if frontmatter_changed: + print(f" šŸ“„ Frontmatter changes detected (before line {first_header_line})") + # Add frontmatter as a special section with line number 0 + file_modified["0"] = "frontmatter" + print(f" āœ… Added frontmatter section to modified sections") + + # Build source diff dictionary + source_diff_dict = build_source_diff_dict( + file_modified, file_added, file_deleted, + all_hierarchy_dict, base_hierarchy_dict, + operations, file_content, base_file_content + ) + + # Breakpoint: 
Output source_diff_dict to file for review with file prefix + + # Ensure temp_output directory exists + script_dir = os.path.dirname(os.path.abspath(__file__)) + temp_dir = os.path.join(script_dir, "temp_output") + os.makedirs(temp_dir, exist_ok=True) + + file_prefix = file.filename.replace('/', '-').replace('.md', '') + output_file = os.path.join(temp_dir, f"{file_prefix}-source-diff-dict.json") + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(source_diff_dict, f, ensure_ascii=False, indent=2) + + print(f" šŸ’¾ Saved source diff dictionary to: {output_file}") + print(f" šŸ“Š Source diff dictionary contains {len(source_diff_dict)} sections:") + for key, diff_info in source_diff_dict.items(): + print(f" {diff_info['operation']}: {key} -> original_hierarchy: {diff_info['original_hierarchy']}") + + # source-diff-dict.json generation is complete, continue to next step in main.py + + # For modified headers, we need to build a mapping using original titles for matching + original_hierarchy_dict = all_hierarchy_dict.copy() + + # Update hierarchy dict to use original content for modified headers when needed for matching + for line_num in sections_by_type['modified']: + if line_num in all_headers: + header_info = all_headers[line_num] + # Check if this header was modified and has original content + for op in operations['modified_lines']: + if (op['is_header'] and + op['line_number'] == line_num and + 'original_content' in op): + # Create hierarchy path using original content for matching + original_line = op['original_content'].strip() + if original_line.startswith('#'): + # Build original hierarchy for matching + original_hierarchy = build_hierarchy_for_modified_section( + file_content, line_num, original_line, all_hierarchy_dict) + if original_hierarchy: + original_hierarchy_dict[line_num] = original_hierarchy + break + + # Process added sections + if sections_by_type['added']: + file_added = {} + # Find insertion points using the simplified logic: + # Record the previous section hierarchy for each added section + insertion_points = find_previous_section_for_added(sections_by_type['added'], all_hierarchy_dict) + + # Get actual content for added sections + for line_num in sections_by_type['added']: + if line_num in all_hierarchy_dict: + file_added[str(line_num)] = all_hierarchy_dict[line_num] + + # Get source sections content (actual content, not just hierarchy) + if file_added: + source_sections_content = get_source_sections_content(pr_url, file.filename, file_added, github_client) + file_added = source_sections_content # Replace hierarchy with actual content + + if file_added: + added_sections[file.filename] = { + 'sections': file_added, + 'insertion_points': insertion_points + } + print(f" āž• Found {len(file_added)} added sections with {len(insertion_points)} insertion points") + + # Process modified sections + if sections_by_type['modified']: + file_modified = {} + for line_num in sections_by_type['modified']: + if line_num in original_hierarchy_dict: + file_modified[str(line_num)] = original_hierarchy_dict[line_num] + + if file_modified: + modified_sections[file.filename] = { + 'sections': file_modified, + 'original_hierarchy': original_hierarchy_dict, + 'current_hierarchy': all_hierarchy_dict + } + print(f" āœļø Found {len(file_modified)} modified sections") + + # Process deleted sections + if sections_by_type['deleted']: + file_deleted = {} + for line_num in sections_by_type['deleted']: + # Use base hierarchy to get the deleted section info + if line_num in 
base_hierarchy_dict:
+                    file_deleted[str(line_num)] = base_hierarchy_dict[line_num]
+
+            if file_deleted:
+                deleted_sections[file.filename] = file_deleted
+                print(f"   āŒ Found {len(file_deleted)} deleted sections")
+
+        # Enhanced logic: also check content-level changes using legacy detection
+        # This helps detect changes in section content (not just headers)
+        print(f"   šŸ” Enhanced detection: checking content-level changes...")
+        changed_lines = get_changed_line_ranges(file)
+        affected_sections = find_affected_sections(lines, changed_lines, all_headers)
+
+        legacy_modified = {}
+        for line_num in affected_sections:
+            if line_num in all_hierarchy_dict:
+                section_hierarchy = all_hierarchy_dict[line_num]
+                # Only add if not already detected by operation-type analysis
+                already_detected = False
+                if file.filename in modified_sections:
+                    for existing_line, existing_hierarchy in modified_sections[file.filename].get('sections', {}).items():
+                        if existing_hierarchy == section_hierarchy:
+                            already_detected = True
+                            break
+
+                if not already_detected:
+                    legacy_modified[str(line_num)] = section_hierarchy
+
+        if legacy_modified:
+            print(f"   āœ… Enhanced detection found {len(legacy_modified)} additional content-modified sections")
+            # Merge with existing modified sections
+            if file.filename in modified_sections:
+                # Merge the sections
+                existing_sections = modified_sections[file.filename].get('sections', {})
+                existing_sections.update(legacy_modified)
+                modified_sections[file.filename]['sections'] = existing_sections
+            else:
+                # Create new entry
+                modified_sections[file.filename] = {
+                    'sections': legacy_modified,
+                    'original_hierarchy': all_hierarchy_dict,
+                    'current_hierarchy': all_hierarchy_dict
+                }
+
+    print(f"\nšŸ“Š Summary:")
+    print(f"   āœļø Modified files: {len(modified_sections)} files")
+    print(f"   šŸ“„ Added files: {len(added_files)} files")
+    print(f"   šŸ—‘ļø Deleted files: {len(deleted_files)} files")
+    print(f"   šŸ“‹ TOC files: {len(toc_files)} files")
+    if ignored_files:
+        print(f"   ā­ļø Ignored files: {len(ignored_files)} files")
+        for ignored_file in ignored_files:
+            print(f"      - {ignored_file}")
+
+    return added_sections, modified_sections, deleted_sections, added_files, deleted_files, toc_files
diff --git a/scripts/translate_doc_pr/requirements.txt b/scripts/translate_doc_pr/requirements.txt
new file mode 100644
index 0000000000000..d8336cf8cebe7
--- /dev/null
+++ b/scripts/translate_doc_pr/requirements.txt
@@ -0,0 +1,4 @@
+PyGithub>=1.55.0
+openai>=1.0.0
+tiktoken>=0.4.0
+google-generativeai>=0.3.0
diff --git a/scripts/translate_doc_pr/section_matcher.py b/scripts/translate_doc_pr/section_matcher.py
new file mode 100644
index 0000000000000..ce4ef61116c89
--- /dev/null
+++ b/scripts/translate_doc_pr/section_matcher.py
@@ -0,0 +1,973 @@
+"""
+Section Matcher Module
+Handles section hierarchy matching including direct matching and AI matching
+"""
+
+import os
+import re
+import json
+import threading
+from github import Github
+from openai import OpenAI
+
+# Thread-safe printing
+print_lock = threading.Lock()
+
+def thread_safe_print(*args, **kwargs):
+    with print_lock:
+        print(*args, **kwargs)
+
+def clean_title_for_matching(title):
+    """Clean title for matching by removing markdown formatting and span elements"""
+    if not title:
+        return ""
+
+    # Remove span elements like <span class="version-mark">New in v5.0</span>
+    title = re.sub(r'<span[^>]*>.*?</span>', '', title)
+
+    # Remove markdown header prefix (# ## ### etc.)
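+    # For example, re.sub(r'^#{1,6}\s*', '', '### `--host`') returns
+    # '`--host`', and the backtick-stripping pass below reduces it to '--host'.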
+ title = re.sub(r'^#{1,6}\s*', '', title.strip()) + + # Remove backticks + title = title.replace('`', '') + + # Strip whitespace + title = title.strip() + + return title + +def is_system_variable_or_config(title): + """Check if a title represents a system variable or configuration item""" + cleaned_title = clean_title_for_matching(title) + + if not cleaned_title: + return False + + # Check if original title had backticks (indicating code/config item) + original_has_backticks = '`' in title + + # System variables and config items are typically: + # 1. Alphanumeric characters with underscores, hyphens, dots, or percent signs + # 2. No spaces in the middle + # 3. Often contain underscores, hyphens, dots, or percent signs + # 4. May contain uppercase letters (like alert rule names) + # 5. Single words wrapped in backticks (like `capacity`, `engine`) + + # Check if it contains only allowed characters (including % for metrics/alerts) + allowed_chars = re.match(r'^[a-zA-Z0-9_\-\.%]+$', cleaned_title) + + # Check if it contains at least one separator (common in system vars/config/alerts) + has_separator = ('_' in cleaned_title or '-' in cleaned_title or + '.' in cleaned_title or '%' in cleaned_title) + + # Check if it doesn't contain spaces (spaces would indicate it's likely a regular title) + no_spaces = ' ' not in cleaned_title + + # Additional patterns for alert rules and metrics + is_alert_rule = (cleaned_title.startswith('PD_') or + cleaned_title.startswith('TiDB_') or + cleaned_title.startswith('TiKV_') or + cleaned_title.endswith('_alert') or + '%' in cleaned_title) + + # NEW: Check if it's a single word in backticks (config/variable name) + # Examples: `capacity`, `engine`, `enable`, `dirname` etc. + is_single_backticked_word = (original_has_backticks and + allowed_chars and + no_spaces and + len(cleaned_title.split()) == 1) + + return bool(allowed_chars and (has_separator or is_alert_rule or is_single_backticked_word) and no_spaces) + +def find_toplevel_title_matches(source_sections, target_lines): + """Find matches for top-level titles (# Level) by direct pattern matching""" + matched_dict = {} + failed_matches = [] + skipped_sections = [] + + thread_safe_print(f"šŸ” Searching for top-level title matches") + + for source_line_num, source_hierarchy in source_sections.items(): + # Extract the leaf title from hierarchy + source_leaf_title = source_hierarchy.split(' > ')[-1] if ' > ' in source_hierarchy else source_hierarchy + + # Only process top-level titles + if not source_leaf_title.startswith('# '): + skipped_sections.append({ + 'line_num': source_line_num, + 'hierarchy': source_hierarchy, + 'reason': 'Not a top-level title' + }) + continue + + thread_safe_print(f" šŸ“ Looking for top-level match: {source_leaf_title}") + + # Find the first top-level title in target document + target_match = None + for line_num, line in enumerate(target_lines, 1): + line = line.strip() + if line.startswith('# '): + target_match = { + 'line_num': line_num, + 'title': line, + 'hierarchy_string': line[2:].strip() # Remove '# ' prefix for hierarchy + } + thread_safe_print(f" āœ“ Found target top-level at line {line_num}: {line}") + break + + if target_match: + matched_dict[str(target_match['line_num'])] = target_match['hierarchy_string'] + thread_safe_print(f" āœ… Top-level match: line {target_match['line_num']}") + else: + thread_safe_print(f" āŒ No top-level title found in target") + failed_matches.append({ + 'line_num': source_line_num, + 'hierarchy': source_hierarchy, + 'reason': 'No top-level title 
found in target' + }) + + thread_safe_print(f"šŸ“Š Top-level matching result: {len(matched_dict)} matches found") + if failed_matches: + thread_safe_print(f"āš ļø {len(failed_matches)} top-level sections failed to match:") + for failed in failed_matches: + thread_safe_print(f" āŒ Line {failed['line_num']}: {failed['hierarchy']} - {failed['reason']}") + + return matched_dict, failed_matches, skipped_sections + + +def find_direct_matches_for_special_files(source_sections, target_hierarchy, target_lines): + """Find direct matches for system variables/config items without using AI""" + matched_dict = {} + failed_matches = [] + skipped_sections = [] + + # Build target headers with hierarchy paths + target_headers = {} + for line_num, line in enumerate(target_lines, 1): + line = line.strip() + if line.startswith('#'): + match = re.match(r'^(#{1,10})\s+(.+)', line) + if match: + level = len(match.group(1)) + title = match.group(2).strip() + target_headers[line_num] = { + 'level': level, + 'title': title, + 'line': line + } + + thread_safe_print(f" šŸ” Searching for direct matches among {len(target_headers)} target headers") + + for source_line_num, source_hierarchy in source_sections.items(): + # Extract the leaf title from hierarchy + source_leaf_title = source_hierarchy.split(' > ')[-1] if ' > ' in source_hierarchy else source_hierarchy + source_clean_title = clean_title_for_matching(source_leaf_title) + + thread_safe_print(f" šŸ“ Looking for match: {source_clean_title}") + + if not is_system_variable_or_config(source_leaf_title): + thread_safe_print(f" āš ļø Not a system variable/config, skipping direct match") + skipped_sections.append({ + 'line_num': source_line_num, + 'hierarchy': source_hierarchy, + 'reason': 'Not a system variable or config item' + }) + continue + + # Find potential matches in target + potential_matches = [] + for target_line_num, target_header in target_headers.items(): + target_clean_title = clean_title_for_matching(target_header['title']) + + if source_clean_title == target_clean_title: + # Build hierarchy path for this target header + hierarchy_path = build_hierarchy_path(target_lines, target_line_num, target_headers) + potential_matches.append({ + 'line_num': target_line_num, + 'header': target_header, + 'hierarchy_path': hierarchy_path, + 'hierarchy_string': ' > '.join([f"{'#' * h['level']} {h['title']}" for h in hierarchy_path if h['level'] > 1 or len(hierarchy_path) == 1]) + }) + thread_safe_print(f" āœ“ Found potential match at line {target_line_num}: {target_header['title']}") + + if len(potential_matches) == 1: + # Single match found + match = potential_matches[0] + matched_dict[str(match['line_num'])] = match['hierarchy_string'] + thread_safe_print(f" āœ… Direct match: line {match['line_num']}") + elif len(potential_matches) > 1: + # Multiple matches, need to use parent hierarchy to disambiguate + thread_safe_print(f" šŸ”€ Multiple matches found ({len(potential_matches)}), using parent hierarchy") + + # Extract parent hierarchy from source + source_parts = source_hierarchy.split(' > ') + if len(source_parts) > 1: + source_parent_titles = [clean_title_for_matching(part) for part in source_parts[:-1]] + + best_match = None + best_score = -1 + + for match in potential_matches: + # Compare parent hierarchy + target_parent_titles = [clean_title_for_matching(h['title']) for h in match['hierarchy_path'][:-1]] + + # Calculate similarity score + score = 0 + min_len = min(len(source_parent_titles), len(target_parent_titles)) + + for i in range(min_len): + if i < 
len(source_parent_titles) and i < len(target_parent_titles):
+                        if source_parent_titles[-(i+1)] == target_parent_titles[-(i+1)]:  # Compare from end
+                            score += 1
+                        else:
+                            break
+
+                    thread_safe_print(f"      šŸ“Š Match at line {match['line_num']} score: {score}")
+
+                    if score > best_score:
+                        best_score = score
+                        best_match = match
+
+                if best_match and best_score > 0:
+                    matched_dict[str(best_match['line_num'])] = best_match['hierarchy_string']
+                    thread_safe_print(f"      āœ… Best match: line {best_match['line_num']} (score: {best_score})")
+                else:
+                    thread_safe_print(f"      āŒ No good parent hierarchy match found")
+                    failed_matches.append({
+                        'line_num': source_line_num,
+                        'hierarchy': source_hierarchy,
+                        'reason': 'Multiple matches found but no good parent hierarchy match'
+                    })
+            else:
+                thread_safe_print(f"      āš ļø No parent hierarchy in source, cannot disambiguate")
+                failed_matches.append({
+                    'line_num': source_line_num,
+                    'hierarchy': source_hierarchy,
+                    'reason': 'Multiple matches found but no parent hierarchy to disambiguate'
+                })
+        else:
+            thread_safe_print(f"      āŒ No matches found for: {source_clean_title}")
+            # Try fuzzy matching for similar titles (e.g., --host vs --hosts)
+            fuzzy_matched = False
+            source_clean_lower = source_clean_title.lower()
+            # target_headers maps line numbers to header dicts, so iterate items()
+            for target_line_num, target_header in target_headers.items():
+                target_clean = clean_title_for_matching(target_header['title'])
+                target_clean_lower = target_clean.lower()
+                # Check for similar titles (handle plural/singular and minor differences)
+                # Case 1: One is a substring of the other (e.g., --host vs --hosts)
+                # Case 2: Small length difference (1-2 characters)
+                len_diff = abs(len(source_clean_lower) - len(target_clean_lower))
+                if (len_diff <= 2 and
+                    (source_clean_lower in target_clean_lower or
+                     target_clean_lower in source_clean_lower)):
+                    thread_safe_print(f"      ā‰ˆ Fuzzy match found: {source_clean_title} ā‰ˆ {target_clean}")
+                    hierarchy_path = build_hierarchy_path(target_lines, target_line_num, target_headers)
+                    hierarchy_string = ' > '.join([f"{'#' * h['level']} {h['title']}" for h in hierarchy_path if h['level'] > 1 or len(hierarchy_path) == 1])
+                    matched_dict[str(target_line_num)] = hierarchy_string
+                    thread_safe_print(f"      āœ… Fuzzy match: line {target_line_num}")
+                    fuzzy_matched = True
+                    break
+
+            if not fuzzy_matched:
+                failed_matches.append({
+                    'line_num': source_line_num,
+                    'hierarchy': source_hierarchy,
+                    'reason': 'No matching section found in target'
+                })
+
+    thread_safe_print(f"   šŸ“Š Direct matching result: {len(matched_dict)} matches found")
+
+    if failed_matches:
+        thread_safe_print(f"   āš ļø {len(failed_matches)} sections failed to match:")
+        for failed in failed_matches:
+            thread_safe_print(f"      āŒ Line {failed['line_num']}: {failed['hierarchy']} - {failed['reason']}")
+
+    if skipped_sections:
+        thread_safe_print(f"   ā„¹ļø {len(skipped_sections)} sections skipped (not system variables/config):")
+        for skipped in skipped_sections:
+            thread_safe_print(f"      ā­ļø Line {skipped['line_num']}: {skipped['hierarchy']} - {skipped['reason']}")
+
+    return matched_dict, failed_matches, skipped_sections
+
+def filter_non_system_sections(target_hierarchy):
+    """Filter out system variable/config sections
+def filter_non_system_sections(target_hierarchy):
+    """Filter out system variable/config sections from the target hierarchy for AI mapping"""
+    filtered_hierarchy = {}
+    system_sections_count = 0
+
+    for line_num, hierarchy in target_hierarchy.items():
+        # Extract the leaf title from the hierarchy
+        leaf_title = hierarchy.split(' > ')[-1] if ' > ' in hierarchy else hierarchy
+
+        if is_system_variable_or_config(leaf_title):
+            system_sections_count += 1
+        else:
+            filtered_hierarchy[line_num] = hierarchy
+
+    thread_safe_print(f"   šŸ”§ Filtered target hierarchy: {len(filtered_hierarchy)} non-system sections (removed {system_sections_count} system sections)")
+
+    return filtered_hierarchy
+
+def get_corresponding_sections(source_sections, target_sections, ai_client, source_language, target_language, max_tokens=20000):
+    """Use AI to find corresponding sections between different languages"""
+
+    # Format the source and target sections
+    source_text = "\n".join(source_sections)
+    target_text = "\n".join(target_sections)
+    number_of_sections = len(source_sections)
+
+    prompt = f"""I am aligning the {source_language} and {target_language} documentation for TiDB. I have modified the following {number_of_sections} sections in the {source_language} file:
+
+{source_text}
+
+Here is the section structure of the corresponding {target_language} file. Please select the corresponding {number_of_sections} sections in {target_language} from the following list that I should modify. Do not output any other text; return the Markdown code block enclosed in three backticks.
+
+{target_text}"""
+
+    thread_safe_print(f"\n   šŸ“¤ AI Mapping Prompt ({source_language} → {target_language}):")
+    thread_safe_print(f"   " + "="*80)
+    thread_safe_print(f"   {prompt}")
+    thread_safe_print(f"   " + "="*80)
+
+    # Import the token estimation helper from main
+    try:
+        from main import print_token_estimation
+        print_token_estimation(prompt, f"Section mapping ({source_language} → {target_language})")
+    except ImportError:
+        # Fallback if the import fails: use tiktoken
+        try:
+            import tiktoken
+            enc = tiktoken.get_encoding("cl100k_base")
+            tokens = enc.encode(prompt)
+            actual_tokens = len(tokens)
+            char_count = len(prompt)
+            thread_safe_print(f"   šŸ’° Section mapping ({source_language} → {target_language})")
+            thread_safe_print(f"   šŸ“ Input: {char_count:,} characters")
+            thread_safe_print(f"   šŸ”¢ Actual tokens: {actual_tokens:,} (using tiktoken cl100k_base)")
+        except Exception:
+            # Final fallback: character-based approximation
+            estimated_tokens = len(prompt) // 4
+            char_count = len(prompt)
+            thread_safe_print(f"   šŸ’° Section mapping ({source_language} → {target_language})")
+            thread_safe_print(f"   šŸ“ Input: {char_count:,} characters")
+            thread_safe_print(f"   šŸ”¢ Estimated tokens: ~{estimated_tokens:,} (fallback: 4 chars/token approximation)")
+
+    try:
+        ai_response = ai_client.chat_completion(
+            messages=[
+                {"role": "user", "content": prompt}
+            ],
+            temperature=0.1,
+            max_tokens=max_tokens
+        )
+
+        thread_safe_print(f"\n   šŸ“„ AI Mapping Response:")
+        thread_safe_print(f"   " + "-"*80)
+        thread_safe_print(f"   {ai_response}")
+        thread_safe_print(f"   " + "-"*80)
+
+        return ai_response
+    except Exception as e:
+        thread_safe_print(f"   āŒ AI mapping error: {e}")
+        return None
+
+def parse_ai_response(ai_response):
+    """Parse the AI response to extract section names"""
+    sections = []
+    lines = ai_response.split('\n')
+
+    for line in lines:
+        line = line.strip()
+        # Skip markdown code block markers and empty lines
+        if line and not line.startswith('```'):
+            # Keep hierarchy lines as-is; strip the leading bullet when the AI returns a list
+            if line.startswith('## '):
+                sections.append(line)
+            elif line.startswith('- '):
+                sections.append(line[2:].strip())
+
+    return sections
+
+def find_matching_line_numbers(ai_sections, target_hierarchy_dict):
+    """Find line numbers in the target hierarchy dict that match AI sections"""
+    matched_dict = {}
+
+    for ai_section in ai_sections:
+        # Look for exact matches first
+        found = False
+        for line_num, hierarchy in target_hierarchy_dict.items():
+            if hierarchy == ai_section:
+                matched_dict[str(line_num)] = hierarchy
+                found = True
+                break
+
+        if not found:
+            # Look for partial matches (in case of slight differences)
+            for line_num, hierarchy in target_hierarchy_dict.items():
+                # Remove common variations and compare
+                ai_clean = ai_section.replace('### ', '').replace('## ', '').strip()
+                hierarchy_clean = hierarchy.replace('### ', '').replace('## ', '').strip()
+
+                if ai_clean in hierarchy_clean or hierarchy_clean in ai_clean:
+                    matched_dict[str(line_num)] = hierarchy
+                    thread_safe_print(f"   ā‰ˆ Partial match found at line {line_num}: {hierarchy}")
+                    found = True
+                    break
+
+        if not found:
+            thread_safe_print(f"   āœ— No match found for: {ai_section}")
+
+    return matched_dict
+
+def build_hierarchy_path(lines, line_num, all_headers):
+    """Build the full hierarchy path for a header at the given line (from auto-sync-pr-changes.py)"""
+    if line_num not in all_headers:
+        return []
+
+    current_header = all_headers[line_num]
+    current_level = current_header['level']
+    hierarchy_path = []
+
+    # Find all parent headers
+    for check_line in sorted(all_headers.keys()):
+        if check_line >= line_num:
+            break
+
+        header = all_headers[check_line]
+        if header['level'] < current_level:
+            # This is a potential parent: drop any collected headers at the same or a deeper level
+            while hierarchy_path and hierarchy_path[-1]['level'] >= header['level']:
+                hierarchy_path.pop()
+            hierarchy_path.append(header)
+
+    # Add the current header
+    hierarchy_path.append(current_header)
+
+    return hierarchy_path
+
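+# Sketch of the expected result (hypothetical headers): for
+#   all_headers = {1: {'level': 1, 'title': 'Guide', ...},
+#                  5: {'level': 2, 'title': 'Install', ...},
+#                  9: {'level': 3, 'title': 'Linux', ...}}
+# build_hierarchy_path(lines, 9, all_headers) walks the earlier headers and
+# returns the dicts for 'Guide', 'Install', and 'Linux' in that order.
+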
+def map_insertion_points_to_target(insertion_points, target_hierarchy, target_lines, file_path, pr_url, github_client, ai_client, repo_config, max_non_system_sections=120):
+    """Map source insertion points to target language locations"""
+    target_insertion_points = {}
+
+    thread_safe_print(f"   šŸ“ Mapping {len(insertion_points)} insertion points to target...")
+
+    for group_key, point_info in insertion_points.items():
+        previous_section_hierarchy = point_info['previous_section_hierarchy']
+        thread_safe_print(f"   šŸ” Finding target location for: {previous_section_hierarchy}")
+
+        # Extract the title for the system variable check
+        if ' > ' in previous_section_hierarchy:
+            title = previous_section_hierarchy.split(' > ')[-1]
+        else:
+            title = previous_section_hierarchy
+
+        # Check whether this is a system variable/config that can be matched directly
+        cleaned_title = clean_title_for_matching(title)
+        if is_system_variable_or_config(cleaned_title):
+            thread_safe_print(f"   šŸŽÆ Direct matching for system var/config: {cleaned_title}")
+
+            # Direct matching for system variables
+            temp_source = {point_info['previous_section_line']: previous_section_hierarchy}
+            matched_dict, failed_matches, skipped_sections = find_direct_matches_for_special_files(
+                temp_source, target_hierarchy, target_lines
+            )
+
+            if matched_dict:
+                # Get the first (and should be the only) matched target line
+                target_line = list(matched_dict.keys())[0]
+
+                # Find the end of this section
+                target_line_num = int(target_line)
+                insertion_after_line = find_section_end_line(target_line_num, target_hierarchy, target_lines)
+
+                target_insertion_points[group_key] = {
+                    'insertion_after_line': insertion_after_line,
+                    'target_hierarchy': target_hierarchy.get(str(target_line_num), ''),
+                    'insertion_type': point_info['insertion_type'],
+                    'new_sections': point_info['new_sections']
+                }
+                thread_safe_print(f"   āœ… Direct match found, insertion after line {insertion_after_line}")
+                continue
+
+        # If this is not a system variable, or direct matching failed, use AI
+        thread_safe_print(f"   šŸ¤– Using AI mapping for: {cleaned_title}")
+
+        # Filter the target hierarchy for AI (remove system sections)
+        filtered_target_hierarchy = filter_non_system_sections(target_hierarchy)
+
+        # Check whether the filtered hierarchy is too large for AI,
+        # using the provided max_non_system_sections parameter
+        if len(filtered_target_hierarchy) > max_non_system_sections:
+            thread_safe_print(f"   āŒ Target hierarchy too large for AI: {len(filtered_target_hierarchy)} > {max_non_system_sections}")
+            continue
+
+        # Prepare the source for AI mapping
+        temp_source = {str(point_info['previous_section_line']): previous_section_hierarchy}
+
+        # Get the AI mapping
+        ai_response = get_corresponding_sections(
+            list(temp_source.values()),
+            list(filtered_target_hierarchy.values()),
+            ai_client,
+            repo_config['source_language'],
+            repo_config['target_language'],
+            max_tokens=20000  # Default value; this function does not accept max_tokens yet
+        )
+
+        if ai_response:
+            # Parse the AI response and find matching line numbers
+            ai_sections = parse_ai_response(ai_response)
+            ai_matched = find_matching_line_numbers(ai_sections, target_hierarchy)
+
+            if ai_matched and len(ai_matched) > 0:
+                # Get the first match (there is only one source section)
+                target_line = list(ai_matched.keys())[0]
+                target_line_num = int(target_line)
+
+                # Find the end of this section
+                insertion_after_line = find_section_end_line(target_line_num, target_hierarchy, target_lines)
+
+                target_insertion_points[group_key] = {
+                    'insertion_after_line': insertion_after_line,
+                    'target_hierarchy': target_hierarchy.get(target_line, ''),
+                    'insertion_type': point_info['insertion_type'],
+                    'new_sections': point_info['new_sections']
+                }
+                thread_safe_print(f"   āœ… AI match found, insertion after line {insertion_after_line}")
+            else:
+                thread_safe_print(f"   āŒ No AI matching sections found for: {previous_section_hierarchy}")
+        else:
+            thread_safe_print(f"   āŒ No AI response received for: {previous_section_hierarchy}")
+
+    return target_insertion_points
+
+def extract_hierarchies_from_diff_dict(source_diff_dict):
+    """Extract original_hierarchy from source_diff_dict for section matching"""
+    extracted_hierarchies = {}
+
+    for key, diff_info in source_diff_dict.items():
+        operation = diff_info.get('operation', '')
+        original_hierarchy = diff_info.get('original_hierarchy', '')
+
+        # Process all sections: modified, deleted, and added
+        if operation in ['modified', 'deleted', 'added'] and original_hierarchy:
+            # Use the key as the identifier for the hierarchy
+            extracted_hierarchies[key] = original_hierarchy
+
+    thread_safe_print(f"šŸ“„ Extracted {len(extracted_hierarchies)} hierarchies from source diff dict:")
+    for key, hierarchy in extracted_hierarchies.items():
+        thread_safe_print(f"   {key}: {hierarchy}")
+
+    return extracted_hierarchies
+
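+# Example of the assumed source_diff_dict shape (values are illustrative only):
+#   {"modified_1": {"operation": "modified",
+#                   "original_hierarchy": "# TiDB FAQ > ## Deployment",
+#                   "old_content": "...", "new_content": "..."}}
+# extract_hierarchies_from_diff_dict returns {"modified_1": "# TiDB FAQ > ## Deployment"}.
+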
+def match_source_diff_to_target(source_diff_dict, target_hierarchy, target_lines, ai_client, repo_config, max_non_system_sections=120, max_tokens=20000):
+    """
+    Match source_diff_dict original_hierarchy to target file sections.
+    Uses direct matching for system variables/config and AI matching for others.
+
+    Returns:
+        dict: Matched sections with enhanced information including:
+            - target_line: Line number in the target file
+            - target_hierarchy: Target section hierarchy
+            - insertion_type: For added sections only
+            - source_original_hierarchy: Original hierarchy from the source
+            - source_operation: Operation type (modified/added/deleted)
+            - source_old_content: Old content from the source diff
+            - source_new_content: New content from the source diff
+    """
+    thread_safe_print(f"šŸ”— Starting source diff to target matching...")
+
+    # Extract the hierarchies from the source diff dict
+    source_hierarchies = extract_hierarchies_from_diff_dict(source_diff_dict)
+
+    if not source_hierarchies:
+        thread_safe_print(f"āš ļø No hierarchies to match")
+        return {}
+
+    # Process sections in their original order to maintain consistency;
+    # use ordered dicts so the final matching results preserve that order
+    from collections import OrderedDict
+    all_matched_sections = OrderedDict()
+
+    # Categorize sections for the processing strategy while maintaining order
+    direct_match_sections = OrderedDict()
+    ai_match_sections = OrderedDict()
+    added_sections = OrderedDict()
+    bottom_sections = OrderedDict()  # Bottom sections need no matching
+
+    for key, hierarchy in source_hierarchies.items():
+        # Check if this is a bottom section (no matching needed)
+        if hierarchy.startswith('bottom-'):
+            bottom_sections[key] = hierarchy
+        # Check if this is an added section
+        elif key.startswith('added_'):
+            added_sections[key] = hierarchy
+        else:
+            # Extract the leaf title from the hierarchy for checking
+            leaf_title = hierarchy.split(' > ')[-1] if ' > ' in hierarchy else hierarchy
+
+            # Check if this section is suitable for direct matching
+            if (hierarchy == "frontmatter" or
+                    leaf_title.startswith('# ') or  # Top-level titles
+                    is_system_variable_or_config(leaf_title)):  # System variables/config
+                direct_match_sections[key] = hierarchy
+            else:
+                ai_match_sections[key] = hierarchy
+
+    thread_safe_print(f"šŸ“Š Section categorization:")
+    thread_safe_print(f"   šŸŽÆ Direct matching: {len(direct_match_sections)} sections")
+    thread_safe_print(f"   šŸ¤– AI matching: {len(ai_match_sections)} sections")
+    thread_safe_print(f"   āž• Added sections: {len(added_sections)} sections")
+    thread_safe_print(f"   šŸ”š Bottom sections: {len(bottom_sections)} sections (no matching needed)")
+
+    # Process each section in the original order
+    thread_safe_print(f"\nšŸ”„ Processing sections in original order...")
+
+    for key, hierarchy in source_hierarchies.items():
+        thread_safe_print(f"   šŸ” Processing {key}: {hierarchy}")
+
+        # Determine the processing strategy based on section type and content
+        if hierarchy.startswith('bottom-'):
+            # Bottom section: no matching needed, append to the end
+            thread_safe_print(f"   šŸ”š Bottom section - append to end of document")
+            result = {
+                "target_line": "-1",  # Special marker for bottom sections
+                "target_hierarchy": hierarchy  # Keep the bottom-xxx marker
+            }
+        elif key.startswith('added_'):
+            # Added section: find the insertion point
+            thread_safe_print(f"   āž• Added section - finding insertion point")
+            result = process_added_section(key, hierarchy, target_hierarchy, target_lines, ai_client, repo_config, max_non_system_sections, max_tokens)
+        else:
+            # Modified or deleted section: find the matching target section
+            operation = source_diff_dict[key].get('operation', 'unknown')
+            thread_safe_print(f"   {operation.capitalize()} section - finding target match")
+            result = process_modified_or_deleted_section(key, hierarchy, target_hierarchy, target_lines, ai_client, repo_config, max_non_system_sections, max_tokens)
+
+        if result:
+            # Add source language information from source_diff_dict
+            source_info = source_diff_dict.get(key, {})
+
+            # Extract the target content from target_lines
+            target_line = result.get('target_line', 'unknown')
+            target_content = ""
+            if target_line != 'unknown' and target_line != '0':
+                try:
+                    target_line_num = int(target_line)
+                    # For ALL operations, only extract the direct content (no sub-sections).
+                    # This avoids duplication when both parent and child sections have operations.
+                    target_content = extract_section_direct_content(target_line_num, target_lines)
+                except (ValueError, IndexError):
+                    target_content = ""
+            elif target_line == '0':
+                # For frontmatter, extract the content from the beginning to the first header
+                target_content = extract_frontmatter_content(target_lines)
+
+            enhanced_result = {
+                **result,  # Include the existing target matching info
+                'target_content': target_content,  # Add the target section content
+                'source_original_hierarchy': source_info.get('original_hierarchy', ''),
+                'source_operation': source_info.get('operation', ''),
+                'source_old_content': source_info.get('old_content', ''),
+                'source_new_content': source_info.get('new_content', '')
+            }
+            all_matched_sections[key] = enhanced_result
+            thread_safe_print(f"   āœ… {key}: -> line {target_line}")
+        else:
+            thread_safe_print(f"   āŒ {key}: matching failed")
+
+    thread_safe_print(f"\nšŸ“Š Final matching results: {len(all_matched_sections)} total matches")
+    return all_matched_sections
+
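+# A matched entry produced above looks roughly like this (field values are
+# illustrative; 'insertion_type' appears only for added sections):
+#   {"modified_1": {"target_line": "88", "target_hierarchy": "## Deployment",
+#                   "target_content": "## Deployment\n...",
+#                   "source_operation": "modified",
+#                   "source_old_content": "...", "source_new_content": "..."}}
+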
+def process_modified_or_deleted_section(key, hierarchy, target_hierarchy, target_lines, ai_client, repo_config, max_non_system_sections, max_tokens=20000):
+    """Process modified or deleted sections to find target matches"""
+    # Extract the leaf title from the hierarchy for checking
+    leaf_title = hierarchy.split(' > ')[-1] if ' > ' in hierarchy else hierarchy
+
+    # Check if this section is suitable for direct matching
+    if (hierarchy == "frontmatter" or
+            leaf_title.startswith('# ') or  # Top-level titles
+            is_system_variable_or_config(leaf_title)):  # System variables/config
+
+        if hierarchy == "frontmatter":
+            return {"target_line": "0", "target_hierarchy": "frontmatter"}
+
+        elif leaf_title.startswith('# '):
+            # Top-level title matching
+            temp_sections = {key: hierarchy}
+            matched_dict, failed_matches, skipped_sections = find_toplevel_title_matches(
+                temp_sections, target_lines
+            )
+            if matched_dict:
+                target_line = list(matched_dict.keys())[0]
+                # For top-level titles, add the # prefix to the hierarchy
+                return {
+                    "target_line": target_line,
+                    "target_hierarchy": f"# {matched_dict[target_line]}"
+                }
+
+        else:
+            # System variable/config matching
+            temp_sections = {key: hierarchy}
+            matched_dict, failed_matches, skipped_sections = find_direct_matches_for_special_files(
+                temp_sections, target_hierarchy, target_lines
+            )
+            if matched_dict:
+                target_line = list(matched_dict.keys())[0]
+                target_hierarchy_str = list(matched_dict.values())[0]
+
+                # Keep only the leaf title with the ## prefix, dropping the top-level title
+                if ' > ' in target_hierarchy_str:
+                    leaf_title = target_hierarchy_str.split(' > ')[-1]
+                    formatted_hierarchy = f"## {leaf_title}"
+                else:
+                    # Single level: add the ## prefix
+                    formatted_hierarchy = f"## {target_hierarchy_str}"
+
+                return {
+                    "target_line": target_line,
+                    "target_hierarchy": formatted_hierarchy
+                }
+    else:
+        # AI matching for non-system sections
+        filtered_target_hierarchy = filter_non_system_sections(target_hierarchy)
+
+        if len(filtered_target_hierarchy) <= max_non_system_sections:
+            temp_sections = {key: hierarchy}
+
+            ai_response = get_corresponding_sections(
+                list(temp_sections.values()),
+                list(filtered_target_hierarchy.values()),
+                ai_client,
+                repo_config['source_language'],
+                repo_config['target_language'],
+                max_tokens
+            )
+
+            if ai_response:
+                ai_sections = parse_ai_response(ai_response)
+                ai_matched = find_matching_line_numbers(ai_sections, target_hierarchy)
+
+                if ai_matched:
+                    target_line = list(ai_matched.keys())[0]
+                    target_hierarchy_str = list(ai_matched.values())[0]
+
+                    # Format the AI-matched hierarchy
+                    formatted_hierarchy = format_target_hierarchy(target_hierarchy_str)
+
+                    return {
+                        "target_line": target_line,
+                        "target_hierarchy": formatted_hierarchy
+                    }
+
+    return None
+
+def format_target_hierarchy(target_hierarchy_str):
+    """Format the target hierarchy while preserving the complete hierarchy structure"""
+    if target_hierarchy_str.startswith('#'):
+        # Already formatted (covers # and ## prefixes), return as-is
+        return target_hierarchy_str
+    elif ' > ' in target_hierarchy_str:
+        # Keep the complete hierarchy structure as-is
+        return target_hierarchy_str
+    else:
+        # Single level: add the ## prefix for compatibility
+        return f"## {target_hierarchy_str}"
+
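+# Expected behavior of format_target_hierarchy (doctest-style, illustrative inputs):
+#   >>> format_target_hierarchy("## `tidb_mem_quota_query`")
+#   '## `tidb_mem_quota_query`'
+#   >>> format_target_hierarchy("# Overview > ## Architecture")
+#   '# Overview > ## Architecture'
+#   >>> format_target_hierarchy("Architecture")
+#   '## Architecture'
+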
+def process_added_section(key, reference_hierarchy, target_hierarchy, target_lines, ai_client, repo_config, max_non_system_sections, max_tokens=20000):
+    """Process added sections to find insertion points"""
+    # For added sections, the hierarchy points to the next section (the one to insert before)
+    reference_leaf = reference_hierarchy.split(' > ')[-1] if ' > ' in reference_hierarchy else reference_hierarchy
+
+    if (reference_hierarchy == "frontmatter" or
+            reference_leaf.startswith('# ') or
+            is_system_variable_or_config(reference_leaf)):
+
+        # Use direct matching for the reference section
+        temp_reference = {f"ref_{key}": reference_hierarchy}
+
+        if reference_hierarchy == "frontmatter":
+            return {
+                "target_line": "0",
+                "target_hierarchy": "frontmatter",
+                "insertion_type": "before_reference"
+            }
+
+        elif reference_leaf.startswith('# '):
+            matched_dict, failed_matches, skipped_sections = find_toplevel_title_matches(
+                temp_reference, target_lines
+            )
+            if matched_dict:
+                target_line = list(matched_dict.keys())[0]
+                formatted_hierarchy = f"# {matched_dict[target_line]}"
+                return {
+                    "target_line": target_line,
+                    "target_hierarchy": formatted_hierarchy,
+                    "insertion_type": "before_reference"
+                }
+
+        else:
+            # System variable/config
+            matched_dict, failed_matches, skipped_sections = find_direct_matches_for_special_files(
+                temp_reference, target_hierarchy, target_lines
+            )
+            if matched_dict:
+                target_line = list(matched_dict.keys())[0]
+                target_hierarchy_str = list(matched_dict.values())[0]
+                formatted_hierarchy = format_target_hierarchy(target_hierarchy_str)
+                return {
+                    "target_line": target_line,
+                    "target_hierarchy": formatted_hierarchy,
+                    "insertion_type": "before_reference"
+                }
+    else:
+        # Use AI matching for the reference section
+        filtered_target_hierarchy = filter_non_system_sections(target_hierarchy)
+
+        if len(filtered_target_hierarchy) <= max_non_system_sections:
+            temp_reference = {f"ref_{key}": reference_hierarchy}
+
+            ai_response = get_corresponding_sections(
+                list(temp_reference.values()),
+                list(filtered_target_hierarchy.values()),
+                ai_client,
+                repo_config['source_language'],
+                repo_config['target_language'],
+                max_tokens
+            )
+
+            if ai_response:
+                ai_sections = parse_ai_response(ai_response)
+                ai_matched = find_matching_line_numbers(ai_sections, target_hierarchy)
+
+                if ai_matched:
+                    target_line = list(ai_matched.keys())[0]
+                    target_hierarchy_str = list(ai_matched.values())[0]
+                    formatted_hierarchy = format_target_hierarchy(target_hierarchy_str)
+                    return {
+                        "target_line": target_line,
+                        "target_hierarchy": formatted_hierarchy,
+                        "insertion_type": "before_reference"
+                    }
+
+    return None
+
+def extract_target_section_content(target_line_num, target_lines):
+    """Extract target section content from target_lines (includes sub-sections)"""
+    if target_line_num > len(target_lines):
+        return ""
+
+    start_line = target_line_num - 1  # Convert to a 0-based index
+
+    # Find the end of the section by looking for the next header
+    current_line = target_lines[start_line].strip()
+    if not current_line.startswith('#'):
+        return current_line
+
+    current_level = len(current_line.split()[0])  # Count the # characters
+    end_line = len(target_lines)  # Default to the end of the file
+
+    # For top-level headers (# level 1), stop at the first sublevel (## level 2).
+    # For other headers, stop at the same or a higher level.
+    if current_level == 1:
+        # Top-level header: stop at the first ## (level 2) or deeper header
+        for i in range(start_line + 1, len(target_lines)):
+            line = target_lines[i].strip()
+            if line.startswith('#'):
+                line_level = len(line.split()[0])
+                if line_level >= 2:  # Stop at ## or a deeper level
+                    end_line = i
+                    break
+    else:
+        # Sub-level header: stop at the same or a higher level (traditional behavior)
+        for i in range(start_line + 1, len(target_lines)):
+            line = target_lines[i].strip()
+            if line.startswith('#'):
+                line_level = len(line.split()[0])
+                if line_level <= current_level:
+                    end_line = i
+                    break
+
+    # Extract the content from start_line to end_line
+    section_content = '\n'.join(target_lines[start_line:end_line])
+    return section_content.strip()
+
+def extract_section_direct_content(target_line_num, target_lines):
+    """Extract ONLY the direct content of a section (excluding sub-sections)"""
+    if target_line_num > len(target_lines):
+        return ""
+
+    start_line = target_line_num - 1  # Convert to a 0-based index
+
+    # Find the end of the section by looking for the next header
+    current_line = target_lines[start_line].strip()
+    if not current_line.startswith('#'):
+        return current_line
+
+    end_line = len(target_lines)  # Default to the end of the file
+
+    # Only extract until the first header of ANY level: this keeps the direct
+    # content and stops at both sub-sections and same/higher-level sections
+    for i in range(start_line + 1, len(target_lines)):
+        line = target_lines[i].strip()
+        if line.startswith('#'):
+            end_line = i
+            break
+
+    # Extract the content from start_line to end_line
+    section_content = '\n'.join(target_lines[start_line:end_line])
+    return section_content.strip()
+
+def extract_frontmatter_content(target_lines):
+    """Extract the frontmatter content from the beginning to the first header"""
+    if not target_lines:
+        return ""
+
+    frontmatter_lines = []
+    for i, line in enumerate(target_lines):
+        line_stripped = line.strip()
+        # Stop when we hit the first top-level header
+        if line_stripped.startswith('# '):
+            break
+        frontmatter_lines.append(line.rstrip())
+
+    return '\n'.join(frontmatter_lines)
+
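+# Contrast between the two extractors above (hypothetical document):
+#   lines = ["## Parent", "text", "### Child", "more"]
+#   extract_target_section_content(1, lines)  -> "## Parent\ntext\n### Child\nmore"
+#   extract_section_direct_content(1, lines)  -> "## Parent\ntext"
+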
+def find_section_end_line(section_start_line, target_hierarchy, target_lines):
+    """Find the end line of a section to determine the insertion point (from auto-sync-pr-changes.py)"""
+
+    # Get the current section's level
+    current_section_line = target_lines[section_start_line - 1].strip()
+    current_level = len(current_section_line.split()[0]) if current_section_line.startswith('#') else 5
+
+    # Find the next section at the same level or higher (a lower number)
+    next_section_line = None
+    for line_num_str in sorted(target_hierarchy.keys(), key=int):
+        line_num = int(line_num_str)
+        if line_num > section_start_line:
+            # Check the level of this section
+            section_line = target_lines[line_num - 1].strip()
+            if section_line.startswith('#'):
+                section_level = len(section_line.split()[0])
+                if section_level <= current_level:
+                    next_section_line = line_num
+                    break
+
+    if next_section_line:
+        # Insert before the next same-level or higher-level section
+        return next_section_line - 1
+    else:
+        # This is the last section at this level; insert at the end of the file
+        return len(target_lines)
diff --git a/scripts/translate_doc_pr/toc_processor.py b/scripts/translate_doc_pr/toc_processor.py
new file mode 100644
index 0000000000000..71cce4a17f8bb
--- /dev/null
+++ b/scripts/translate_doc_pr/toc_processor.py
@@ -0,0 +1,434 @@
+"""
+TOC Processor Module
+Handles the special processing logic for TOC.md files
+"""
+
+import os
+import re
+import json
+import threading
+from github import Github
+from openai import OpenAI
+
+# Thread-safe printing
+print_lock = threading.Lock()
+
+def thread_safe_print(*args, **kwargs):
+    with print_lock:
+        print(*args, **kwargs)
+
+def extract_toc_link_from_line(line):
+    """Extract the link part (including parentheses) from a TOC line"""
+    # Pattern to match the [text](link) format
+    pattern = r'\[([^\]]+)\]\(([^)]+)\)'
+    match = re.search(pattern, line)
+    if match:
+        return f"({match.group(2)})"  # Return (link) including the parentheses
+    return None
+
+def is_toc_translation_needed(line):
+    """Check if a TOC line needs translation based on the content in square brackets"""
+    # Extract the content within square brackets [content]
+    pattern = r'\[([^\]]+)\]'
+    match = re.search(pattern, line)
+    if match:
+        content = match.group(1)
+        has_chinese = bool(re.search(r'[\u4e00-\u9fff]', content))
+        has_spaces = ' ' in content
+
+        # Translation is needed if the content has Chinese OR spaces;
+        # skip it only for alphanumeric/technical terms without spaces
+        return has_chinese or has_spaces
+    return True  # Default to translating if the line cannot be parsed
+
+def find_best_toc_match(target_link, target_lines, source_line_num):
+    """Find the best matching line in the target TOC based on link content and line proximity"""
+    matches = []
+
+    for i, line in enumerate(target_lines):
+        line_link = extract_toc_link_from_line(line.strip())
+        if line_link and line_link == target_link:
+            matches.append({
+                'line_num': i + 1,  # Convert to 1-based
+                'line': line.strip(),
+                'distance': abs((i + 1) - source_line_num)
+            })
+
+    if not matches:
+        return None
+
+    # Sort by distance to the source line number and choose the closest one
+    matches.sort(key=lambda x: x['distance'])
+    return matches[0]
+
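+# Doctest-style sketch for the helpers above (illustrative TOC lines):
+#   >>> extract_toc_link_from_line("  - [TiDB 简介](/overview.md)")
+#   '(/overview.md)'
+#   >>> is_toc_translation_needed("  - [TiDB 简介](/overview.md)")       # contains Chinese
+#   True
+#   >>> is_toc_translation_needed("  - [tidb_server](/tidb-server.md)")  # technical term
+#   False
+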
+def group_consecutive_lines(lines):
+    """Group consecutive lines together"""
+    if not lines:
+        return []
+
+    # Sort the lines by line number
+    sorted_lines = sorted(lines, key=lambda x: x['line_number'])
+
+    groups = []
+    current_group = [sorted_lines[0]]
+
+    for i in range(1, len(sorted_lines)):
+        current_line = sorted_lines[i]
+        prev_line = sorted_lines[i-1]
+
+        # Consider lines consecutive if they are within 2 lines of each other
+        if current_line['line_number'] - prev_line['line_number'] <= 2:
+            current_group.append(current_line)
+        else:
+            groups.append(current_group)
+            current_group = [current_line]
+
+    groups.append(current_group)
+    return groups
+
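+# Grouping sketch (assumed line-dict shape): lines at 3, 4, and 9 form two
+# groups because the jump from 4 to 9 exceeds 2:
+#   >>> group_consecutive_lines([{'line_number': 3}, {'line_number': 4}, {'line_number': 9}])
+#   [[{'line_number': 3}, {'line_number': 4}], [{'line_number': 9}]]
+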
+def process_toc_operations(file_path, operations, source_lines, target_lines, target_local_path):
+    """Process TOC.md file operations with the special TOC logic"""
+    thread_safe_print(f"\nšŸ“‹ Processing TOC.md with special logic...")
+
+    results = {
+        'added': [],
+        'modified': [],
+        'deleted': []
+    }
+
+    # Process deleted lines first
+    for deleted_line in operations['deleted_lines']:
+        if not deleted_line['is_header']:  # TOC lines are not headers
+            deleted_content = deleted_line['content']
+            deleted_link = extract_toc_link_from_line(deleted_content)
+
+            if deleted_link:
+                thread_safe_print(f"   šŸ—‘ļø Processing deleted TOC line with link: {deleted_link}")
+
+                # Find the matching line in the target
+                match = find_best_toc_match(deleted_link, target_lines, deleted_line['line_number'])
+                if match:
+                    thread_safe_print(f"   āœ… Found target line {match['line_num']}: {match['line']}")
+                    results['deleted'].append({
+                        'source_line': deleted_line['line_number'],
+                        'target_line': match['line_num'],
+                        'content': deleted_content
+                    })
+                else:
+                    thread_safe_print(f"   āŒ No matching line found for {deleted_link}")
+
+    # Process added lines
+    added_groups = group_consecutive_lines(operations['added_lines'])
+    for group in added_groups:
+        if group:  # Skip empty groups
+            first_added_line = group[0]
+            thread_safe_print(f"   āž• Processing added TOC group starting at line {first_added_line['line_number']}")
+
+            # Find the previous line in the source to determine the insertion point
+            previous_line_num = first_added_line['line_number'] - 1
+            if previous_line_num > 0 and previous_line_num <= len(source_lines):
+                previous_line_content = source_lines[previous_line_num - 1]
+                previous_link = extract_toc_link_from_line(previous_line_content)
+
+                if previous_link:
+                    thread_safe_print(f"   šŸ“ Previous line link: {previous_link}")
+
+                    # Find the matching previous line in the target
+                    match = find_best_toc_match(previous_link, target_lines, previous_line_num)
+                    if match:
+                        thread_safe_print(f"   āœ… Found target insertion point after line {match['line_num']}")
+
+                        # Process each line in the group
+                        for added_line in group:
+                            added_content = added_line['content']
+                            if is_toc_translation_needed(added_content):
+                                results['added'].append({
+                                    'source_line': added_line['line_number'],
+                                    'target_insertion_after': match['line_num'],
+                                    'content': added_content,
+                                    'needs_translation': True
+                                })
+                                thread_safe_print(f"   šŸ“ Added for translation: {added_content.strip()}")
+                            else:
+                                results['added'].append({
+                                    'source_line': added_line['line_number'],
+                                    'target_insertion_after': match['line_num'],
+                                    'content': added_content,
+                                    'needs_translation': False
+                                })
+                                thread_safe_print(f"   ā­ļø Added without translation: {added_content.strip()}")
+                    else:
+                        thread_safe_print(f"   āŒ No target insertion point found for {previous_link}")
+                else:
+                    thread_safe_print(f"   āŒ No link found in previous line: {previous_line_content.strip()}")
+
+    # Process modified lines
+    modified_groups = group_consecutive_lines(operations['modified_lines'])
+    for group in modified_groups:
+        if group:  # Skip empty groups
+            first_modified_line = group[0]
+            thread_safe_print(f"   āœļø Processing modified TOC group starting at line {first_modified_line['line_number']}")
+
+            # Find the previous line in the source to determine the target location
+            previous_line_num = first_modified_line['line_number'] - 1
+            if previous_line_num > 0 and previous_line_num <= len(source_lines):
+                previous_line_content = source_lines[previous_line_num - 1]
+                previous_link = extract_toc_link_from_line(previous_line_content)
+
+                if previous_link:
+                    thread_safe_print(f"   šŸ“ Previous line link: {previous_link}")
+
+                    # Find the matching previous line in the target
+                    match = find_best_toc_match(previous_link, target_lines, previous_line_num)
+                    if match:
+                        # Process each line in the group
+                        for modified_line in group:
+                            modified_content = modified_line['content']
+                            if is_toc_translation_needed(modified_content):
+                                results['modified'].append({
+                                    'source_line': modified_line['line_number'],
+                                    'target_line_context': match['line_num'],
+                                    'content': modified_content,
+                                    'needs_translation': True
+                                })
+                                thread_safe_print(f"   šŸ“ Modified for translation: {modified_content.strip()}")
+                            else:
+                                results['modified'].append({
+                                    'source_line': modified_line['line_number'],
+                                    'target_line_context': match['line_num'],
+                                    'content': modified_content,
+                                    'needs_translation': False
+                                })
+                                thread_safe_print(f"   ā­ļø Modified without translation: {modified_content.strip()}")
+                    else:
+                        thread_safe_print(f"   āŒ No target context found for {previous_link}")
+                else:
+                    thread_safe_print(f"   āŒ No link found in previous line: {previous_line_content.strip()}")
+
+    return results
+
+def find_toc_modification_line(mod_op, target_lines):
+    """Find the actual line number to modify in the target TOC based on context"""
+    # This helper locates the exact line to modify in the target TOC
+    # based on the modification operation's context
+
+    target_line_context = mod_op.get('target_line_context', 0)
+
+    # Look for the line after the context line that should be modified.
+    # This is a simplified approach; in practice, more sophisticated logic may be needed.
+    if target_line_context > 0 and target_line_context < len(target_lines):
+        # Check if the next line is the one to modify
+        return target_line_context + 1
+
+    return target_line_context
+
+def translate_toc_lines(toc_operations, ai_client, repo_config):
+    """Translate multiple TOC lines at once"""
+    lines_to_translate = []
+
+    # Collect all lines that need translation
+    for op in toc_operations:
+        if op.get('needs_translation', False):
+            lines_to_translate.append({
+                'operation_type': 'added' if 'target_insertion_after' in op else 'modified',
+                'content': op['content'],
+                'source_line': op['source_line']
+            })
+
+    if not lines_to_translate:
+        thread_safe_print(f"   ā­ļø No TOC lines need translation")
+        return {}
+
+    thread_safe_print(f"   šŸ¤– Translating {len(lines_to_translate)} TOC lines...")
+
+    # Prepare the content for AI translation
+    content_dict = {}
+    for i, line_info in enumerate(lines_to_translate):
+        content_dict[f"line_{i}"] = line_info['content']
+
+    source_lang = repo_config['source_language']
+    target_lang = repo_config['target_language']
+
+    prompt = f"""You are a professional translator. Please translate the following TOC (Table of Contents) lines from {source_lang} to {target_lang}.
+
+IMPORTANT INSTRUCTIONS:
+1. Preserve ALL formatting, indentation, spaces, and dashes exactly as they appear
+2. Only translate the text content within square brackets [text]
+3. Keep all markdown links, parentheses, and special characters unchanged
+4. Maintain the exact same indentation and spacing structure
+
+Input lines to translate:
+{json.dumps(content_dict, indent=2, ensure_ascii=False)}
+
+Please return the translated lines in the same JSON format, preserving all formatting and only translating the text within square brackets.
+
+Return format:
+{{
+    "line_0": "translated line with preserved formatting",
+    "line_1": "translated line with preserved formatting"
+}}"""
+
+    # Add token estimation
+    try:
+        from main import print_token_estimation
+        print_token_estimation(prompt, "TOC translation")
+    except ImportError:
+        # Fallback if the import fails: use tiktoken
+        try:
+            import tiktoken
+            enc = tiktoken.get_encoding("cl100k_base")
+            tokens = enc.encode(prompt)
+            actual_tokens = len(tokens)
+            char_count = len(prompt)
+            thread_safe_print(f"   šŸ’° TOC translation")
+            thread_safe_print(f"   šŸ“ Input: {char_count:,} characters")
+            thread_safe_print(f"   šŸ”¢ Actual tokens: {actual_tokens:,} (using tiktoken cl100k_base)")
+        except Exception:
+            # Final fallback: character-based approximation
+            estimated_tokens = len(prompt) // 4
+            char_count = len(prompt)
+            thread_safe_print(f"   šŸ’° TOC translation")
+            thread_safe_print(f"   šŸ“ Input: {char_count:,} characters")
+            thread_safe_print(f"   šŸ”¢ Estimated tokens: ~{estimated_tokens:,} (fallback: 4 chars/token approximation)")
+
+    try:
+        ai_response = ai_client.chat_completion(
+            messages=[{"role": "user", "content": prompt}],
+            temperature=0.1
+        )
+        thread_safe_print(f"   šŸ“ AI translation response received")
+
+        # Parse the AI response
+        try:
+            json_start = ai_response.find('{')
+            json_end = ai_response.rfind('}') + 1
+
+            if json_start != -1 and json_end > json_start:
+                json_str = ai_response[json_start:json_end]
+                translated_lines = json.loads(json_str)
+
+                # Map the translations back to the original operations
+                translation_mapping = {}
+                for i, line_info in enumerate(lines_to_translate):
+                    key = f"line_{i}"
+                    if key in translated_lines:
+                        translation_mapping[line_info['source_line']] = translated_lines[key]
+
+                thread_safe_print(f"   āœ… Successfully translated {len(translation_mapping)} TOC lines")
+                return translation_mapping
+
+            # No JSON object found in the response
+            thread_safe_print(f"   āŒ No JSON object found in AI translation response")
+            return {}
+
+        except json.JSONDecodeError as e:
+            thread_safe_print(f"   āŒ Failed to parse AI translation response: {e}")
+            return {}
+
+    except Exception as e:
+        thread_safe_print(f"   āŒ AI translation failed: {e}")
+        return {}
+
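+# Illustrative input/output for translate_toc_lines (values assumed): given one
+# operation {'source_line': 21, 'content': '- [ęµ‹čƕ](/test.md)',
+# 'needs_translation': True, 'target_insertion_after': 20}, a successful call
+# returns a mapping keyed by the source line, e.g. {21: '- [Test](/test.md)'}.
+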
+def process_toc_file(file_path, toc_data, pr_url, github_client, ai_client, repo_config):
+    """Process a single TOC.md file with the special TOC logic"""
+    thread_safe_print(f"\nšŸ“‹ Processing TOC file: {file_path}")
+
+    try:
+        target_local_path = repo_config['target_local_path']
+        target_file_path = os.path.join(target_local_path, file_path)
+
+        # Read the current target file
+        with open(target_file_path, 'r', encoding='utf-8') as f:
+            target_content = f.read()
+
+        target_lines = target_content.split('\n')
+        operations = toc_data['operations']
+
+        # Separate the operations by type
+        deleted_ops = [op for op in operations if 'target_line' in op]
+        added_ops = [op for op in operations if 'target_insertion_after' in op]
+        modified_ops = [op for op in operations if 'target_line_context' in op]
+
+        thread_safe_print(f"   šŸ“Š TOC operations: {len(deleted_ops)} deleted, {len(added_ops)} added, {len(modified_ops)} modified")
+
+        # Process deletions first (work backwards to maintain line numbers)
+        if deleted_ops:
+            thread_safe_print(f"   šŸ—‘ļø Processing {len(deleted_ops)} deletions...")
+            deleted_ops.sort(key=lambda x: x['target_line'], reverse=True)
+
+            for del_op in deleted_ops:
+                target_line_num = del_op['target_line'] - 1  # Convert to 0-based
+                if 0 <= target_line_num < len(target_lines):
+                    thread_safe_print(f"   āŒ Deleting line {del_op['target_line']}: {target_lines[target_line_num].strip()}")
+                    del target_lines[target_line_num]
+
+        # Process modifications
+        if modified_ops:
+            thread_safe_print(f"   āœļø Processing {len(modified_ops)} modifications...")
+
+            # Get translations for the operations that need them
+            translations = translate_toc_lines(modified_ops, ai_client, repo_config)
+
+            for mod_op in modified_ops:
+                target_line_num = find_toc_modification_line(mod_op, target_lines) - 1  # Convert to 0-based
+
+                if 0 <= target_line_num < len(target_lines):
+                    if mod_op.get('needs_translation', False) and mod_op['source_line'] in translations:
+                        new_content = translations[mod_op['source_line']]
+                        thread_safe_print(f"   āœļø Modifying line {target_line_num + 1} with translation")
+                    else:
+                        new_content = mod_op['content']
+                        thread_safe_print(f"   āœļø Modifying line {target_line_num + 1} without translation")
+
+                    target_lines[target_line_num] = new_content
+
+        # Process additions last
+        if added_ops:
+            thread_safe_print(f"   āž• Processing {len(added_ops)} additions...")
+
+            # Get translations for the operations that need them
+            translations = translate_toc_lines(added_ops, ai_client, repo_config)
+
+            # Sort the additions by insertion point and process them in reverse order
+            added_ops.sort(key=lambda x: x['target_insertion_after'], reverse=True)
+
+            for add_op in added_ops:
+                insertion_after = add_op['target_insertion_after']
+
+                if add_op.get('needs_translation', False) and add_op['source_line'] in translations:
+                    new_content = translations[add_op['source_line']]
+                    thread_safe_print(f"   āž• Inserting after line {insertion_after} with translation")
+                else:
+                    new_content = add_op['content']
+                    thread_safe_print(f"   āž• Inserting after line {insertion_after} without translation")
+
+                # Insert the new line
+                if insertion_after < len(target_lines):
+                    target_lines.insert(insertion_after, new_content)
+                else:
+                    target_lines.append(new_content)
+
+        # Write the updated content back to the file
+        updated_content = '\n'.join(target_lines)
+        with open(target_file_path, 'w', encoding='utf-8') as f:
+            f.write(updated_content)
+
+        thread_safe_print(f"   āœ… TOC file updated: {file_path}")
+
+    except Exception as e:
+        thread_safe_print(f"   āŒ Error processing TOC file {file_path}: {e}")
+
+def process_toc_files(toc_files, pr_url, github_client, ai_client, repo_config):
+    """Process all TOC files"""
+    if not toc_files:
+        return
+
+    thread_safe_print(f"\nšŸ“‹ Processing {len(toc_files)} TOC files...")
+
+    for file_path, toc_data in toc_files.items():
+        if toc_data['type'] == 'toc':
+            process_toc_file(file_path, toc_data, pr_url, github_client, ai_client, repo_config)
+        else:
+            thread_safe_print(f"   āš ļø Unknown TOC data type: {toc_data['type']} for {file_path}")
+
+    thread_safe_print(f"   āœ… All TOC files processed")
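+# Usage sketch (illustrative only; the real entry point lives in main_workflow.py,
+# and the exact shapes of toc_files and repo_config are assumptions here):
+#
+#   toc_files = {
+#       "TOC.md": {
+#           "type": "toc",
+#           "operations": [
+#               {"target_line": 12, "content": "- [旧条ē›®](/old.md)"},
+#               {"target_insertion_after": 20, "source_line": 21,
+#                "content": "- [ꖰ条ē›®](/new.md)", "needs_translation": True},
+#           ],
+#       },
+#   }
+#   process_toc_files(toc_files, pr_url, github_client, ai_client, repo_config)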