Merged
134 changes: 134 additions & 0 deletions .github/workflows/sync-docs-cn-to-en.yml
@@ -0,0 +1,134 @@
name: Sync Docs Changes from ZH PR to EN PR

on:
  workflow_dispatch:
    inputs:
      source_pr_url:
        description: 'Source PR URL (Chinese docs repository)'
        required: true
        type: string
        default: ''
      target_pr_url:
        description: 'Target PR URL (English docs repository)'
        required: true
        type: string
        default: ''
      ai_provider:
        description: 'AI Provider to use for translation'
        required: false
        type: choice
        options:
          - deepseek
          - gemini
        default: 'gemini'

Comment on lines +3 to +24

🛠️ Refactor suggestion | 🟠 Major

Declare least-privilege permissions for GITHUB_TOKEN.

Explicitly set permissions to enable contents and PR writes; current default may be read-only.

Apply:

 on:
   workflow_dispatch:
     inputs:
       source_pr_url:
         description: 'Source PR URL (Chinese docs repository)'
         required: true
         type: string
         default: ''
@@
         default: 'gemini'
 
+permissions:
+  contents: write
+  pull-requests: write
🤖 Prompt for AI Agents
.github/workflows/sync-docs-cn-to-en.yml around lines 3 to 24: the workflow
lacks an explicit permissions block for GITHUB_TOKEN, which can default to
read-only; add a top-level permissions section granting least-privilege write
access needed for the job (e.g., contents: write and pull-requests: write) so
the workflow can update repository contents and create/update PRs, ensuring no
broader permissions are granted.

jobs:
  sync-docs:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout current repository
        uses: actions/checkout@v4
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'
Comment on lines +37 to +39

⚠️ Potential issue | 🟠 Major

Update setup-python to v5 (actionlint failure).

Use actions/setup-python@v5 to avoid runner errors.

Apply:

-      - name: Set up Python
-        uses: actions/setup-python@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
         with:
           python-version: '3.9'
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.9'
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.9'
🧰 Tools
🪛 actionlint (1.7.7)

37-37: the runner of "actions/setup-python@v4" action is too old to run on GitHub Actions. update the action's version to fix this issue

(action)

🤖 Prompt for AI Agents
.github/workflows/sync-docs-cn-to-en.yml around lines 37 to 39: the workflow
pins actions/setup-python@v4 which causes actionlint/runner errors; update the
workflow to use actions/setup-python@v5 by changing the action reference to
actions/setup-python@v5 and ensure the existing with: python-version setting
remains unchanged (no other logic changes needed).


      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r scripts/translate_doc_pr/requirements.txt

      - name: Extract PR information
        id: extract_info
        run: |
          # Extract source repo info
          SOURCE_URL="${{ github.event.inputs.source_pr_url }}"
          SOURCE_OWNER=$(echo $SOURCE_URL | cut -d'/' -f4)
          SOURCE_REPO=$(echo $SOURCE_URL | cut -d'/' -f5)
          SOURCE_PR=$(echo $SOURCE_URL | cut -d'/' -f7)

          # Extract target repo info
          TARGET_URL="${{ github.event.inputs.target_pr_url }}"
          TARGET_OWNER=$(echo $TARGET_URL | cut -d'/' -f4)
          TARGET_REPO=$(echo $TARGET_URL | cut -d'/' -f5)
          TARGET_PR=$(echo $TARGET_URL | cut -d'/' -f7)

          echo "source_owner=${SOURCE_OWNER}" >> $GITHUB_OUTPUT
          echo "source_repo=${SOURCE_REPO}" >> $GITHUB_OUTPUT
          echo "source_pr=${SOURCE_PR}" >> $GITHUB_OUTPUT
          echo "target_owner=${TARGET_OWNER}" >> $GITHUB_OUTPUT
          echo "target_repo=${TARGET_REPO}" >> $GITHUB_OUTPUT
          echo "target_pr=${TARGET_PR}" >> $GITHUB_OUTPUT

          echo "Source: ${SOURCE_OWNER}/${SOURCE_REPO}#${SOURCE_PR}"
          echo "Target: ${TARGET_OWNER}/${TARGET_REPO}#${TARGET_PR}"
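Aside: the `cut -d'/'` pipeline above relies on fixed URL positions (field 4 = owner, field 5 = repo, field 7 = PR number) and silently yields empty strings for malformed input. A minimal Python sketch of the same parsing with explicit validation (`parse_pr_url` is an illustrative name, not part of this PR):

```python
from urllib.parse import urlparse

def parse_pr_url(pr_url: str) -> tuple[str, str, str]:
    """Split a GitHub PR URL into (owner, repo, pr_number), rejecting malformed input."""
    # e.g. https://github.com/pingcap/docs-cn/pull/12345
    parts = urlparse(pr_url).path.strip("/").split("/")
    if len(parts) != 4 or parts[2] != "pull" or not parts[3].isdigit():
        raise ValueError(f"not a pull request URL: {pr_url}")
    return parts[0], parts[1], parts[3]
```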

      - name: Get target PR branch info
        id: target_branch
        run: |
          # Get target PR branch name
          TARGET_BRANCH=$(curl -s \
            -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
            -H "Accept: application/vnd.github.v3+json" \
            "https://api.github.com/repos/${{ steps.extract_info.outputs.target_owner }}/${{ steps.extract_info.outputs.target_repo }}/pulls/${{ steps.extract_info.outputs.target_pr }}" \
            | jq -r '.head.ref')

          echo "target_branch=${TARGET_BRANCH}" >> $GITHUB_OUTPUT
          echo "Target branch: ${TARGET_BRANCH}"

Comment on lines +71 to +83

⚠️ Potential issue | 🔴 Critical

Cross-repo PRs and forks: clone/push the PR head repository using a PAT; current flow will fail.

Issues:

  • You only fetch .head.ref but clone base repo; this breaks for forked PRs (branch won't exist in base).
  • GITHUB_TOKEN cannot push to another repo; use a PAT (or GitHub App) with contents:write on the head repo.
  • Commenting on the target PR also needs a token authorized on that repo.

Fix by retrieving head repo full name and cloning/pushing to it with a PAT, and by using the same PAT for API calls.

Suggested patch:

       - name: Get target PR branch info
         id: target_branch
         run: |
           # Get target PR branch name
-          TARGET_BRANCH=$(curl -s \
+          PR_JSON=$(curl -s \
             -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
-            -H "Accept: application/vnd.github.v3+json" \
+            -H "Accept: application/vnd.github+json" \
             "https://api.github.com/repos/${{ steps.extract_info.outputs.target_owner }}/${{ steps.extract_info.outputs.target_repo }}/pulls/${{ steps.extract_info.outputs.target_pr }}" \
-            | jq -r '.head.ref')
+          )
+          TARGET_BRANCH=$(echo "$PR_JSON" | jq -r '.head.ref')
+          TARGET_HEAD_REPO=$(echo "$PR_JSON" | jq -r '.head.repo.full_name')
+          echo "target_branch=${TARGET_BRANCH}" >> $GITHUB_OUTPUT
+          echo "target_head_repo=${TARGET_HEAD_REPO}" >> $GITHUB_OUTPUT
-          echo "target_branch=${TARGET_BRANCH}" >> $GITHUB_OUTPUT
-          echo "Target branch: ${TARGET_BRANCH}"
+          echo "Target branch: ${TARGET_BRANCH}"
+          echo "Head repo: ${TARGET_HEAD_REPO}"

       - name: Clone target repository
         run: |
-          # Clone target repository with the PR branch
-          git clone https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ steps.extract_info.outputs.target_owner }}/${{ steps.extract_info.outputs.target_repo }}.git target_repo
+          # Clone PR head repository (supports forks) with a PAT
+          git clone https://x-access-token:${{ secrets.SYNC_REPO_TOKEN }}@github.com/${{ steps.target_branch.outputs.target_head_repo }}.git target_repo
           cd target_repo
-          git checkout ${{ steps.target_branch.outputs.target_branch }}
+          git fetch origin ${{ steps.target_branch.outputs.target_branch }}
+          git checkout -B ${{ steps.target_branch.outputs.target_branch }} origin/${{ steps.target_branch.outputs.target_branch }}
           git config user.name "github-actions[bot]"
           git config user.email "github-actions[bot]@users.noreply.github.com"

       - name: Commit and push changes
         run: |
           cd target_repo
           git add .
           if git diff --staged --quiet; then
             echo "No changes to commit"
           else
             git commit -m "Auto-sync: Update English docs from Chinese PR ${{ github.event.inputs.source_pr_url }}
 
             Synced from: ${{ github.event.inputs.source_pr_url }}
             Target PR: ${{ github.event.inputs.target_pr_url }}
             AI Provider: ${{ github.event.inputs.ai_provider }}
 
             Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>"
 
-            git push origin ${{ steps.target_branch.outputs.target_branch }}
+            git push https://x-access-token:${{ secrets.SYNC_REPO_TOKEN }}@github.com/${{ steps.target_branch.outputs.target_head_repo }}.git ${{ steps.target_branch.outputs.target_branch }}
             echo "Changes pushed to target PR branch: ${{ steps.target_branch.outputs.target_branch }}"
           fi

       - name: Add comment to target PR
         run: |
           # Add a comment to the target PR about the sync
-          curl -X POST \
-            -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
+          curl -X POST \
+            -H "Authorization: token ${{ secrets.SYNC_REPO_TOKEN }}" \
             -H "Accept: application/vnd.github.v3+json" \
             "https://api.github.com/repos/${{ steps.extract_info.outputs.target_owner }}/${{ steps.extract_info.outputs.target_repo }}/issues/${{ steps.extract_info.outputs.target_pr }}/comments" \
             -d "{
               \"body\": \"🤖 **Auto-sync completed**\\n\\n📥 **Source PR**: ${{ github.event.inputs.source_pr_url }}\\n🎯 **Target PR**: ${{ github.event.inputs.target_pr_url }}\\n✅ English documentation has been updated based on Chinese documentation changes.\\n\\n_This comment was generated automatically by the sync workflow._\"
             }"

Also add a repo secret SYNC_REPO_TOKEN (PAT) with repo:write access on the head repo (or use a GitHub App).

Also applies to: 84-93, 125-134

🤖 Prompt for AI Agents
.github/workflows/sync-docs-cn-to-en.yml lines 71-83 (and similarly update
84-93, 125-134): the workflow currently reads only .head.ref and clones/pushes
the base repo using GITHUB_TOKEN which fails for forked PRs and prevents pushing
to the head repo; change the flow to fetch the head repo full_name (owner/repo)
and head.ref from the PR API, switch cloning/push operations to use that head
repo full_name, and replace GITHUB_TOKEN with a PAT stored in a repo secret
(e.g. SYNC_REPO_TOKEN with repo:write/contents scope) for git clone/push and for
any API calls (comments or other PR updates) so the token has permission on the
head repo; ensure the workflow uses the head repo URL when adding remotes and
pushing branches and update the other referenced blocks (84-93, 125-134) to use
the same head-repo + PAT approach.
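The fix above hinges on reading two fields from the `GET /pulls/{number}` response: `head.ref` (the PR branch) and `head.repo.full_name` (the repository to clone and push, which differs from the base repo when the PR comes from a fork). A small sketch of that extraction (`pr_head_info` and the sample payload are hypothetical, not code from this PR):

```python
import json

def pr_head_info(pr_json: str) -> tuple[str, str]:
    """Extract (head branch, head repo full name) from a GET /pulls/{number} response body."""
    pr = json.loads(pr_json)
    # head.repo.full_name is the repo to clone/push to; for forked PRs it
    # differs from the base repo the workflow currently clones.
    return pr["head"]["ref"], pr["head"]["repo"]["full_name"]

# Payload shaped like the API response the step queries with curl + jq:
sample = json.dumps({"head": {"ref": "sync-docs", "repo": {"full_name": "fork-owner/docs"}}})
```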

      - name: Clone target repository
        run: |
          # Clone target repository with the PR branch
          git clone https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ steps.extract_info.outputs.target_owner }}/${{ steps.extract_info.outputs.target_repo }}.git target_repo
          cd target_repo
          git checkout ${{ steps.target_branch.outputs.target_branch }}
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"

      - name: Run sync script
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          DEEPSEEK_API_TOKEN: ${{ secrets.DEEPSEEK_API_TOKEN }}
          GEMINI_API_TOKEN: ${{ secrets.GEMINI_API_TOKEN }}
          SOURCE_PR_URL: ${{ github.event.inputs.source_pr_url }}
          TARGET_PR_URL: ${{ github.event.inputs.target_pr_url }}
          AI_PROVIDER: ${{ github.event.inputs.ai_provider }}
          TARGET_REPO_PATH: ${{ github.workspace }}/target_repo
        run: |
          cd scripts/translate_doc_pr
          python main_workflow.py

      - name: Commit and push changes
        run: |
          cd target_repo
          git add .
          if git diff --staged --quiet; then
            echo "No changes to commit"
          else
            git commit -m "Auto-sync: Update English docs from Chinese PR ${{ github.event.inputs.source_pr_url }}

            Synced from: ${{ github.event.inputs.source_pr_url }}
            Target PR: ${{ github.event.inputs.target_pr_url }}
            AI Provider: ${{ github.event.inputs.ai_provider }}

            Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>"

            git push origin ${{ steps.target_branch.outputs.target_branch }}
            echo "Changes pushed to target PR branch: ${{ steps.target_branch.outputs.target_branch }}"
          fi

      - name: Add comment to target PR
        run: |
          # Add a comment to the target PR about the sync
          curl -X POST \
            -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
            -H "Accept: application/vnd.github.v3+json" \
            "https://api.github.com/repos/${{ steps.extract_info.outputs.target_owner }}/${{ steps.extract_info.outputs.target_repo }}/issues/${{ steps.extract_info.outputs.target_pr }}/comments" \
            -d "{
              \"body\": \"🤖 **Auto-sync completed**\\n\\n📥 **Source PR**: ${{ github.event.inputs.source_pr_url }}\\n🎯 **Target PR**: ${{ github.event.inputs.target_pr_url }}\\n✅ English documentation has been updated based on Chinese documentation changes.\\n\\n_This comment was generated automatically by the sync workflow._\"
            }"
22 changes: 22 additions & 0 deletions scripts/translate_doc_pr/__init__.py
@@ -0,0 +1,22 @@
#!/usr/bin/env python3
"""
Auto-Sync PR Changes - Refactored Modular Version

This package contains the refactored version of the auto-sync-pr-changes script,
split into logical modules for better maintainability and testing.

Modules:
- pr_analyzer: PR analysis, diff parsing, content getting, hierarchy building
- section_matcher: Section matching (direct matching + AI matching)
- file_adder: New file processing and translation
- file_deleter: Deleted file processing
- file_updater: Updated file processing and translation
- toc_processor: TOC file special processing
- main: Main orchestration function
"""

# Import main functionality for easy access
from main import main


critical

This import statement is incorrect and will cause a runtime error. There is no top-level main module in the standard library or in this project. Based on the project structure, you likely intended to import from main_workflow.py within this package. This should be a relative import: from .main_workflow import main.

This same incorrect import pattern (from main import ...) appears in several other files in this package and needs to be fixed throughout for the package to be importable and usable.


# Make main function available at package level
__all__ = ["main"]
Comment on lines +19 to +22

⚠️ Potential issue | 🟠 Major

Fix package export import path

import scripts.translate_doc_pr currently explodes with ModuleNotFoundError: No module named 'main' because the package init tries to do an absolute from main import main. The callable lives in main_workflow.py, so the package entry point is unusable until we point the import at the correct module.

-from main import main
+from .main_workflow import main
📝 Committable suggestion


Suggested change
-from main import main
+from .main_workflow import main
 # Make main function available at package level
 __all__ = ["main"]
🤖 Prompt for AI Agents
In scripts/translate_doc_pr/__init__.py around lines 19 to 22, the package init
does an absolute import from a non-existent module ("main"), causing
ModuleNotFoundError; replace the import with a relative import that points to
the actual callable (main_workflow.py) — i.e., import the main function using a
package-relative import and keep __all__ as ["main"] so importing
scripts.translate_doc_pr exposes the correct entrypoint.

193 changes: 193 additions & 0 deletions scripts/translate_doc_pr/file_adder.py
@@ -0,0 +1,193 @@
"""
File Adder Module
Handles processing and translation of newly added files
"""

import os
import re
import json
import threading
from github import Github
from openai import OpenAI

# Thread-safe printing
print_lock = threading.Lock()

def thread_safe_print(*args, **kwargs):
    with print_lock:
        print(*args, **kwargs)
Comment on lines +16 to +18


medium

The thread_safe_print function is duplicated across multiple files in this package (including file_deleter.py, file_updater.py, main_workflow.py, etc.). To improve maintainability and adhere to the DRY (Don't Repeat Yourself) principle, this function should be defined once in a shared utility module and imported where needed.
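A sketch of the shared module this comment suggests — a hypothetical `utils.py` inside the package, so each module can do `from .utils import thread_safe_print` instead of redefining the lock and function:

```python
# Hypothetical scripts/translate_doc_pr/utils.py (name is illustrative, not in this PR)
import threading

_print_lock = threading.Lock()

def thread_safe_print(*args, **kwargs):
    """Serialize print calls so concurrent workers don't interleave output."""
    with _print_lock:
        print(*args, **kwargs)
```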


def create_section_batches(file_content, max_lines_per_batch=200):
    """Create batches of file content for translation, respecting section boundaries"""
    lines = file_content.split('\n')

    # Find all section headers
    section_starts = []
    for i, line in enumerate(lines):
        line = line.strip()
        if line.startswith('#'):
            match = re.match(r'^(#{1,10})\s+(.+)', line)
            if match:
                section_starts.append(i + 1)  # 1-based line numbers

    # If no sections found, just batch by line count
    if not section_starts:
        batches = []
        for i in range(0, len(lines), max_lines_per_batch):
            batch_lines = lines[i:i + max_lines_per_batch]
            batches.append('\n'.join(batch_lines))
        return batches

    # Create batches respecting section boundaries
    batches = []
    current_batch_start = 0

    for i, section_start in enumerate(section_starts):
        section_start_idx = section_start - 1  # Convert to 0-based

        # Check if adding this section would exceed the line limit
        if (section_start_idx - current_batch_start) > max_lines_per_batch:
            # Close current batch at the previous section boundary
            if current_batch_start < section_start_idx:
                batch_lines = lines[current_batch_start:section_start_idx]
                batches.append('\n'.join(batch_lines))
            current_batch_start = section_start_idx

        # If this is the last section, or the next section would create a batch too large
        if i == len(section_starts) - 1:
            # Add remaining content as final batch
            batch_lines = lines[current_batch_start:]
            batches.append('\n'.join(batch_lines))
        else:
            next_section_start = section_starts[i + 1] - 1  # 0-based
            if (next_section_start - current_batch_start) > max_lines_per_batch:
                # Close current batch at current section boundary
                batch_lines = lines[current_batch_start:section_start_idx]
                if batch_lines:  # Only add non-empty batches
                    batches.append('\n'.join(batch_lines))
                current_batch_start = section_start_idx

    # Clean up any empty batches
    batches = [batch for batch in batches if batch.strip()]

    return batches
Comment on lines +20 to +73


medium

The logic in create_section_batches is quite complex and hard to follow, especially with the lookahead checks for the next section. This complexity increases the risk of bugs and makes the function difficult to maintain.

Consider refactoring this to a simpler, more straightforward algorithm. A simpler approach could be:

  1. Initialize an empty current_batch and a list of batches.
  2. Iterate through the sections (or lines if no sections).
  3. If adding the current section/line to current_batch does not exceed max_lines_per_batch, add it.
  4. Otherwise, add current_batch to batches, and start a new current_batch with the current section/line.
  5. After the loop, add the final current_batch if it's not empty.
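The five steps above can be sketched as a single greedy loop (`batch_by_sections` is an illustrative name, not the project's function):

```python
import re

def batch_by_sections(content: str, max_lines: int = 200):
    """Greedy batching: split on Markdown headers, pack whole sections until max_lines."""
    lines = content.split("\n")
    # Indices where sections begin; index 0 opens the first chunk even without a header.
    starts = [i for i, l in enumerate(lines) if re.match(r"^#{1,10}\s+\S", l.strip())]
    if 0 not in starts:
        starts = [0] + starts
    sections = [lines[s:e] for s, e in zip(starts, starts[1:] + [len(lines)])]

    batches, current = [], []
    for sec in sections:
        # Flush the current batch when adding this section would overflow it.
        if current and len(current) + len(sec) > max_lines:
            batches.append("\n".join(current))
            current = []
        current.extend(sec)  # an oversized single section still stays whole
    if current:
        batches.append("\n".join(current))
    return [b for b in batches if b.strip()]
```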


def translate_file_batch(batch_content, ai_client, source_language="English", target_language="Chinese"):
    """Translate a single batch of file content using AI"""
    if not batch_content.strip():
        return batch_content

    thread_safe_print(f" 🤖 Translating batch ({len(batch_content.split())} words)...")

    prompt = f"""You are a professional technical writer. Please translate the following {source_language} content to {target_language}.

IMPORTANT INSTRUCTIONS:
1. Preserve ALL Markdown formatting (headers, links, code blocks, tables, etc.)
2. Do NOT translate:
   - Code examples, SQL queries, configuration values
   - Technical terms like "TiDB", "TiKV", "PD", API names, etc.
   - File paths, URLs, and command line examples
   - Variable names and system configuration parameters
3. Translate only the descriptive text and explanations
4. Maintain the exact structure and indentation
5. Keep all special characters and formatting intact

Content to translate:
{batch_content}

Please provide the translated content maintaining all formatting and structure."""

    # Add token estimation
    try:
        from main import print_token_estimation
        print_token_estimation(prompt, "File addition translation")
    except ImportError:
        # Fallback if import fails - use tiktoken
Comment on lines +100 to +105

⚠️ Potential issue | 🟠 Major

Wrong module import: print_token_estimation is in main_workflow.py, not main.

The import always fails with ImportError, so the code silently takes the fallback path and the shared token statistics are never printed.

Apply:

-    try:
-        from main import print_token_estimation
+    try:
+        from main_workflow import print_token_estimation
         print_token_estimation(prompt, "File addition translation")
📝 Committable suggestion


Suggested change
     # Add token estimation
     try:
-        from main import print_token_estimation
+        from main_workflow import print_token_estimation
         print_token_estimation(prompt, "File addition translation")
     except ImportError:
         # Fallback if import fails - use tiktoken
🤖 Prompt for AI Agents
In scripts/translate_doc_pr/file_adder.py around lines 100 to 105, the code
incorrectly imports print_token_estimation from module main (which causes an
ImportError and always triggers the fallback); update the import to load
print_token_estimation from main_workflow (from main_workflow import
print_token_estimation) and keep the existing try/except fallback behavior so
token estimation is used when available and falls back to tiktoken only if the
import or call fails.

        try:
            import tiktoken
            enc = tiktoken.get_encoding("cl100k_base")
            tokens = enc.encode(prompt)
            actual_tokens = len(tokens)
            char_count = len(prompt)
            print(f" 💰 File addition translation")
            print(f" 📝 Input: {char_count:,} characters")
            print(f" 🔢 Actual tokens: {actual_tokens:,} (using tiktoken cl100k_base)")
        except Exception:
            # Final fallback to character approximation
            estimated_tokens = len(prompt) // 4
            char_count = len(prompt)
            print(f" 💰 File addition translation")
            print(f" 📝 Input: {char_count:,} characters")
            print(f" 🔢 Estimated tokens: ~{estimated_tokens:,} (fallback: 4 chars/token approximation)")
Comment on lines +101 to +121


medium

This block for token estimation is duplicated in several modules (file_updater.py, section_matcher.py, toc_processor.py). This duplicated code should be refactored into a single utility function in a shared module to avoid redundancy and make future changes easier.
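A sketch of the shared helper this comment suggests, folding the tiktoken path and the 4-chars/token fallback into one function (`estimate_tokens` is a hypothetical name; tiktoken is treated as an optional dependency):

```python
def estimate_tokens(text: str) -> int:
    """Rough token count: tiktoken's cl100k_base if available, else ~4 chars per token."""
    try:
        import tiktoken  # optional dependency; fall back if missing or failing
        return len(tiktoken.get_encoding("cl100k_base").encode(text))
    except Exception:
        return len(text) // 4

def print_token_estimation(text: str, label: str) -> None:
    """Single place for the repeated cost-report block."""
    print(f" 💰 {label}")
    print(f" 📝 Input: {len(text):,} characters")
    print(f" 🔢 Tokens: ~{estimate_tokens(text):,}")
```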


    try:
        translated_content = ai_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1
        )
        thread_safe_print(f" ✅ Batch translation completed")
        return translated_content

    except Exception as e:
        thread_safe_print(f" ❌ Batch translation failed: {e}")
        return batch_content  # Return original content if translation fails

def process_added_files(added_files, pr_url, github_client, ai_client, repo_config):
    """Process newly added files by translating and creating them in target repository"""
    if not added_files:
        thread_safe_print("\n📄 No new files to process")
        return

    thread_safe_print(f"\n📄 Processing {len(added_files)} newly added files...")

    target_local_path = repo_config['target_local_path']
    source_language = repo_config['source_language']
    target_language = repo_config['target_language']

    for file_path, file_content in added_files.items():
        thread_safe_print(f"\n📝 Processing new file: {file_path}")

        # Create target file path
        target_file_path = os.path.join(target_local_path, file_path)
        target_dir = os.path.dirname(target_file_path)

        # Create directory if it doesn't exist
        if not os.path.exists(target_dir):
            os.makedirs(target_dir, exist_ok=True)
            thread_safe_print(f" 📁 Created directory: {target_dir}")

        # Check if file already exists
        if os.path.exists(target_file_path):
            thread_safe_print(f" ⚠️ Target file already exists: {target_file_path}")
            continue

        # Create section batches for translation
        batches = create_section_batches(file_content, max_lines_per_batch=200)
        thread_safe_print(f" 📦 Created {len(batches)} batches for translation")

        # Translate each batch
        translated_batches = []
        for i, batch in enumerate(batches):
            thread_safe_print(f" 🔄 Processing batch {i+1}/{len(batches)}")
            translated_batch = translate_file_batch(
                batch,
                ai_client,
                source_language,
                target_language
            )
            translated_batches.append(translated_batch)

        # Combine translated batches
        translated_content = '\n'.join(translated_batches)

        # Write translated content to target file
        try:
            with open(target_file_path, 'w', encoding='utf-8') as f:
                f.write(translated_content)

            thread_safe_print(f" ✅ Created translated file: {target_file_path}")

        except Exception as e:
            thread_safe_print(f" ❌ Error creating file {target_file_path}: {e}")

    thread_safe_print(f"\n✅ Completed processing all new files")
45 changes: 45 additions & 0 deletions scripts/translate_doc_pr/file_deleter.py
@@ -0,0 +1,45 @@
"""
File Deleter Module
Handles processing of deleted files and deleted sections
"""

import os
import threading
from github import Github

# Thread-safe printing
print_lock = threading.Lock()

def thread_safe_print(*args, **kwargs):
    with print_lock:
        print(*args, **kwargs)

def process_deleted_files(deleted_files, github_client, repo_config):
    """Process deleted files by removing them from target repository"""
    if not deleted_files:
        thread_safe_print("\n🗑️ No files to delete")
        return

    thread_safe_print(f"\n🗑️ Processing {len(deleted_files)} deleted files...")

    target_local_path = repo_config['target_local_path']

    for file_path in deleted_files:
        thread_safe_print(f"\n🗑️ Processing deleted file: {file_path}")

        # Create target file path
        target_file_path = os.path.join(target_local_path, file_path)

        # Check if file exists in target
        if os.path.exists(target_file_path):
            try:
                os.remove(target_file_path)
                thread_safe_print(f" ✅ Deleted file: {target_file_path}")
            except Exception as e:
                thread_safe_print(f" ❌ Error deleting file {target_file_path}: {e}")
        else:
            thread_safe_print(f" ⚠️ Target file not found: {target_file_path}")

    thread_safe_print(f"\n✅ Completed processing deleted files")

# Section deletion logic moved to file_updater.py