diff --git a/.env.sample b/.env.sample index c26cdce..c46b598 100644 --- a/.env.sample +++ b/.env.sample @@ -1,2 +1,26 @@ -GEMINI_PROJECT_ID= -GITHUB_TOKEN= \ No newline at end of file +# Google Gemini API Configuration +GEMINI_PROJECT_ID= +GEMINI_LOCATION=us-central1 +GEMINI_MODEL=gemini-2.5-pro-exp-03-25 +# Uncomment if using API key instead of project ID +# GEMINI_API_KEY= + +# Alternative LLM APIs (uncomment to use) +# ANTHROPIC_API_KEY= +# OPENAI_API_KEY= + +# GitHub API Configuration +GITHUB_TOKEN= + +# Logging Configuration +LOG_DIR=logs + +# Cache Configuration +CACHE_ENABLED=true +CACHE_FILE=llm_cache.json + +# Streamlit Configuration +STREAMLIT_SERVER_PORT=8501 +STREAMLIT_SERVER_HEADLESS=true +STREAMLIT_SERVER_ADDRESS=0.0.0.0 +STREAMLIT_BROWSER_GATHER_USAGE_STATS=false \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..c0c2b93 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,45 @@ +FROM python:3.10-slim + +WORKDIR /app + +# Install system dependencies including Git, bash, and PDF conversion tools +RUN apt-get update && apt-get install -y --no-install-recommends \ + git \ + bash \ + pandoc \ + wkhtmltopdf \ + texlive-xetex \ + texlive-fonts-recommended \ + texlive-plain-generic \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements first for better caching +COPY requirements.txt . + +# Install dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy the rest of the application +COPY . . + +# Create necessary directories with proper permissions +RUN mkdir -p logs output && chmod -R 777 logs output + +# Expose the Streamlit port +EXPOSE 8501 + +# Set environment variables +ENV PYTHONUNBUFFERED=1 +ENV PYTHONDONTWRITEBYTECODE=1 +ENV STREAMLIT_SERVER_PORT=8501 +ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0 +ENV STREAMLIT_SERVER_HEADLESS=true +ENV LOG_DIR=/app/logs +ENV CACHE_FILE=/app/llm_cache.json +ENV CACHE_ENABLED=true +ENV GIT_PYTHON_REFRESH=quiet +ENV OUTPUT_DIR=/app/output + +# Default command (can be overridden by docker-compose) +CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"] \ No newline at end of file diff --git a/README.md b/README.md index 789e367..063f9f1 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,29 @@ This is a tutorial project of [Pocket Flow](https://github.com/The-Pocket/Pocket ## 🚀 Getting Started +### Option 1: Using Docker (Recommended) + +1. Clone this repository + +2. Configure your environment variables in the `.env` file: + ```bash + # Copy the sample .env file + cp .env.sample .env + + # Edit the .env file with your credentials + # GEMINI_PROJECT_ID=your-project-id + # GITHUB_TOKEN=your-github-token + ``` + +3. Run the application using Docker Compose: + ```bash + docker-compose up -d + ``` + +4. Access the Streamlit web interface at http://localhost:8501 + +### Option 2: Manual Installation + 1. Clone this repository 2. Install dependencies: @@ -82,22 +105,22 @@ This is a tutorial project of [Pocket Flow](https://github.com/The-Pocket/Pocket ```bash python utils/call_llm.py ``` - -7. Generate a complete codebase tutorial by running the main script: - ```bash - # Analyze a GitHub repository - python main.py --repo https://github.com/username/repo --include "*.py" "*.js" --exclude "tests/*" --max-size 50000 - - # Or, analyze a local directory - python main.py --dir /path/to/your/codebase --include "*.py" --exclude "*test*" - ``` - - `--repo` or `--dir` - Specify either a GitHub repo URL or a local directory path (required, mutually exclusive) - - `-n, --name` - Project name (optional, derived from URL/directory if omitted) - - `-t, --token` - GitHub token (or set GITHUB_TOKEN environment variable) - - `-o, --output` - Output directory (default: ./output) - - `-i, --include` - Files to include (e.g., "*.py" "*.js") - - `-e, --exclude` - Files to exclude (e.g., "tests/*" "docs/*") - - `-s, --max-size` - Maximum file size in bytes (default: 100KB) +4. Run the Streamlit web interface: + ```bash + streamlit run app.py + ``` + + Or generate a complete codebase tutorial directly using the command line: + ```bash + python main.py https://github.com/username/repo --include "*.py" "*.js" --exclude "tests/*" --max-size 50000 + ``` + - `repo_url` - URL of the GitHub repository (required) + - `-n, --name` - Project name (optional, derived from URL if omitted) + - `-t, --token` - GitHub token (or set GITHUB_TOKEN environment variable) + - `-o, --output` - Output directory (default: ./output) + - `-i, --include` - Files to include (e.g., "*.py" "*.js") + - `-e, --exclude` - Files to exclude (e.g., "tests/*" "docs/*") + - `-s, --max-size` - Maximum file size in bytes (default: 100KB) The application will crawl the repository, analyze the codebase structure, generate tutorial content, and save the output in the specified directory (default: ./output). diff --git a/app.py b/app.py new file mode 100644 index 0000000..15a0093 --- /dev/null +++ b/app.py @@ -0,0 +1,447 @@ +import streamlit as st +import os +import dotenv +import tempfile +import json +import time +from flow import create_tutorial_flow +from utils.markdown_converter import markdown_to_html, markdown_to_pdf, create_combined_markdown, get_file_contents + +# Load environment variables +dotenv.load_dotenv() + +# Default file patterns +DEFAULT_INCLUDE_PATTERNS = { + "*.py", "*.js", "*.ts", "*.go", "*.java", "*.pyi", "*.pyx", + "*.c", "*.cc", "*.cpp", "*.h", "*.md", "*.rst", "Dockerfile", + "Makefile", "*.yaml", "*.yml" +} + +DEFAULT_EXCLUDE_PATTERNS = { + "*test*", "tests/*", "docs/*", "examples/*", "v1/*", + "dist/*", "build/*", "experimental/*", "deprecated/*", + "legacy/*", ".git/*", ".github/*" +} + +# Set page config +st.set_page_config( + page_title="Codebase Tutorial Generator", + page_icon="📚", + layout="wide" +) + +# Title and description +st.title("📚 Codebase Tutorial Generator") +st.markdown(""" +This app generates comprehensive tutorials for GitHub codebases using AI. +Simply provide a GitHub repository URL and customize the generation settings. +""") + +# Sidebar for configuration +with st.sidebar: + st.header("Configuration") + + # GitHub token input + github_token = st.text_input( + "GitHub Token (optional)", + value=os.environ.get("GITHUB_TOKEN", ""), + type="password", + help="Personal access token for GitHub API. Helps avoid rate limits." + ) + + # Output directory + output_dir = st.text_input( + "Output Directory", + value="output", + help="Directory where the tutorial will be saved" + ) + + # Advanced options + with st.expander("Advanced Options"): + # File size limit + max_file_size = st.number_input( + "Max File Size (bytes)", + value=100000, + min_value=1000, + help="Maximum file size to process (in bytes)" + ) + + # Include patterns + include_patterns_str = st.text_area( + "Include Patterns", + value="\n".join(DEFAULT_INCLUDE_PATTERNS), + help="File patterns to include (one per line)" + ) + + # Exclude patterns + exclude_patterns_str = st.text_area( + "Exclude Patterns", + value="\n".join(DEFAULT_EXCLUDE_PATTERNS), + help="File patterns to exclude (one per line)" + ) + +# Main form +with st.form("tutorial_form"): + # Repository URL + repo_url = st.text_input( + "GitHub Repository URL", + placeholder="https://github.com/username/repository", + help="URL of the public GitHub repository" + ) + + # Project name (optional) + project_name = st.text_input( + "Project Name (optional)", + help="Custom name for the project (derived from URL if omitted)" + ) + + # Submit button + submit_button = st.form_submit_button("Generate Tutorial") + +# Process form submission +if submit_button: + if not repo_url: + st.error("Please enter a GitHub repository URL") + else: + # Show progress + progress_bar = st.progress(0) + status_text = st.empty() + + # Parse include/exclude patterns + include_patterns = set(filter(None, include_patterns_str.split("\n"))) + exclude_patterns = set(filter(None, exclude_patterns_str.split("\n"))) + + # Initialize shared dictionary + shared = { + "repo_url": repo_url, + "project_name": project_name if project_name else None, + "github_token": github_token if github_token else os.environ.get("GITHUB_TOKEN"), + "output_dir": output_dir, + "include_patterns": include_patterns, + "exclude_patterns": exclude_patterns, + "max_file_size": max_file_size, + "files": [], + "abstractions": [], + "relationships": {}, + "chapter_order": [], + "chapters": [], + "final_output_dir": None + } + + try: + # Create and run the flow + status_text.text("Starting tutorial generation...") + progress_bar.progress(10) + + tutorial_flow = create_tutorial_flow() + + # Update status for each node + status_text.text("Fetching repository...") + progress_bar.progress(20) + + # Run the flow with progress updates + # Note: In a real implementation, you would need to modify the flow + # to provide progress updates or use callbacks + try: + result = tutorial_flow.run(shared) + + progress_bar.progress(100) + status_text.text("Tutorial generation complete!") + + # Display result + if result and result.get("final_output_dir"): + output_dir = result["final_output_dir"] + st.success(f"Tutorial generated successfully in: {output_dir}") + + # Check if output directory exists + if os.path.exists(output_dir) and os.path.isdir(output_dir): + st.markdown("### Tutorial Content") + files = sorted(os.listdir(output_dir)) + if files: + # Create tabs for each file plus a "Complete Tutorial" tab + tab_names = [f.replace('.md', '') for f in files] + tab_names.append("Complete Tutorial") + tabs = st.tabs(tab_names) + + # Prepare combined content for the complete tutorial + combined_content = "" + file_contents = {} + + # First, read all file contents + for file in files: + file_path = os.path.join(output_dir, file) + if os.path.isfile(file_path) and file.endswith('.md'): + try: + with open(file_path, "r", encoding="utf-8") as f: + file_contents[file] = f.read() + except Exception as e: + file_contents[file] = f"Error reading file: {str(e)}" + + # Process individual tabs + for i, file in enumerate(files): + if file in file_contents: + content = file_contents[file] + + # Display the content in the corresponding tab + with tabs[i]: + # Add download button at the top + with open(os.path.join(output_dir, file), "rb") as f: + st.download_button( + label=f"Download {file}", + data=f, + file_name=file, + mime="text/markdown" + ) + + # Display the markdown content + st.markdown(content) + + # Create combined content using our utility function + combined_content, combined_file_path = create_combined_markdown( + file_contents, + os.path.join(output_dir, "complete_tutorial.md") + ) + + # Display the complete tutorial tab + with tabs[-1]: + if combined_content and combined_file_path: + # Add download buttons for markdown + with open(combined_file_path, "rb") as f: + st.download_button( + label="Download Complete Tutorial (Markdown)", + data=f, + file_name="complete_tutorial.md", + mime="text/markdown" + ) + + # Convert to HTML for better rendering + html_content = markdown_to_html(combined_content) + if html_content: + # Save HTML file + html_file_path = os.path.join(output_dir, "complete_tutorial.html") + with open(html_file_path, "w", encoding="utf-8") as f: + f.write(html_content) + + # Add download button for HTML + with open(html_file_path, "rb") as f: + st.download_button( + label="Download Complete Tutorial (HTML)", + data=f, + file_name="complete_tutorial.html", + mime="text/html" + ) + + # Convert to PDF and add PDF download button + try: + with st.spinner("Converting to PDF..."): + # Create a file for the PDF + pdf_file_path = os.path.join(output_dir, "complete_tutorial.pdf") + + # Convert markdown to PDF + pdf_path = markdown_to_pdf(combined_content, pdf_file_path) + + if pdf_path and os.path.exists(pdf_path): + with open(pdf_path, "rb") as f: + st.download_button( + label="Download Complete Tutorial (PDF)", + data=f, + file_name="complete_tutorial.pdf", + mime="application/pdf" + ) + else: + st.warning("PDF conversion failed. Please download the HTML or markdown version instead.") + except Exception as e: + st.warning(f"PDF conversion failed: {str(e)}. Please download the HTML or markdown version instead.") + + # Display the combined content with proper rendering + st.markdown("## Complete Tutorial") + st.markdown("This tab shows all chapters combined into a single document.") + + # Use HTML display for better rendering of Mermaid diagrams + if html_content: + st.components.v1.html(html_content, height=800, scrolling=True) + else: + # Fallback to regular markdown display + st.markdown(combined_content) + else: + st.error("Failed to create combined tutorial content.") + else: + st.info("No files found in the output directory.") + else: + # If the directory doesn't exist, try to find it in the output base directory + output_base_dir = shared.get("output_dir", "output") + project_name = shared.get("project_name", "") + + # Try to find the project directory in the output base directory + if os.path.exists(output_base_dir) and os.path.isdir(output_base_dir): + project_dirs = [d for d in os.listdir(output_base_dir) + if os.path.isdir(os.path.join(output_base_dir, d))] + + if project_name and project_name in project_dirs: + # Found the project directory + actual_output_dir = os.path.join(output_base_dir, project_name) + st.success(f"Found output directory at: {actual_output_dir}") + + # List files for download and viewing + st.markdown("### Tutorial Content") + files = sorted(os.listdir(actual_output_dir)) + if files: + # Create tabs for each file plus a "Complete Tutorial" tab + tab_names = [f.replace('.md', '') for f in files] + tab_names.append("Complete Tutorial") + tabs = st.tabs(tab_names) + + # Prepare combined content for the complete tutorial + combined_content = "" + file_contents = {} + + # First, read all file contents + for file in files: + file_path = os.path.join(actual_output_dir, file) + if os.path.isfile(file_path) and file.endswith('.md'): + try: + with open(file_path, "r", encoding="utf-8") as f: + file_contents[file] = f.read() + except Exception as e: + file_contents[file] = f"Error reading file: {str(e)}" + + # Process individual tabs + for i, file in enumerate(files): + if file in file_contents: + content = file_contents[file] + + # Display the content in the corresponding tab + with tabs[i]: + # Add download button at the top + with open(os.path.join(actual_output_dir, file), "rb") as f: + st.download_button( + label=f"Download {file}", + data=f, + file_name=file, + mime="text/markdown" + ) + + # Display the markdown content + st.markdown(content) + + # Create combined content using our utility function + combined_content, combined_file_path = create_combined_markdown( + file_contents, + os.path.join(actual_output_dir, "complete_tutorial.md") + ) + + # Display the complete tutorial tab + with tabs[-1]: + if combined_content and combined_file_path: + # Add download buttons for markdown + with open(combined_file_path, "rb") as f: + st.download_button( + label="Download Complete Tutorial (Markdown)", + data=f, + file_name="complete_tutorial.md", + mime="text/markdown" + ) + + # Convert to HTML for better rendering + html_content = markdown_to_html(combined_content) + if html_content: + # Save HTML file + html_file_path = os.path.join(actual_output_dir, "complete_tutorial.html") + with open(html_file_path, "w", encoding="utf-8") as f: + f.write(html_content) + + # Add download button for HTML + with open(html_file_path, "rb") as f: + st.download_button( + label="Download Complete Tutorial (HTML)", + data=f, + file_name="complete_tutorial.html", + mime="text/html" + ) + + # Convert to PDF and add PDF download button + try: + with st.spinner("Converting to PDF..."): + # Create a file for the PDF + pdf_file_path = os.path.join(actual_output_dir, "complete_tutorial.pdf") + + # Convert markdown to PDF + pdf_path = markdown_to_pdf(combined_content, pdf_file_path) + + if pdf_path and os.path.exists(pdf_path): + with open(pdf_path, "rb") as f: + st.download_button( + label="Download Complete Tutorial (PDF)", + data=f, + file_name="complete_tutorial.pdf", + mime="application/pdf" + ) + else: + st.warning("PDF conversion failed. Please download the HTML or markdown version instead.") + except Exception as e: + st.warning(f"PDF conversion failed: {str(e)}. Please download the HTML or markdown version instead.") + + # Display the combined content with proper rendering + st.markdown("## Complete Tutorial") + st.markdown("This tab shows all chapters combined into a single document.") + + # Use HTML display for better rendering of Mermaid diagrams + if html_content: + st.components.v1.html(html_content, height=800, scrolling=True) + else: + # Fallback to regular markdown display + st.markdown(combined_content) + else: + st.error("Failed to create combined tutorial content.") + else: + st.info("No files found in the output directory.") + else: + # List all available project directories + if project_dirs: + st.warning(f"Output directory '{output_dir}' not found, but found these project directories:") + for dir_name in project_dirs: + dir_path = os.path.join(output_base_dir, dir_name) + st.info(f"- {dir_path}") + else: + st.warning(f"Output directory '{output_dir}' not found and no project directories found in {output_base_dir}") + else: + st.warning(f"Output directory not found or not accessible: {output_dir}") + else: + # Try to find any output directories + output_base_dir = shared.get("output_dir", "output") + if os.path.exists(output_base_dir) and os.path.isdir(output_base_dir): + project_dirs = [d for d in os.listdir(output_base_dir) + if os.path.isdir(os.path.join(output_base_dir, d))] + if project_dirs: + st.success("Tutorial generation completed! Found these output directories:") + for dir_name in project_dirs: + dir_path = os.path.join(output_base_dir, dir_name) + st.info(f"- {dir_path}") + else: + st.warning(f"Tutorial generation completed but no output directories found in {output_base_dir}") + else: + st.warning("Tutorial generation completed but output directory not found.") + except Exception as e: + progress_bar.progress(100) + status_text.text("Tutorial generation failed!") + st.error(f"Error generating tutorial: {str(e)}") + st.exception(e) + + except Exception as e: + st.error(f"Error generating tutorial: {str(e)}") + st.exception(e) + +# Display information about the app +st.markdown("---") +st.markdown(""" +### How it works +1. The app clones the GitHub repository +2. It analyzes the codebase structure and identifies key abstractions +3. It determines relationships between components +4. It generates tutorial chapters in a logical order +5. Finally, it combines everything into a comprehensive tutorial + +### Requirements +- A public GitHub repository +- Google Gemini API access (configured via environment variables) +""") \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..9f05d7a --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,39 @@ +version: '3' + +services: + tutorial-generator: + build: + context: . + dockerfile: Dockerfile + ports: + - "8501:8501" + volumes: + - ./output:/app/output + - ./logs:/app/logs + - ./llm_cache.json:/app/llm_cache.json + env_file: + - .env + environment: + - PYTHONUNBUFFERED=1 + - PYTHONDONTWRITEBYTECODE=1 + - STREAMLIT_SERVER_PORT=8501 + - STREAMLIT_SERVER_ADDRESS=0.0.0.0 + - STREAMLIT_SERVER_HEADLESS=true + - LOG_DIR=/app/logs + - CACHE_FILE=/app/llm_cache.json + - CACHE_ENABLED=true + - OUTPUT_DIR=/app/output + restart: unless-stopped + # Ensure the container has write permissions to the output directory + user: "${UID:-1000}:${GID:-1000}" + # Create output directory with proper permissions + command: > + bash -c " + mkdir -p /app/output && + chmod -R 777 /app/output && + mkdir -p /app/logs && + chmod -R 777 /app/logs && + touch /app/llm_cache.json && + chmod 666 /app/llm_cache.json && + streamlit run app.py --server.port=8501 --server.address=0.0.0.0 + " \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 06253bc..3ecba93 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,8 @@ gitpython>=3.1.0 google-cloud-aiplatform>=1.25.0 google-genai>=1.9.0 python-dotenv>=1.0.0 +streamlit>=1.32.0 +markdown>=3.4.0 +pdfkit>=1.0.0 +weasyprint>=59.0 +pymdown-extensions>=10.0.0 diff --git a/test_markdown_converter.py b/test_markdown_converter.py new file mode 100644 index 0000000..51c7232 --- /dev/null +++ b/test_markdown_converter.py @@ -0,0 +1,55 @@ +import os +from utils.markdown_converter import markdown_to_html, markdown_to_pdf, create_combined_markdown, get_file_contents + +# Test directory +output_dir = "output/GIM-BACK" + +# If the directory doesn't exist, try to find it +if not os.path.exists(output_dir): + output_base_dir = "output" + if os.path.exists(output_base_dir) and os.path.isdir(output_base_dir): + project_dirs = [d for d in os.listdir(output_base_dir) + if os.path.isdir(os.path.join(output_base_dir, d))] + print(f"Available project directories: {project_dirs}") + if project_dirs: + output_dir = os.path.join(output_base_dir, project_dirs[0]) + print(f"Using first available directory: {output_dir}") + +# Check if output directory exists +if os.path.exists(output_dir) and os.path.isdir(output_dir): + print(f"Output directory exists: {output_dir}") + + # Get file contents + file_contents = get_file_contents(output_dir, '.md') + print(f"Found {len(file_contents)} markdown files") + + # Create combined markdown + combined_content, combined_file_path = create_combined_markdown( + file_contents, + os.path.join(output_dir, "test_combined.md") + ) + + if combined_content and combined_file_path: + print(f"Created combined markdown file: {combined_file_path}") + + # Convert to HTML + html_content = markdown_to_html(combined_content) + if html_content: + html_file_path = os.path.join(output_dir, "test_combined.html") + with open(html_file_path, "w", encoding="utf-8") as f: + f.write(html_content) + print(f"Created HTML file: {html_file_path}") + else: + print("Failed to convert to HTML") + + # Convert to PDF + pdf_file_path = os.path.join(output_dir, "test_combined.pdf") + pdf_path = markdown_to_pdf(combined_content, pdf_file_path) + if pdf_path and os.path.exists(pdf_path): + print(f"Created PDF file: {pdf_path}") + else: + print("Failed to convert to PDF") + else: + print("Failed to create combined markdown") +else: + print(f"Output directory does not exist: {output_dir}") \ No newline at end of file diff --git a/test_markdown_render.py b/test_markdown_render.py new file mode 100644 index 0000000..1da6b76 --- /dev/null +++ b/test_markdown_render.py @@ -0,0 +1,37 @@ +import os + +# Test directory +output_dir = "output/GIM-BACK" + +# If the directory doesn't exist, try to find it +if not os.path.exists(output_dir): + output_base_dir = "output" + if os.path.exists(output_base_dir) and os.path.isdir(output_base_dir): + project_dirs = [d for d in os.listdir(output_base_dir) + if os.path.isdir(os.path.join(output_base_dir, d))] + print(f"Available project directories: {project_dirs}") + if project_dirs: + output_dir = os.path.join(output_base_dir, project_dirs[0]) + print(f"Using first available directory: {output_dir}") + +# Check if output directory exists +if os.path.exists(output_dir) and os.path.isdir(output_dir): + print(f"Output directory exists: {output_dir}") + + # List files in the directory + files = sorted(os.listdir(output_dir)) + print(f"Files in directory: {files}") + + # Read and print the content of each file + for file in files: + file_path = os.path.join(output_dir, file) + if os.path.isfile(file_path): + try: + with open(file_path, "r", encoding="utf-8") as f: + content = f.read() + print(f"\n--- {file} ---") + print(f"First 100 characters: {content[:100]}...") + except Exception as e: + print(f"Error reading file {file}: {str(e)}") +else: + print(f"Output directory does not exist: {output_dir}") \ No newline at end of file diff --git a/test_output_dir.py b/test_output_dir.py new file mode 100644 index 0000000..24a1619 --- /dev/null +++ b/test_output_dir.py @@ -0,0 +1,47 @@ +import os + +# Test directory detection logic +output_base_dir = "output" +project_name = "GIM-BACK" + +# Test with non-existent directory +non_existent_dir = "output/NON-EXISTENT" +print(f"\nTesting with non-existent directory: {non_existent_dir}") +if os.path.exists(non_existent_dir) and os.path.isdir(non_existent_dir): + print(f"Directory exists: {non_existent_dir}") +else: + print(f"Directory does not exist: {non_existent_dir}") + + # Try to find it in the output base directory + if os.path.exists(output_base_dir) and os.path.isdir(output_base_dir): + project_dirs = [d for d in os.listdir(output_base_dir) + if os.path.isdir(os.path.join(output_base_dir, d))] + + print(f"Available project directories: {project_dirs}") + else: + print(f"Output base directory does not exist: {output_base_dir}") + +print("\nTesting with existing directory:") + +print(f"Checking if output directory exists: {output_base_dir}") +if os.path.exists(output_base_dir) and os.path.isdir(output_base_dir): + print(f"Output base directory exists: {output_base_dir}") + + # List all directories in the output base directory + project_dirs = [d for d in os.listdir(output_base_dir) + if os.path.isdir(os.path.join(output_base_dir, d))] + + print(f"Found project directories: {project_dirs}") + + if project_name and project_name in project_dirs: + # Found the project directory + actual_output_dir = os.path.join(output_base_dir, project_name) + print(f"Found project directory: {actual_output_dir}") + + # List files in the project directory + files = os.listdir(actual_output_dir) + print(f"Files in project directory: {files}") + else: + print(f"Project directory '{project_name}' not found in {output_base_dir}") +else: + print(f"Output base directory does not exist: {output_base_dir}") \ No newline at end of file diff --git a/utils/call_llm.py b/utils/call_llm.py index 0d794b4..ffa37af 100644 --- a/utils/call_llm.py +++ b/utils/call_llm.py @@ -17,11 +17,16 @@ file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) logger.addHandler(file_handler) -# Simple cache configuration -cache_file = "llm_cache.json" +# Cache configuration from environment variables +cache_file = os.getenv("CACHE_FILE", "llm_cache.json") +cache_enabled = os.getenv("CACHE_ENABLED", "true").lower() == "true" -# By default, we Google Gemini 2.5 pro, as it shows great performance for code understanding -def call_llm(prompt: str, use_cache: bool = True) -> str: +# By default, we use Google Gemini 2.5 pro, as it shows great performance for code understanding +def call_llm(prompt: str, use_cache: bool = None) -> str: + # Determine if cache should be used (parameter overrides environment variable) + if use_cache is None: + use_cache = cache_enabled + # Log the prompt logger.info(f"PROMPT: {prompt}") @@ -33,55 +38,63 @@ def call_llm(prompt: str, use_cache: bool = True) -> str: try: with open(cache_file, 'r') as f: cache = json.load(f) - except: - logger.warning(f"Failed to load cache, starting with empty cache") + except Exception as e: + logger.warning(f"Failed to load cache, starting with empty cache: {e}") # Return from cache if exists if prompt in cache: - logger.info(f"RESPONSE: {cache[prompt]}") + logger.info(f"RESPONSE (cached): {cache[prompt]}") return cache[prompt] # Call the LLM if not in cache or cache disabled - client = genai.Client( - vertexai=True, - # TODO: change to your own project id and location - project=os.getenv("GEMINI_PROJECT_ID", "your-project-id"), - location=os.getenv("GEMINI_LOCATION", "us-central1") - ) - # You can comment the previous line and use the AI Studio key instead: - # client = genai.Client( - # api_key=os.getenv("GEMINI_API_KEY", "your-api_key"), - # ) - model = os.getenv("GEMINI_MODEL", "gemini-2.5-pro-exp-03-25") - response = client.models.generate_content( - model=model, - contents=[prompt] - ) - response_text = response.text - - # Log the response - logger.info(f"RESPONSE: {response_text}") - - # Update cache if enabled - if use_cache: - # Load cache again to avoid overwrites - cache = {} - if os.path.exists(cache_file): + try: + # Check if using API key or Vertex AI + api_key = os.getenv("GEMINI_API_KEY") + if api_key: + # Use API key authentication + client = genai.Client(api_key=api_key) + else: + # Use Vertex AI authentication + client = genai.Client( + vertexai=True, + project=os.getenv("GEMINI_PROJECT_ID", "your-project-id"), + location=os.getenv("GEMINI_LOCATION", "us-central1") + ) + + model = os.getenv("GEMINI_MODEL", "gemini-2.5-pro-exp-03-25") + response = client.models.generate_content( + model=model, + contents=[prompt] + ) + response_text = response.text + + # Log the response + logger.info(f"RESPONSE: {response_text}") + + # Update cache if enabled + if use_cache: + # Load cache again to avoid overwrites + cache = {} + if os.path.exists(cache_file): + try: + with open(cache_file, 'r') as f: + cache = json.load(f) + except Exception as e: + logger.warning(f"Failed to reload cache: {e}") + + # Add to cache and save + cache[prompt] = response_text try: - with open(cache_file, 'r') as f: - cache = json.load(f) - except: - pass + with open(cache_file, 'w') as f: + json.dump(cache, f) + except Exception as e: + logger.error(f"Failed to save cache: {e}") - # Add to cache and save - cache[prompt] = response_text - try: - with open(cache_file, 'w') as f: - json.dump(cache, f) - except Exception as e: - logger.error(f"Failed to save cache: {e}") + return response_text - return response_text + except Exception as e: + logger.error(f"Error calling Gemini API: {e}") + raise Exception(f"Failed to generate content with Gemini: {e}") # # Use Anthropic Claude 3.7 Sonnet Extended Thinking # def call_llm(prompt, use_cache: bool = True): diff --git a/utils/markdown_converter.py b/utils/markdown_converter.py new file mode 100644 index 0000000..e829b9c --- /dev/null +++ b/utils/markdown_converter.py @@ -0,0 +1,266 @@ +import os +import tempfile +import subprocess +import logging +import base64 +from pathlib import Path + +logger = logging.getLogger(__name__) + +def markdown_to_html(markdown_content): + """ + Convert markdown content to HTML with proper rendering of code blocks and Mermaid diagrams. + + Args: + markdown_content (str): The markdown content to convert + + Returns: + str: The HTML content + """ + try: + # Create a temporary file for the markdown content + with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as temp_md: + temp_md.write(markdown_content) + temp_md_path = temp_md.name + + # Create a temporary file for the HTML output + with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as temp_html: + temp_html_path = temp_html.name + + # Convert markdown to HTML using pandoc + cmd = [ + 'pandoc', + temp_md_path, + '-o', temp_html_path, + '--standalone', + '--highlight-style=tango', + '--toc', + '--toc-depth=3', + '--number-sections', + '-f', 'markdown+yaml_metadata_block+raw_html+fenced_divs+mermaid', + '--embed-resources', + '--mathjax', + '--template=default', + '--css', 'https://cdn.jsdelivr.net/npm/github-markdown-css/github-markdown.min.css', + '--include-in-header', '-' + ] + + # Add Mermaid script to the header + mermaid_script = """ + + + + """ + + # Run the command with the Mermaid script as input to --include-in-header + process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + _, stderr = process.communicate(input=mermaid_script) + + if process.returncode != 0: + logger.error(f"Error converting markdown to HTML: {stderr}") + return None + + # Read the HTML content + with open(temp_html_path, 'r', encoding='utf-8') as f: + html_content = f.read() + + # Clean up temporary files + os.unlink(temp_md_path) + os.unlink(temp_html_path) + + return html_content + + except Exception as e: + logger.error(f"Error in markdown_to_html: {str(e)}") + return None + +def create_combined_markdown(files_dict, output_path=None): + """ + Combine multiple markdown files into a single markdown file. + + Args: + files_dict (dict): Dictionary mapping filenames to their content + output_path (str, optional): Path to save the combined markdown file + + Returns: + tuple: (combined_content, output_path) + """ + try: + # Start with index.md if it exists + combined_content = "" + if 'index.md' in files_dict: + combined_content += files_dict['index.md'] + "\n\n---\n\n" + + # Add all numbered files in order + numbered_files = sorted([f for f in files_dict.keys() + if f.startswith(('0', '1', '2', '3', '4', '5', '6', '7', '8', '9')) + and f.endswith('.md')]) + + for file in numbered_files: + combined_content += files_dict[file] + "\n\n---\n\n" + + # Add any remaining files + for file in files_dict: + if file != 'index.md' and file not in numbered_files and file.endswith('.md'): + combined_content += files_dict[file] + "\n\n---\n\n" + + # Save to file if output_path is provided + if output_path: + os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True) + with open(output_path, 'w', encoding='utf-8') as f: + f.write(combined_content) + + return combined_content, output_path + + except Exception as e: + logger.error(f"Error in create_combined_markdown: {str(e)}") + return None, None + +def html_to_pdf(html_content, output_path=None): + """ + Convert HTML content to PDF using wkhtmltopdf. + + Args: + html_content (str): The HTML content to convert + output_path (str, optional): Path to save the PDF + + Returns: + str: The path to the generated PDF + """ + try: + # Create a temporary file for the HTML content + with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as temp_html: + temp_html.write(html_content) + temp_html_path = temp_html.name + + # Create a temporary file for the PDF output if not provided + if output_path is None: + temp_pdf = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) + temp_pdf.close() + output_path = temp_pdf.name + + # Convert HTML to PDF using wkhtmltopdf + cmd = [ + 'wkhtmltopdf', + '--enable-local-file-access', + '--javascript-delay', '1000', # Wait for JavaScript to execute (for Mermaid) + '--no-stop-slow-scripts', + '--margin-top', '20', + '--margin-right', '20', + '--margin-bottom', '20', + '--margin-left', '20', + '--page-size', 'A4', + '--encoding', 'UTF-8', + '--footer-center', '[page]/[topage]', + temp_html_path, + output_path + ] + + # Run the command + process = subprocess.run(cmd, capture_output=True, text=True) + + # Clean up temporary files + os.unlink(temp_html_path) + + if process.returncode != 0: + logger.error(f"Error converting HTML to PDF: {process.stderr}") + return None + + return output_path + + except Exception as e: + logger.error(f"Error in html_to_pdf: {str(e)}") + return None + +def markdown_to_pdf(markdown_content, output_path=None): + """ + Convert markdown content to PDF. + + Args: + markdown_content (str): The markdown content to convert + output_path (str, optional): Path to save the PDF + + Returns: + str: The path to the generated PDF + """ + # Convert markdown to HTML + html_content = markdown_to_html(markdown_content) + if not html_content: + return None + + # Convert HTML to PDF + return html_to_pdf(html_content, output_path) + +def get_file_contents(directory, file_pattern=None): + """ + Get the contents of all files in a directory. + + Args: + directory (str): The directory to search + file_pattern (str, optional): A pattern to match filenames + + Returns: + dict: Dictionary mapping filenames to their content + """ + try: + files_dict = {} + for file in os.listdir(directory): + if file_pattern and not file.endswith(file_pattern): + continue + + file_path = os.path.join(directory, file) + if os.path.isfile(file_path): + try: + with open(file_path, 'r', encoding='utf-8') as f: + files_dict[file] = f.read() + except Exception as e: + logger.error(f"Error reading file {file}: {str(e)}") + + return files_dict + + except Exception as e: + logger.error(f"Error in get_file_contents: {str(e)}") + return {} \ No newline at end of file diff --git a/utils/markdown_to_pdf.py b/utils/markdown_to_pdf.py new file mode 100644 index 0000000..a2e2215 --- /dev/null +++ b/utils/markdown_to_pdf.py @@ -0,0 +1,404 @@ +import os +import subprocess +import tempfile +import logging + +logger = logging.getLogger(__name__) + +def markdown_to_pdf(markdown_content, output_path=None): + """ + Convert markdown content to PDF using pandoc. + + Args: + markdown_content (str): The markdown content to convert + output_path (str, optional): The path to save the PDF. If None, a temporary file will be created. + + Returns: + str: The path to the generated PDF file + """ + try: + # Create a temporary file for the markdown content + with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as temp_md: + temp_md.write(markdown_content) + temp_md_path = temp_md.name + + # Create a temporary file for the PDF output if not provided + if output_path is None: + temp_pdf = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) + temp_pdf.close() + output_path = temp_pdf.name + + # Create a CSS file for styling + css_content = """ + body { + font-family: 'Arial', sans-serif; + line-height: 1.6; + max-width: 800px; + margin: 0 auto; + padding: 20px; + } + h1, h2, h3, h4, h5, h6 { + color: #333; + margin-top: 24px; + margin-bottom: 16px; + } + h1 { + font-size: 2em; + border-bottom: 1px solid #eaecef; + padding-bottom: 0.3em; + } + h2 { + font-size: 1.5em; + border-bottom: 1px solid #eaecef; + padding-bottom: 0.3em; + } + code { + font-family: 'Courier New', Courier, monospace; + background-color: #f6f8fa; + padding: 0.2em 0.4em; + border-radius: 3px; + } + pre { + background-color: #f6f8fa; + border-radius: 3px; + padding: 16px; + overflow: auto; + } + pre code { + background-color: transparent; + padding: 0; + } + blockquote { + border-left: 4px solid #dfe2e5; + padding: 0 1em; + color: #6a737d; + } + table { + border-collapse: collapse; + width: 100%; + margin-bottom: 16px; + } + table, th, td { + border: 1px solid #dfe2e5; + } + th, td { + padding: 6px 13px; + } + th { + background-color: #f6f8fa; + } + img { + max-width: 100%; + } + hr { + height: 0.25em; + padding: 0; + margin: 24px 0; + background-color: #e1e4e8; + border: 0; + } + """ + + with tempfile.NamedTemporaryFile(mode='w', suffix='.css', delete=False) as temp_css: + temp_css.write(css_content) + temp_css_path = temp_css.name + + # Convert markdown to PDF using pandoc + cmd = [ + 'pandoc', + temp_md_path, + '-o', output_path, + '--pdf-engine=xelatex', + '-V', 'geometry:margin=1in', + '--highlight-style=tango', + '--standalone', + '--css', temp_css_path, + '--toc', # Table of contents + '--toc-depth=3', + '--number-sections', + '-V', 'colorlinks=true', + '-V', 'linkcolor=blue', + '-V', 'urlcolor=blue', + '-V', 'toccolor=blue', + '-f', 'markdown+yaml_metadata_block+raw_html+fenced_divs+mermaid', + '--embed-resources', + '--standalone' + ] + + # Run the command + result = subprocess.run(cmd, capture_output=True, text=True) + + # Clean up temporary files + os.unlink(temp_md_path) + os.unlink(temp_css_path) + + if result.returncode != 0: + logger.error(f"Error converting markdown to PDF: {result.stderr}") + # Try alternative method with weasyprint + return _markdown_to_pdf_weasyprint(markdown_content, output_path) + + return output_path + + except Exception as e: + logger.error(f"Error in markdown_to_pdf: {str(e)}") + # Try alternative method with weasyprint + return _markdown_to_pdf_weasyprint(markdown_content, output_path) + +def _markdown_to_pdf_weasyprint(markdown_content, output_path): + """ + Alternative method to convert markdown to PDF using WeasyPrint. + """ + try: + import markdown + from weasyprint import HTML, CSS + from pymdownx.superfences import SuperFencesExtension + from pymdownx.highlight import HighlightExtension + + # Convert markdown to HTML + html = markdown.markdown( + markdown_content, + extensions=[ + 'extra', + 'toc', + 'tables', + 'fenced_code', + 'codehilite', + HighlightExtension(css_class='highlight'), + SuperFencesExtension(custom_fences=[ + {'name': 'mermaid', 'class': 'mermaid', 'format': lambda x, y, z: f'
{x}
'} + ]) + ] + ) + + # Add CSS for styling + css_content = """ + body { + font-family: Arial, sans-serif; + line-height: 1.6; + margin: 2cm; + } + h1, h2, h3, h4, h5, h6 { + color: #333; + margin-top: 24px; + margin-bottom: 16px; + } + h1 { + font-size: 2em; + border-bottom: 1px solid #eaecef; + padding-bottom: 0.3em; + } + h2 { + font-size: 1.5em; + border-bottom: 1px solid #eaecef; + padding-bottom: 0.3em; + } + code { + font-family: monospace; + background-color: #f6f8fa; + padding: 0.2em 0.4em; + border-radius: 3px; + } + pre { + background-color: #f6f8fa; + border-radius: 3px; + padding: 16px; + overflow: auto; + } + pre code { + background-color: transparent; + padding: 0; + } + blockquote { + border-left: 4px solid #dfe2e5; + padding: 0 1em; + color: #6a737d; + } + table { + border-collapse: collapse; + width: 100%; + margin-bottom: 16px; + } + table, th, td { + border: 1px solid #dfe2e5; + } + th, td { + padding: 6px 13px; + } + th { + background-color: #f6f8fa; + } + img { + max-width: 100%; + } + hr { + height: 0.25em; + padding: 0; + margin: 24px 0; + background-color: #e1e4e8; + border: 0; + } + .highlight .hll { background-color: #ffffcc } + .highlight .c { color: #999988; font-style: italic } /* Comment */ + .highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */ + .highlight .k { color: #000000; font-weight: bold } /* Keyword */ + .highlight .o { color: #000000; font-weight: bold } /* Operator */ + .highlight .cm { color: #999988; font-style: italic } /* Comment.Multiline */ + .highlight .cp { color: #999999; font-weight: bold; font-style: italic } /* Comment.Preproc */ + .highlight .c1 { color: #999988; font-style: italic } /* Comment.Single */ + .highlight .cs { color: #999999; font-weight: bold; font-style: italic } /* Comment.Special */ + .highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */ + .highlight .ge { color: #000000; font-style: italic } /* Generic.Emph */ + .highlight .gr { color: #aa0000 } /* Generic.Error */ + .highlight .gh { color: #999999 } /* Generic.Heading */ + .highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */ + .highlight .go { color: #888888 } /* Generic.Output */ + .highlight .gp { color: #555555 } /* Generic.Prompt */ + .highlight .gs { font-weight: bold } /* Generic.Strong */ + .highlight .gu { color: #aaaaaa } /* Generic.Subheading */ + .highlight .gt { color: #aa0000 } /* Generic.Traceback */ + .highlight .kc { color: #000000; font-weight: bold } /* Keyword.Constant */ + .highlight .kd { color: #000000; font-weight: bold } /* Keyword.Declaration */ + .highlight .kn { color: #000000; font-weight: bold } /* Keyword.Namespace */ + .highlight .kp { color: #000000; font-weight: bold } /* Keyword.Pseudo */ + .highlight .kr { color: #000000; font-weight: bold } /* Keyword.Reserved */ + .highlight .kt { color: #445588; font-weight: bold } /* Keyword.Type */ + .highlight .m { color: #009999 } /* Literal.Number */ + .highlight .s { color: #d01040 } /* Literal.String */ + .highlight .na { color: #008080 } /* Name.Attribute */ + .highlight .nb { color: #0086B3 } /* Name.Builtin */ + .highlight .nc { color: #445588; font-weight: bold } /* Name.Class */ + .highlight .no { color: #008080 } /* Name.Constant */ + .highlight .nd { color: #3c5d5d; font-weight: bold } /* Name.Decorator */ + .highlight .ni { color: #800080 } /* Name.Entity */ + .highlight .ne { color: #990000; font-weight: bold } /* Name.Exception */ + .highlight .nf { color: #990000; font-weight: bold } /* Name.Function */ + .highlight .nl { color: #990000; font-weight: bold } /* Name.Label */ + .highlight .nn { color: #555555 } /* Name.Namespace */ + .highlight .nt { color: #000080 } /* Name.Tag */ + .highlight .nv { color: #008080 } /* Name.Variable */ + .highlight .ow { color: #000000; font-weight: bold } /* Operator.Word */ + .highlight .w { color: #bbbbbb } /* Text.Whitespace */ + .highlight .mf { color: #009999 } /* Literal.Number.Float */ + .highlight .mh { color: #009999 } /* Literal.Number.Hex */ + .highlight .mi { color: #009999 } /* Literal.Number.Integer */ + .highlight .mo { color: #009999 } /* Literal.Number.Oct */ + .highlight .sb { color: #d01040 } /* Literal.String.Backtick */ + .highlight .sc { color: #d01040 } /* Literal.String.Char */ + .highlight .sd { color: #d01040 } /* Literal.String.Doc */ + .highlight .s2 { color: #d01040 } /* Literal.String.Double */ + .highlight .se { color: #d01040 } /* Literal.String.Escape */ + .highlight .sh { color: #d01040 } /* Literal.String.Heredoc */ + .highlight .si { color: #d01040 } /* Literal.String.Interpol */ + .highlight .sx { color: #d01040 } /* Literal.String.Other */ + .highlight .sr { color: #009926 } /* Literal.String.Regex */ + .highlight .s1 { color: #d01040 } /* Literal.String.Single */ + .highlight .ss { color: #990073 } /* Literal.String.Symbol */ + .highlight .bp { color: #999999 } /* Name.Builtin.Pseudo */ + .highlight .vc { color: #008080 } /* Name.Variable.Class */ + .highlight .vg { color: #008080 } /* Name.Variable.Global */ + .highlight .vi { color: #008080 } /* Name.Variable.Instance */ + .highlight .il { color: #009999 } /* Literal.Number.Integer.Long */ + """ + + # Create a complete HTML document + complete_html = f""" + + + + + Tutorial + + + + {html} + + + """ + + # Convert HTML to PDF + HTML(string=complete_html).write_pdf(output_path) + + return output_path + + except Exception as e: + logger.error(f"Error in _markdown_to_pdf_weasyprint: {str(e)}") + # Try another alternative method with pdfkit + return _markdown_to_pdf_pdfkit(markdown_content, output_path) + +def _markdown_to_pdf_pdfkit(markdown_content, output_path): + """ + Another alternative method to convert markdown to PDF using pdfkit (wkhtmltopdf). + """ + try: + import markdown + import pdfkit + + # Convert markdown to HTML + html = markdown.markdown( + markdown_content, + extensions=['extra', 'toc', 'tables', 'fenced_code', 'codehilite'] + ) + + # Add CSS for styling + css_content = """ + body { + font-family: Arial, sans-serif; + line-height: 1.6; + margin: 2cm; + } + h1, h2, h3, h4, h5, h6 { + color: #333; + } + code { + font-family: monospace; + background-color: #f6f8fa; + padding: 0.2em 0.4em; + border-radius: 3px; + } + pre { + background-color: #f6f8fa; + border-radius: 3px; + padding: 16px; + overflow: auto; + } + """ + + # Create a complete HTML document + complete_html = f""" + + + + + Tutorial + + + + {html} + + + """ + + # Convert HTML to PDF + options = { + 'page-size': 'A4', + 'margin-top': '2cm', + 'margin-right': '2cm', + 'margin-bottom': '2cm', + 'margin-left': '2cm', + 'encoding': 'UTF-8', + 'no-outline': None, + 'enable-local-file-access': None + } + + pdfkit.from_string(complete_html, output_path, options=options) + + return output_path + + except Exception as e: + logger.error(f"Error in _markdown_to_pdf_pdfkit: {str(e)}") + # If all methods fail, return None + return None \ No newline at end of file