generative-manim/docs.py at main · Rroqheo/generative-manim · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import os
import re
import time
import requests
import html2text
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

# Root of the documentation site to mirror. is_valid_url() only accepts URLs
# on this host whose path starts with this URL's path.
BASE_URL = "https://docs.manim.community/en/stable/"

# Directory that receives one Markdown file per crawled page.
# NOTE: the directory is created at import time, so merely importing this
# module touches the filesystem.
OUTPUT_DIR = "docs_md"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Shared session so that successive requests can reuse HTTP connections.
session = requests.Session()

def is_valid_url(url):
    """
    Tell whether `url` belongs to the documentation tree rooted at BASE_URL.

    A URL qualifies when it uses http or https, is served from the same host
    as BASE_URL, and its path begins with BASE_URL's path.
    """
    candidate = urlparse(url)
    root = urlparse(BASE_URL)

    # Reject early on the cheapest mismatches.
    if candidate.scheme not in ("http", "https"):
        return False
    if candidate.netloc != root.netloc:
        return False
    return candidate.path.startswith(root.path)

def url_to_local_path(url):
    """
    Convert a URL into a local file path that preserves the URL's folder structure.

    The part of the URL path that follows BASE_URL's path is placed under
    OUTPUT_DIR and ".md" is appended (even if the name already ends with
    ".html"). The query string and fragment are ignored.

    For example, a URL ending with:
        /_modules/manim/mobject/geometry/line.html
    will be saved as:
        docs_md/_modules/manim/mobject/geometry/line.html.md

    Directory-style URLs (the base URL itself, or any path ending in "/")
    have no file name of their own, so they are stored as "index.html.md"
    inside the corresponding directory.
    """
    base_path = urlparse(BASE_URL).path
    # Get the relative path after the base.
    rel_path = urlparse(url).path[len(base_path):].lstrip("/")
    # Without this, ".../reference/" would become the hidden, nameless file
    # "reference/.md".
    if not rel_path or rel_path.endswith("/"):
        rel_path += "index.html"
    return os.path.join(OUTPUT_DIR, rel_path) + ".md"

def convert_html_to_markdown(html_content):
    """
    Render an HTML string as Markdown with html2text.

    Links are kept in the output and lines are left unwrapped.
    """
    converter = html2text.HTML2Text()
    converter.body_width = 0  # a width of 0 turns line wrapping off
    converter.ignore_links = False
    markdown = converter.handle(html_content)
    return markdown

# Seconds to wait for the server before giving up on a page.
_REQUEST_TIMEOUT = 30


def _crawl_page(url):
    """
    Fetch one page, save it as Markdown, and return the links worth following.

    The returned list holds the page's in-scope links (as judged by
    is_valid_url) with fragments removed, de-duplicated, in document order.
    An empty list is returned when the page cannot be fetched.
    """
    print(f"Processing: {url}")

    try:
        # Without a timeout a stalled server would hang the crawl forever.
        response = session.get(url, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to get {url}: {e}")
        return []

    html_content = response.text
    soup = BeautifulSoup(html_content, "html.parser")

    # Extract only the first element with class "content"
    content_div = soup.find(class_="content")
    if content_div:
        content_html = str(content_div)
    else:
        print(f"No content div found in {url}; using full page.")
        content_html = html_content

    markdown = convert_html_to_markdown(content_html)

    # Determine the local file path and ensure its directory exists
    local_path = url_to_local_path(url)
    os.makedirs(os.path.dirname(local_path), exist_ok=True)

    with open(local_path, "w", encoding="utf-8") as f:
        f.write(markdown)
    print(f"Saved markdown to {local_path}")

    # A dict keeps insertion order while dropping repeated links.
    links = {}
    for link in soup.find_all("a", href=True):
        try:
            full_url = urljoin(url, link["href"])
            full_url = full_url.split("#")[0]  # remove any fragment identifier
            if is_valid_url(full_url):
                links[full_url] = None
        except ValueError:
            # A malformed href must not abort the whole crawl.
            continue
    return list(links)


def crawl(url, visited):
    """
    Crawl the documentation pages reachable from the given URL.

    Pages are visited depth-first, following each page's links in document
    order. The traversal uses an explicit stack rather than recursion, so a
    long chain of links cannot exceed Python's recursion limit.

    Args:
        url: Page to start from.
        visited: Set of URLs already processed. It is updated in place, and
            any URL already in it is skipped.
    """
    # Each stack entry iterates over the links still to follow from one page.
    pending = [iter((url,))]
    is_start_page = True

    while pending:
        target = next(pending[-1], None)
        if target is None:
            # Every link of the innermost page has been handled.
            pending.pop()
            continue
        if target in visited:
            continue
        if not is_start_page:
            time.sleep(0.1)  # be polite with a short delay
        is_start_page = False

        visited.add(target)
        pending.append(iter(_crawl_page(target)))

def combine_markdown_files(root_dir, output_file):
    """
    Recursively traverse root_dir and combine all .md files into one huge Markdown file.

    A heading structure (with '#' characters) is added based on the folder
    hierarchy: every sub-directory gets a heading whose level matches its
    depth, and every file gets a heading one level deeper than the directory
    that contains it. Within a directory, files come first (sorted by name),
    followed by sub-directories (sorted by name).

    If output_file itself lies inside root_dir it is skipped, so the combined
    document never includes the file that is being written.

    Args:
        root_dir: Directory tree containing the .md files.
        output_file: Path of the combined file; it is overwritten.
    """
    root_abs = os.path.abspath(root_dir)
    output_abs = os.path.abspath(output_file)

    with open(output_file, "w", encoding="utf-8") as out:
        def process_dir(current_dir, level):
            # Write a heading for the current directory (skip if we're at the root)
            if os.path.abspath(current_dir) != root_abs:
                dir_name = os.path.basename(current_dir)
                out.write(f"\n{'#' * level} {dir_name}\n\n")

            entries = sorted(os.listdir(current_dir))

            # Markdown files of this directory come first.
            for name in entries:
                file_path = os.path.join(current_dir, name)
                if not (name.endswith(".md") and os.path.isfile(file_path)):
                    continue
                if os.path.abspath(file_path) == output_abs:
                    # Reading the file we are currently writing would add a
                    # bogus heading and partial content.
                    continue
                # Use a heading level one deeper than the directory
                out.write(f"\n{'#' * (level + 1)} {name}\n\n")
                with open(file_path, "r", encoding="utf-8") as f:
                    out.write(f.read() + "\n\n")

            # Then descend into the sub-directories.
            for name in entries:
                sub_path = os.path.join(current_dir, name)
                if os.path.isdir(sub_path):
                    process_dir(sub_path, level + 1)

        process_dir(root_dir, 1)
    print(f"Combined markdown saved to {output_file}")

if __name__ == "__main__":
    # Mirror the documentation tree, starting from its landing page.
    seen_urls = set()
    crawl(BASE_URL, seen_urls)
    print("Download complete.")

    # Merge every per-page Markdown file into a single document.
    combine_markdown_files(OUTPUT_DIR, "combined_docs.md")