forked from marcelo-earth/generative-manim
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocs.py
More file actions
144 lines (122 loc) · 5.04 KB
/
Copy pathdocs.py
File metadata and controls
144 lines (122 loc) · 5.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import os
import re
import time
import requests
import html2text
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
# Root URL of the documentation to mirror. is_valid_url() only accepts links
# on this host whose path starts with this URL's path, and the crawl started
# from the __main__ block begins here.
BASE_URL = "https://docs.manim.community/en/stable/"
# Directory under which url_to_local_path() places the per-page Markdown files.
OUTPUT_DIR = "docs_md"
# NOTE: executed at import time, so merely importing this module creates the
# directory (no error if it already exists).
os.makedirs(OUTPUT_DIR, exist_ok=True)
# A single shared session so crawl() can reuse HTTP connections across requests.
session = requests.Session()
def is_valid_url(url):
    """
    Tell whether ``url`` points inside the documentation tree rooted at BASE_URL.

    A URL is accepted only when all of the following hold:
      * its scheme is ``http`` or ``https``;
      * its network location is exactly the one in BASE_URL;
      * its path begins with the path component of BASE_URL.

    Returns True when the URL is accepted, False otherwise.
    """
    candidate = urlparse(url)
    root = urlparse(BASE_URL)

    # Reject anything that is not a plain web link (mailto:, javascript:, ...).
    if candidate.scheme not in ("http", "https"):
        return False
    # Stay on the documentation host.
    if candidate.netloc != root.netloc:
        return False
    # Stay below the base path (e.g. do not wander into other doc versions).
    return candidate.path.startswith(root.path)
def url_to_local_path(url):
    """
    Convert a URL into a local file path that preserves the URL's folder structure.

    For example, a URL ending with:
        /_modules/manim/mobject/geometry/line.html
    will be saved as:
        docs_md/_modules/manim/mobject/geometry/line.html.md

    URLs that name a directory rather than a file (the base URL itself, or any
    path ending in "/") are mapped to an ``index.html.md`` inside that
    directory, so that no file is ever written with the bare name ``.md``.

    The path is obtained by slicing off ``len(base path)`` characters, so the
    caller is expected to pass only URLs that are_valid per is_valid_url();
    this function does not re-check that the URL lies under BASE_URL.

    Only the path component of the URL is used; query string and fragment are
    ignored.

    Returns the local path as a string, rooted at OUTPUT_DIR.
    """
    parsed = urlparse(url)
    base_path = urlparse(BASE_URL).path
    # Get the relative path after the base
    rel_path = parsed.path[len(base_path):].lstrip("/")
    # An empty path (the docs root) or a trailing slash denotes a directory
    # listing; store it as that directory's index page.
    if not rel_path or rel_path.endswith("/"):
        rel_path += "index.html"
    # Always append .md, even when the name already ends with .html, so the
    # original page name stays recognisable.
    return os.path.join(OUTPUT_DIR, rel_path) + ".md"
def convert_html_to_markdown(html_content):
    """
    Render an HTML string as Markdown text via html2text.

    A fresh converter is configured for every call: hyperlinks are kept in
    the output and automatic line wrapping is switched off.

    Returns the Markdown produced by the converter.
    """
    converter = html2text.HTML2Text()
    converter.body_width = 0  # 0 disables wrapping of long lines
    converter.ignore_links = False  # keep links in the generated Markdown
    markdown_text = converter.handle(html_content)
    return markdown_text
def _fetch_and_save_page(url):
    """
    Download one page, save it as Markdown, and return the links found on it.

    Returns a list of in-scope URLs (fragment removed, duplicates dropped, in
    document order). An empty list is returned when the page could not be
    fetched or is not textual content.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = session.get(url, timeout=30)
        response.raise_for_status()
    except Exception as e:
        # Deliberately broad: the crawl is best-effort, so a single failing
        # page is reported and skipped rather than aborting everything.
        print(f"Failed to get {url}: {e}")
        return []

    # Images, archives and similar binary downloads are linked from the docs
    # too; decoding them as text would only write garbage Markdown files.
    content_type = response.headers.get("Content-Type", "")
    if content_type and not content_type.lower().startswith(
        ("text/", "application/xhtml")
    ):
        print(f"Skipping non-text content ({content_type}) at {url}")
        return []

    html_content = response.text
    soup = BeautifulSoup(html_content, "html.parser")

    # Extract only the first element with class "content"
    content_div = soup.find(class_="content")
    if content_div:
        content_html = str(content_div)
    else:
        print(f"No content div found in {url}; using full page.")
        content_html = html_content
    markdown = convert_html_to_markdown(content_html)

    # Determine the local file path and ensure its directory exists
    local_path = url_to_local_path(url)
    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    with open(local_path, "w", encoding="utf-8") as f:
        f.write(markdown)
    print(f"Saved markdown to {local_path}")

    # Collect the links on the page, resolved against the page URL and with
    # any fragment identifier removed. dict.fromkeys drops duplicates while
    # keeping first-occurrence order.
    candidates = (
        urljoin(url, anchor["href"]).split("#")[0]
        for anchor in soup.find_all("a", href=True)
    )
    return [link for link in dict.fromkeys(candidates) if is_valid_url(link)]


def crawl(url, visited):
    """
    Crawl the documentation pages reachable from the given URL.

    Every page is fetched, converted to Markdown, written to the path given
    by url_to_local_path(), and then scanned for further links accepted by
    is_valid_url(). Pages are visited depth-first in document order.

    Parameters:
        url: the URL to start from.
        visited: a set of URLs that have already been processed. It is
            updated in place; URLs present in it are never fetched again.

    Returns None. Pages that fail to download are reported on stdout and
    skipped.
    """
    # An explicit stack replaces recursion: a recursive walk goes one call
    # frame deeper for every page it reaches through links, so a large,
    # heavily cross-linked site can exceed Python's recursion limit.
    pending = [url]
    first_request = True
    while pending:
        current = pending.pop()
        if current in visited:
            continue
        if not first_request:
            time.sleep(0.1)  # be polite with a short delay
        first_request = False

        print(f"Processing: {current}")
        visited.add(current)
        links = _fetch_and_save_page(current)

        # Push in reverse so the first link on the page is popped (and thus
        # crawled) first, matching a depth-first walk in document order.
        pending.extend(link for link in reversed(links) if link not in visited)
def combine_markdown_files(root_dir, output_file):
    """
    Recursively traverse root_dir and combine all .md files into one huge Markdown file.

    A heading structure (with '#' characters) is added based on the folder
    hierarchy: each sub-directory gets a heading whose level grows with its
    depth (direct children of root_dir use level 2), and each file gets a
    heading one level deeper than its directory, followed by the file's
    content. Within a directory, files are written first (sorted by name),
    then sub-directories (sorted by name). root_dir itself gets no heading.

    Parameters:
        root_dir: directory tree containing the .md files to merge.
        output_file: path of the combined file; it is overwritten. It may
            live inside root_dir, in which case it is excluded from the
            merge so the function never reads its own partial output.

    Returns None. Prints a confirmation line when done.
    """
    root_abs = os.path.abspath(root_dir)
    # Resolved once so the output file can be recognised while walking.
    output_real = os.path.realpath(output_file)

    def write_tree(out, current_dir, level):
        # Write a heading for the current directory (skip if we're at the root)
        if os.path.abspath(current_dir) != root_abs:
            out.write(f"\n{'#' * level} {os.path.basename(current_dir)}\n\n")

        # Classify the sorted directory entries in a single pass.
        subdirs = []
        md_names = []
        for name in sorted(os.listdir(current_dir)):
            path = os.path.join(current_dir, name)
            if os.path.isdir(path):
                subdirs.append(name)
            elif os.path.isfile(path) and name.endswith(".md"):
                md_names.append(name)

        # Markdown files of this directory come first ...
        for name in md_names:
            path = os.path.join(current_dir, name)
            if os.path.realpath(path) == output_real:
                # This is the file being written right now; skip it.
                continue
            # Use a heading level one deeper than the directory
            out.write(f"\n{'#' * (level + 1)} {name}\n\n")
            with open(path, "r", encoding="utf-8") as src:
                out.write(src.read() + "\n\n")

        # ... followed by the sub-directories, one level deeper.
        for name in subdirs:
            write_tree(out, os.path.join(current_dir, name), level + 1)

    with open(output_file, "w", encoding="utf-8") as out:
        write_tree(out, root_dir, 1)
    print(f"Combined markdown saved to {output_file}")
def main():
    """
    Run the full pipeline: mirror the docs, then merge them.

    Crawls everything reachable from BASE_URL into OUTPUT_DIR and afterwards
    concatenates the saved Markdown files into ``combined_docs.md`` in the
    current working directory.
    """
    crawl(BASE_URL, set())
    print("Download complete.")
    # With every page saved, fold the whole tree into one big Markdown file.
    combine_markdown_files(OUTPUT_DIR, "combined_docs.md")


if __name__ == "__main__":
    main()