-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_ncode.py
executable file
·132 lines (98 loc) · 4.04 KB
/
get_ncode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/python
import os
import sys
import re
import requests
import glob
from bs4 import BeautifulSoup
import json
def get_novel_data(main_url):
    """Fetch a novel's index page and return its episode count and title.

    Args:
        main_url: URL of the novel's index page.

    Returns:
        A ``(total_chapters, novel_title)`` tuple. ``total_chapters`` is the
        number of ``div.p-eplist__sublist`` elements found on the page.
        ``(0, '')`` is returned when no such element exists. The title is
        ``''`` when the page has no ``h1.p-novel__title`` element.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    # Bound the request so an unresponsive server cannot hang the script.
    response = requests.get(main_url, headers=headers, timeout=30)
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "html.parser")

    total_chapters = len(soup.find_all("div", class_="p-eplist__sublist"))
    if total_chapters == 0:
        return 0, ''

    # Guard against a missing title element instead of failing with
    # AttributeError on ``None.text``.
    title_tag = soup.find("h1", class_="p-novel__title")
    novel_title = title_tag.text if title_tag is not None else ''
    return total_chapters, novel_title
def is_valid_directory_name(directory_name):
    """Return True if *directory_name* is usable as a directory name.

    A name is accepted when it is non-empty and contains none of the
    reserved characters: backslash, slash, colon, asterisk, question mark,
    double quote, less-than, greater-than or vertical bar.

    Args:
        directory_name: Candidate directory name.

    Returns:
        ``True`` when the name is acceptable, ``False`` otherwise.
    """
    # The backslash is doubled so the class really excludes it; a single
    # backslash before the slash would only act as an escape for the slash.
    pattern = r'[^\\/:*?"<>|]+'
    return re.fullmatch(pattern, directory_name) is not None
def is_valid_file_name(file_name):
    """Return True if *file_name* is a usable ``.txt`` file name.

    A name is accepted when it has at least one character before the
    ``.txt`` suffix and contains none of the reserved characters: backslash,
    slash, colon, asterisk, question mark, double quote, less-than,
    greater-than or vertical bar.

    Args:
        file_name: Candidate file name, including the ``.txt`` extension.

    Returns:
        ``True`` when the name is acceptable, ``False`` otherwise.
    """
    # The backslash is doubled so the class really excludes it; a single
    # backslash before the slash would only act as an escape for the slash.
    # fullmatch (rather than match + ``$``) also rejects a trailing newline
    # after the extension.
    pattern = r'[^\\/:*?"<>|]+\.txt'
    return re.fullmatch(pattern, file_name) is not None
def fix_file_name(file_name):
    """Strip reserved characters from *file_name* and ensure a ``.txt`` suffix.

    The removed characters are backslash, slash, colon, asterisk, question
    mark, double quote, less-than, greater-than and vertical bar.

    Args:
        file_name: File name to sanitise, with or without ``.txt``.

    Returns:
        The sanitised name, always ending in ``.txt``.
    """
    # The backslash is doubled so it is actually removed; a single backslash
    # before the slash would only act as an escape for the slash.
    pattern = r'[\\/:*?"<>|]+'
    fixed_file_name = re.sub(pattern, '', file_name)
    if fixed_file_name.endswith('.txt'):
        return fixed_file_name
    return fixed_file_name + '.txt'
def add_chapter_number(file_name, chapter_number, padding_width):
    """Prefix *file_name* with a zero-padded chapter number.

    Args:
        file_name: Base file name, with or without a ``.txt`` suffix.
        chapter_number: Chapter number placed in front of the name.
        padding_width: Minimum width the number is zero-filled to.

    Returns:
        ``"<padded number>_<file_name>"``, with ``.txt`` appended when the
        result does not already end in it.
    """
    prefix = str(chapter_number).zfill(padding_width)
    numbered = "{}_{}".format(prefix, file_name)
    if numbered.endswith('.txt'):
        return numbered
    return numbered + '.txt'
def check_directory(directory_name):
    """Create *directory_name* unless it already exists or is not a valid name.

    The outcome (already present, invalid name, created, or creation error)
    is reported on standard output; nothing is returned and no exception is
    raised for an ``OSError`` during creation.

    Args:
        directory_name: Path of the directory to create.
    """
    if os.path.exists(directory_name):
        print(f"目录 {directory_name} 已存在")
    elif not is_valid_directory_name(directory_name):
        print("目录名称不符合规范")
    else:
        try:
            os.makedirs(directory_name)
            print(f"目录 {directory_name} 创建成功")
        except OSError as error:
            print(f"创建目录 {directory_name} 时出错: {error}")
def _record_novel_title(novel_dir, novel_title, index_path="./title.json"):
    """Store *novel_title* under the key *novel_dir* in the JSON title index."""
    try:
        with open(index_path, 'r', encoding="utf-8") as f:
            titles = json.load(f)
    except FileNotFoundError:
        # First run: start a fresh index instead of crashing.
        titles = {}
    titles[novel_dir] = novel_title
    # Rewrite the whole file; writing over the old content in place without
    # truncating would leave a stale tail whenever the new JSON is shorter.
    with open(index_path, 'w', encoding="utf-8") as f:
        json.dump(titles, f, ensure_ascii=False)


def main(novel_dir):
    """Download the chapters of a novel into a directory named after it.

    The novel's title is recorded in ``./title.json``. Chapter pages
    ``<main_url>/1/`` up to ``<main_url>/10000/`` are then fetched in order
    and each is saved as ``<novel_dir>/<4-digit number>_<title>.txt``.
    Chapters that already have a file on disk are skipped without being
    fetched.

    Args:
        novel_dir: Path component appended to ``https://ncode.syosetu.com/``;
            also used as the output directory and as the key in
            ``./title.json``.

    Returns:
        ``200`` when the index page lists no chapters (nothing is
        downloaded), ``400`` once the chapter loop has ended.

    Raises:
        requests.exceptions.RequestException: If a request fails or
            times out.
    """
    main_url = f"https://ncode.syosetu.com/{novel_dir}"
    total_chapters, novel_title = get_novel_data(main_url)
    if total_chapters == 0:
        return 200

    check_directory(novel_dir)
    _record_novel_title(novel_dir, novel_title)

    # One width for both the "already downloaded" lookup and the saved file
    # name, so the two can never disagree.
    padding_width = 4
    start_chapter, end_chapter = 1, 10000
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    for index in range(start_chapter, end_chapter + 1):
        chapter_prefix = str(index).zfill(padding_width)
        # Skip chapters that are already on disk before touching the network.
        if glob.glob(f"./{novel_dir}/{chapter_prefix}_*.txt"):
            continue

        chapter_url = f"{main_url}/{index}/"
        # Bound the request so an unresponsive server cannot hang the script.
        response = requests.get(chapter_url, headers=headers, timeout=30)
        response.encoding = "utf-8"
        soup = BeautifulSoup(response.text, "html.parser")

        # A ``div.nothing`` element ends the download (presumably the site's
        # page for a chapter that does not exist).
        if soup.find("div", class_="nothing") is not None:
            break

        title_tag = soup.find("h1", class_='p-novel__title')
        chapter_content = soup.find("div", class_='p-novel__body')
        if title_tag is None or chapter_content is None:
            # Not a chapter page (error page or changed layout): report it
            # and stop rather than fail with AttributeError.
            print(f"章节 {index} 页面结构异常,停止下载: {chapter_url}", file=sys.stderr)
            break

        chapter_title = title_tag.text.strip()
        # Cap the title at 50 characters to keep file names short.
        file_name = f"{chapter_title[:50]}.txt"
        if not is_valid_file_name(file_name):
            file_name = fix_file_name(file_name)
        file_name = add_chapter_number(file_name, index, padding_width)

        filename = f"{novel_dir}/{file_name}"
        if os.path.exists(filename):
            continue
        with open(filename, "w", encoding="utf-8") as file:
            file.write(chapter_title + "\n")
            for paragraph in chapter_content.find_all("p"):
                file.write(paragraph.text + "\n")
    return 400
if __name__ == '__main__':
    # Exit with a usage message instead of an IndexError traceback when the
    # novel code argument is missing.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <novel_dir>")
    main(sys.argv[1])