-
Notifications
You must be signed in to change notification settings - Fork 0
/
日语数据收集.py
47 lines (38 loc) · 1.41 KB
/
日语数据收集.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# %%
import os
import re
import json
# 要遍历的目录
notebook_dir_jp = r"D:\OneDrive\Notebook"
# 输出文件
output_file_jp = r"notebook_japanese.json"
# 定义一个函数来提取日语字符(含平假名、片假名、汉字)
# 日语字符范围示例:
# 平假名: \u3040-\u309F
# 片假名: \u30A0-\u30FF
# 汉字: \u4E00-\u9FAF (等同中文汉字范围)
jp_pattern = r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FAF]+'
def extract_japanese(text):
return ''.join(re.findall(jp_pattern, text))
data_jp = []
for root, dirs, files in os.walk(notebook_dir_jp):
for file in files:
if file.endswith(".md"):
file_path = os.path.join(root, file)
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
# 提取日语字符
content_japanese = extract_japanese(content)
# 存入字典
file_data = {
"file_name": file,
"content": content, # 原始内容
"content_japanese": content_japanese # 日语提取结果
}
data_jp.append(file_data)
# 将数据逐行写入json文件,每行一个JSON对象
with open(output_file_jp, 'w', encoding='utf-8') as f:
for item in data_jp:
json.dump(item, f, ensure_ascii=False)
f.write('\n')
print(f"日语数据已保存到 {output_file_jp},每行一个 JSON 对象。")