-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwikipedia_client.py
31 lines (29 loc) · 1.09 KB
/
wikipedia_client.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import logging
import os.path
try:
    import xml.etree.cElementTree as ET
except ImportError:  # cElementTree was removed in Python 3.9
    import xml.etree.ElementTree as ET

import wikipedia
def download_articles(page_titles_list, dump_folder_path, lang="en"):
    """Download Wikipedia articles and dump each one to its own XML file.

    Each page is fetched through the ``wikipedia`` package and written to
    ``<dump_folder_path>/<sanitized title>.xml``. The file holds a
    ``documents`` root with a single ``doc`` child: its ``title`` attribute
    is the page title reported by Wikipedia and its text is the page content.

    Titles whose dump file already exists are skipped. A title that cannot
    be fetched or written is reported and skipped, so one failure does not
    abort the rest of the batch.

    Args:
        page_titles_list: Iterable of article titles to download.
        dump_folder_path: Directory that receives the dump files. It is not
            created by this function.
        lang: Language edition passed to ``wikipedia.set_lang``.

    Returns:
        List of the page titles (as reported by Wikipedia) that were saved.
    """
    logging.info("Articles download start")
    wikipedia.set_lang(lang)
    extracted_pages_titles = []
    for title in page_titles_list:
        # Strip non-ASCII characters, then decode back to text: on Python 3
        # ``encode`` returns bytes, and bytes.replace(" ", "_") would raise.
        ascii_title = title.encode('ascii', errors='ignore').decode('ascii')
        # "/" would turn the file name into a nested path, so flatten it too.
        filename = ascii_title.replace(" ", "_").replace("/", "_") + ".xml"
        # os.path.join works whether or not the folder has a trailing separator.
        dump_file_path = os.path.join(dump_folder_path, filename)
        if os.path.isfile(dump_file_path):
            logging.error(title + ": Dump file already exists")
            continue
        try:
            page = wikipedia.page(title)
            # Build the whole tree before touching the disk: a failure while
            # fetching the content must not leave an empty dump behind, since
            # the "already exists" check would then block every later retry.
            root = ET.Element("documents")
            doc = ET.SubElement(root, "doc")
            doc.set('title', page.title)
            doc.text = page.content
            # ElementTree.write emits bytes with its default encoding, so the
            # file is opened in binary mode; ``with`` guarantees it is closed.
            with open(dump_file_path, 'wb') as dump_file:
                ET.ElementTree(root).write(dump_file)
            extracted_pages_titles.append(page.title)
        except Exception:
            # Best effort per title: report and move on. ``Exception`` (not a
            # bare except) lets KeyboardInterrupt/SystemExit propagate.
            print(title + " page not saved")
            logging.exception("%s page not saved", title)
    logging.info("Articles download end")
    return extracted_pages_titles