-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWiki_deaths.py
88 lines (63 loc) · 2.24 KB
/
Wiki_deaths.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import wikipediaapi
import requests
import json
def main():
    """Scrape Wikipedia's monthly death lists and dump them to a JSON file.

    For every month of every year from 1992 through 2022 the page
    ``Deaths_in_<month>_<year>`` is fetched, its text is handed to
    ``parse_page``, and the result is stored under
    ``tot_data[year][month]``.  The complete mapping is finally written to
    ``Death_File.json`` in the current working directory.

    Note that ``json.dump`` serialises the integer year and day keys as
    strings.
    """
    # NOTE(review): recent Wikipedia-API releases appear to require a
    # ``user_agent`` argument here -- confirm against the installed version.
    wiki = wikipediaapi.Wikipedia(language='en')
    months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
    years = list(range(1992, 2023))

    # Pass the full list of months.  The original passed ``month``, which is
    # only assigned by the loop below, so this call raised UnboundLocalError
    # before a single page was processed.
    check_pages(wiki, years, months)

    tot_data = {}
    for year in years:
        tot_data[year] = {}
        for month in months:
            print("processing", year, month)
            page = wiki.page(f'Deaths_in_{month}_{str(year)}')
            parsed_page = parse_page(page.text, month, years)
            tot_data[year][month] = parsed_page
            print("Added",sum(len(v) for v in parsed_page.values()),"deaths")

    # create file and write on it
    with open("Death_File.json", "w") as file_json:
        json.dump(tot_data, file_json, indent=3)
# check existence
def check_pages(wiki, years, months):
    """Check that a ``Deaths_in_<month>_<year>`` page exists for each month.

    Args:
        wiki: Object exposing ``page(name)``; the returned page must provide
            ``exists()`` and a ``title`` attribute.
        years: Iterable of years to check.
        months: Iterable of month names to check for every year.

    Returns:
        A list with one ``exists()`` result per page, ordered year by year
        and, within a year, in the order of ``months``.  The title of every
        page is printed as it is checked.
    """
    page_names = [
        f'Deaths_in_{month}_{str(year)}'
        for year in years
        for month in months
    ]

    existence = []
    for name in page_names:
        page_py = wiki.page(name)
        existence.append(page_py.exists())
        print(page_py.title)

    # The results used to be collected and then thrown away; returning them
    # lets callers act on missing pages.  Callers that ignore the return
    # value are unaffected.
    return existence
def parse_page(text, months, years):
    """Group the entries of one monthly death-list page by day of the month.

    The text is scanned line by line.  A line consisting solely of the next
    expected day number (``"1"``, then ``"2"``, ...) opens that day.  Every
    later non-empty line that does not start with ``"="`` is passed to
    ``parse_line`` and, when that returns a record, appended to the most
    recently opened day.  Lines before the first day marker are ignored.

    Args:
        text: Plain text of the page.
        months: Not used by this function.
        years: Not used by this function.

    Returns:
        A dict mapping each day number found to the list of records parsed
        for that day.
    """
    deaths_by_day = {}
    next_day = 1
    current_entries = None  # stays None until the first day marker is seen

    for row in text.split("\n"):
        if row == str(next_day):
            current_entries = []
            deaths_by_day[next_day] = current_entries
            next_day += 1
            continue

        # Skip anything before day 1, blank lines and "="-prefixed lines.
        if current_entries is None or row == "" or row.startswith("="):
            continue

        record = parse_line(row)
        if record is not None:
            current_entries.append(record)

    return deaths_by_day
def parse_line(line):
    """Parse one death-list entry of the form ``"Name, Age, info, ..."``.

    Leading and trailing periods are stripped and the line is split on
    ``", "``.  The age is the first field after the first one that converts
    with ``int()``; every field before it is joined back into the name, so
    names that themselves contain ``", "`` (for example
    ``"Alexander Thynn, 7th Marquess of Bath, 87, ..."``) are kept instead
    of being discarded.

    Args:
        line: A single line of page text.

    Returns:
        A dict with keys ``"Name"`` (str), ``"Age"`` (int) and
        ``"Other_info"`` (list of the remaining fields), or ``None`` when
        the line has fewer than two fields or no integer age field.
    """
    fields = line.strip(".").split(", ")
    if len(fields) < 2:
        return None

    # Field 0 always belongs to the name; look for the age from field 1 on.
    for index, field in enumerate(fields[1:], start=1):
        try:
            age = int(field)
        except ValueError:
            continue
        return {
            "Name": ", ".join(fields[:index]),
            "Age": age,
            "Other_info": fields[index + 1:],
        }

    return None
def pprint(obj):
    """Print ``obj`` as JSON indented by three spaces."""
    rendered = json.dumps(obj, indent=3)
    print(rendered)
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()