-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhtml_downloader.py
150 lines (121 loc) · 5.85 KB
/
html_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import logging
import requests
from download_job import DownloadJob
from job_queue import JobQueue
logger = logging.getLogger("unsc_db_filler")
class DownloadFailed(Exception):
pass
class HTMLDownloader:
"""
This class is responsible for Downloading the UNSC pages we need to build our dataset.
They are the general Meeting pages, and the Veto page.
This class uses a Job Queue since the un.org web servers frequently give random errors.
"""
def __init__(self, path: str = "./", since: int = 1946, until: int = 2021) -> None:
self.path = path
self.since = since
self.until = until
self.job_queue = JobQueue()
def fetch_meeting_tables(self) -> None:
"""
Fetches all the UN Meeting tables webpages between the year `self.since`
and `self.until` to the location `self.path`.
:param since: The first year to fetch the tables for
:param until: The last year to fetch the tables for
:return:
"""
for year in range(self.since, self.until):
# Some pages on the UN website end with .html, others with .htm...
# it seems that prior to the year 1994, pages are .html, after that, .htm.
# Since we do not want to check all these pages individually, we'll just try
# and go to the .html version, and if that fails go to .htm
if (
year == 2022
): # TODO: UN made it again inconsistent... we can query https://research.un.org/en/docs/sc/quick/meetings/2022
# and query the iframe in which this page is rendered to figure out what the real url is we need to grab.
url = f"https://www.un.org/depts/dhl/resguide/scact{year}_table_en.html"
elif year < 1994:
url = f"https://www.un.org/depts/dhl/resguide/scact{year}_table_en.html"
else:
url = f"https://www.un.org/depts/dhl/resguide/scact{year}_table_en.htm"
# Originally we just tried html. If it failed, try .htm.
# That trick was not reliable..the UN sometimes has both .html and .html..
# but one is then incomplete!
#
# if res.status_code == 404:
# logger.info("Failed to fetch .html, retrying .htm...")
# url = f"https://www.un.org/depts/dhl/resguide/scact{year}_table_en.htm"
# res = fetch_url(url)
#
# Leaving in this comment as 'historical software engineering story telling'
# https://www.youtube.com/watch?v=4PaWFYm0kEw&t=48s
download_job = DownloadJob(
url=url,
dest_file=f"scact{year}_table_en.html",
description=f"UNSC Meeting Table for Year `{year}`",
)
self.job_queue.enqueue(download_job)
# In 2020 the UN started to document Covid remote meetings in another page.
# Half their page was in the previous pages, half their page is on the new page:
# https://www.un.org/depts/dhl/resguide/SC_2020-revised.html
#
# We create another downloadjob for 2020:
if year == 2020:
download_job = DownloadJob(
url="https://www.un.org/depts/dhl/resguide/SC_2020-revised.html",
dest_file=f"scact2020_covid_table_en.html",
description=f"UNSC Meeting Covid Table for Year `{year}`",
)
self.job_queue.enqueue(download_job)
self.job_queue.process(self.download)
def fetch_veto_table(self) -> None:
"""
Enqueues the download job for downloading the veto table from the UNSC website,
and starts processing it
"""
url = "https://www.un.org/depts/dhl/resguide/scact_veto_table_en.htm"
download_job = DownloadJob(
url=url,
dest_file=f"scact_veto_table_en.html",
description=f"UNSC Veto Table",
)
self.job_queue.enqueue(download_job)
self.job_queue.process(self.download)
def download(self, job: DownloadJob) -> None:
"""
Downloads the web page requested in a given DownloadJob
and writes it to disk as the file specified in the DownloadJob
This method is passed to the Job Queue process function to do the real heavy lifting
:param job: the DownloadJob we use to download the real data we need
"""
logger.info("Downloading %s - BEGIN", job.info())
# No try..except here.. let it crash...
# the job queue processing jobs needs to know when it failed
res = self.fetch_url(job.url)
logger.info("Downloading %s - END", job.info())
if res.status_code == 200:
self.write_to_disk(content=res.text, file=job.dest_file)
else:
raise DownloadFailed(
f"Unable to download the html file for {job.dest_file}...status code was {res.status_code}"
)
def fetch_url(self, url: str) -> requests.Response:
"""
Fetch a given url and return a requests Response
:param url: the URL we want to fetch
:return: the requests Response
"""
logger.info("Fetching %s", url)
return requests.get(url)
def write_to_disk(self, content: str, file: str) -> None:
"""
Writes a given content for a given year, to a file called
scact<year>_table_en.html in the SCRATCH_FOLDER location.
:param content: The content that needs to be written to file
:param file: the filename in which the content will be written
"""
output_file = f"{self.path}/{file}"
logger.info("Writing content to file '%s' BEGIN", output_file)
with open(output_file, "w") as f:
f.write(content)
logger.info("Writing content to file '%s' END", output_file)