-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdf_downloader.py
154 lines (122 loc) · 5.93 KB
/
pdf_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
from fake_useragent import UserAgent
import logging
import os
import shutil
import requests
import magic
import mimetypes
import requests
from lxml import html
from urllib.parse import urljoin
logger = logging.getLogger("unsc_db_filler")
class DownloadFailed(Exception):
pass
class PDFDownloader:
"""
This class is responsible for Downloading the PDFs holding meeting and resolution transcripts we need
to build our dataset.
**Update 03 July 2022**: The UN has changed how they're serving files to people in an 'interesting'
way. Downloading files goes through multiple 'redirects'. Not pure 302 responses, but actual 200 OKs,
with a <meta> tag refreshing the page to another page 1 second later.
Tracing the events in a browser also reveal that it's loggin in into a specific server using a username
and password (that apparently is not secret..the point of it is void as such), to receive a bunch of cookies
needed to ask the final PDF download.
"""
def __init__(self, path: str = "./") -> None:
self.path: str = path
def get_cookie_string(self) -> str:
"""
Return a cookie string to authenticate the PDF download against the UN.org data serving platform
:return: the cookie string
"""
uri = "https://documents-dds-ny.un.org/prod/ods_mother.nsf?Login&Username=freeods2&Password=1234"
# Do NOT follow the redirect..the cookies we need are returned in the 302!!
r = requests.get(uri, allow_redirects=False)
cookie_string = ""
for cookie, cookie_val in r.cookies.items():
cookie_string += f"{cookie}={cookie_val};"
return cookie_string
def test_for_meta_redirections(
self, r: requests.Response
) -> (bool, requests.Response):
"""
Test if in a HTTP response there are Meta-Refresh tags used.
:param r: the response to be evaluated
:return: (True, new URL) if there is a redirect in the response found, (False, None) if no redirects can be found any longer.
"""
# Credits to https://stackoverflow.com/questions/2318446/how-to-follow-meta-refreshes-in-python
mime = magic.from_buffer(r.content, mime=True)
extension = mimetypes.guess_extension(mime)
if extension == ".html":
html_tree = html.fromstring(r.text)
refresh_attribute = html_tree.xpath(
"//meta[translate(@http-equiv, 'REFSH', 'refsh') = 'refresh']/@content"
)
# In the UN their amazing meta-redirects, you can end up with a 200 OK that doesn't return a PDF.
# Instead, you can get a web page saying the document doesn't exist.
# The Digital Library personnel informed me that all documents need to be translated to multiple languages
# and this can take a few days... . As such
if len(refresh_attribute) == 0:
raise DownloadFailed(f"The download failed...is the document you're trying to fetch maybe not yet available?")
# example attr: ['1; URL=/tmp/6596179.00848389.html']
refresh_attribute = refresh_attribute[0]
wait, text = refresh_attribute.split(";")
text = text.strip()
if text.lower().startswith("url="):
url = text[4:]
if not url.startswith("http"):
# Relative URL, adapt
url = urljoin(r.url, url)
return True, url
return False, None
def follow_redirections(
self, r: requests.Response, s: requests.Session
) -> requests.Response:
"""
Recursive function that follows meta refresh redirections if they exist in a given HTTP response
:param r: The Response to be evaluated for meta refresh redirections
:param s: The Session to use
"""
redirected, url = self.test_for_meta_redirections(r)
if redirected:
cookie = self.get_cookie_string()
headers = {"Cookie": cookie}
r = self.follow_redirections(s.get(url, headers=headers), s)
return r
def download_pdf(self, what_to_fetch: str, uri: str = None) -> str:
"""
Downloads a resolution or meeting transcript, and saves it to disk.
It returns the filename written.
:param what_to_fetch: the original ID of the meeting or resolution to download
:param
:return: the filename written to disk
"""
file_to_write = what_to_fetch.replace(".", "_")
file_to_write = file_to_write.replace("/", "_")
if not os.path.exists(f"{self.path}"):
os.makedirs(f"{self.path}")
file_to_write = f"{self.path}/{file_to_write}"
if uri == None:
uri = f"http://www.undocs.org/en/{what_to_fetch}"
s = requests.Session()
r = s.get(uri, allow_redirects=True)
with self.follow_redirections(r, s) as r:
if r.status_code == 200:
# Of course... the UN now always returns a 200 OK... even if what we ask is nonsense...
# great....... So now we need to check if what they return is actually a PDF or not.
content_type = r.headers.get('Content-Type')
logger.info(f"Content-type for '{what_to_fetch}' is: %s", content_type)
if content_type != 'application/pdf':
raise DownloadFailed(
f"Unable to download {what_to_fetch}...we received content-type '{content_type}' instead of 'application/pdf'"
)
with open(file_to_write, "wb") as f:
f.write(r.content)
else:
logger.info(
f"Unable to download {what_to_fetch}...status code was {r.status_code}"
)
raise DownloadFailed(
f"Unable to download {what_to_fetch}...status code was {r.status_code}"
)
return file_to_write