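"""Fetch archived CIA World Factbook country pages from the Wayback Machine.

Summary of what the code below does: for each country page it walks backwards
week by week (Monday to Monday) from today down to 2007, downloads the closest
earlier snapshot for each week, and caches the raw HTML on disk. Cache
locations are read from config.json.
"""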
from bs4 import BeautifulSoup
import datetime
import json
import os
import requests
import time
import urllib.parse
def writeStdout(s):
    print(s)
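# Parse a saved factbook page and return the country page filenames listed in
# its <select> dropdown; e.g. an option value ending in "geos/af.html" yields
# "af.html" (the example value is illustrative).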
def countryListForFile(filelocation):
    countryPages = []
    f = open(filelocation)
    content = f.read()
    f.close()
    soup = BeautifulSoup(content, 'html.parser')
    countryEls = soup.select("select option")
    for countryEl in countryEls:
        if "value" in countryEl.attrs and countryEl.attrs["value"]:
            page = countryEl.attrs["value"].split("/")[-1]
            if not page.endswith(".html"):
                continue
            countryPages.append(page)
    return countryPages
# Saves the current world factbook as raw html
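# %% escapes a literal percent sign, so e.g. ajaxUrl % ("xx", 2020) expands to:
# https://web.archive.org/__wb/calendarcaptures?url=https%3A%2F%2Fwww.cia.gov%2Flibrary%2Fpublications%2Fthe-world-factbook%2Fgeos%2Fxx.html&selected_year=2020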
ajaxUrl = "https://web.archive.org/__wb/calendarcaptures?url=https%%3A%%2F%%2Fwww.cia.gov%%2Flibrary%%2Fpublications%%2Fthe-world-factbook%%2Fgeos%%2F%s.html&selected_year=%s"
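# config.json is expected to provide three directory paths (the values shown
# here are illustrative, only the keys are required):
# {
#     "country_html_root": "data/html",
#     "country_html_blacklist": "data/blacklist",
#     "country_html_yearly_summaries": "data/yearly_summaries"
# }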
f = open("config.json")
configStr = f.read()
f.close()
config = json.loads(configStr)
dstRoot = config["country_html_root"]
blacklistRoot = config["country_html_blacklist"]
archiveRoot = config["country_html_yearly_summaries"]
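# Throttle requests to web.archive.org: pause a couple of seconds after each
# successful fetch, and back off for longer after an error (values in seconds).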
goodCitizenDelay = 2
errorDelay = 6
yearForToday = datetime.datetime.utcnow().year
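# The cached calendarcaptures response is parsed below as nested lists of
# months -> weeks -> days, where a day with captures looks roughly like
# {"ts": [20200106123456, ...], "st": [200, ...]} (timestamps and HTTP
# statuses). Snapshot links are built as
# https://web.archive.org/web/<timestamp>/<factbook url>, so the timestamp is
# path segment 4 and its first 8 characters are the YYYYMMDD capture date.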
def getPage(pageFilename):
    # start from today's date and year and work backwards
    currentYear = datetime.datetime.utcnow().year
    currentDate = datetime.datetime.utcnow().date()
    minYear = 2007
    while currentYear >= minYear:
        pageCode = pageFilename.replace(".html", "")
        yearlySummaryUrl = ajaxUrl % (pageCode, currentYear)
        # if year is this year, remove yearly summary from local cache
        if currentYear == yearForToday:
            removeArchiveYearlySummary(yearlySummaryUrl)
        # get the archive history for that year
        yearlySummaryContent = saveArchiveYearlySummary(yearlySummaryUrl)
        if yearlySummaryContent is None:
            currentDate = datetime.datetime(currentDate.year - 1, 12, 31).date()
            continue
        data = json.loads(yearlySummaryContent)
        # get links to archive pages
        # specifically get only
        # * https pages - http links redirect with no content
        # * from the popup on the date (not the date itself)
        # * the earliest time from the popup
        links = []
        for month in data:
            for week in month:
                for day in week:
                    if day and "ts" in day:
                        for i, st in enumerate(day["st"]):
                            if st != 200:
                                continue
                            url = "https://web.archive.org/web/%s/https://www.cia.gov/library/publications/the-world-factbook/geos/%s.html"
                            ts = day["ts"][i]
                            link = url % (ts, pageCode)
                            links.append(link)
                            break
        # iterate backward through the year getting archive files
        while currentDate.year == currentYear:
            # look for previous monday in archive
            currentDate = getPrevMonday(currentDate)
            # find file that's most recent and before this date
            latestLinkDate = None
            latestLinkHref = None
            for link in links:
                linkHref = link
                dateStr = linkHref.split("/")[4][:8]
                linkDate = datetime.datetime.strptime(dateStr, "%Y%m%d").date()
                isEarlierThanPrevMonday = linkDate <= currentDate
                isLaterThanLatestDate = latestLinkDate is None or linkDate > latestLinkDate
                if isEarlierThanPrevMonday and isLaterThanLatestDate:
                    latestLinkDate = linkDate
                    latestLinkHref = linkHref
            if latestLinkHref is not None:
                # save file for this page
                pageContent = savePageForUrl(latestLinkHref, latestLinkDate)
            # ensure next fetch is prior to this file date
            if latestLinkDate is not None:
                currentDate = latestLinkDate
        # set year to previous year
        currentYear = currentDate.year
def yearlySummaryFilenameForUrl(url):
    pageFilename = urlToFilename(url)
    dstFilename = os.path.join(archiveRoot, pageFilename)
    return dstFilename

def urlToFilename(s):
    return urllib.parse.quote(s, safe='')
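# urlToFilename percent-encodes everything (safe=''), so a whole URL becomes a
# single flat filename, e.g.
# urlToFilename("https://a/b.html") -> "https%3A%2F%2Fa%2Fb.html"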
def removeArchiveYearlySummary(url):
    filename = yearlySummaryFilenameForUrl(url)
    # only remove if created more than 3 days ago
    if not os.path.isfile(filename):
        return
    modified = os.path.getmtime(filename)
    cacheExpiry = int(time.time()) - 3 * 24 * 60 * 60  # 3 days
    if modified < cacheExpiry:
        writeStdout("Removing outdated yearly summary: %s" % filename)
        os.remove(filename)
    else:
        writeStdout("Using recently cached yearly summary: %s" % filename)
def saveArchiveYearlySummary(url):
    # create the filename for this page
    dstFilename = yearlySummaryFilenameForUrl(url)
    # create the directory to store this page
    os.makedirs(archiveRoot, exist_ok=True)
    if not os.path.isfile(dstFilename):
        print("Fetching", url)
        try:
            r = requests.get(url)
            yearlySummaryContent = r.text
        except:
            yearlySummaryContent = None
        while shouldRetry(yearlySummaryContent):
            writeStdout("Error getting yearly summary, sleeping %s seconds" % errorDelay)
            time.sleep(errorDelay)
            print("Fetching", url)
            try:
                r = requests.get(url)
                yearlySummaryContent = r.text
            except:
                yearlySummaryContent = None
        f = open(dstFilename, 'w')
        f.write(yearlySummaryContent)
        f.close()
        time.sleep(goodCitizenDelay)
    else:
        print("Reading", url)
        f = open(dstFilename)
        yearlySummaryContent = f.read()
        f.close()
    return yearlySummaryContent
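# savePageForUrl caches each snapshot under <dstRoot>/<YYYY-MM-DD>/<encoded url>;
# pages matching a blacklist phrase are stored under <blacklistRoot> instead,
# and anything already on disk is read back rather than refetched.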
def savePageForUrl(url, date):
    # get the filename for the page
    pageFilename = None
    bits = url.split("?")[0].split("/")
    bits.reverse()
    for bit in bits:
        if bit.endswith(".html"):
            pageFilename = bit
            break
    if pageFilename is None:
        print("Not saving blank page for", url)
        return
    # Create the directory for this date to store this set of pages
    dateStr = date.isoformat()
    dstDir = os.path.join(dstRoot, dateStr)
    os.makedirs(dstDir, exist_ok=True)
    blacklistDir = os.path.join(blacklistRoot, dateStr)
    # Create the filename for this page
    pageFilename = urlToFilename(url)
    dstFilename = os.path.join(dstDir, pageFilename)
    # Prepare the blacklist filename in case the page is blacklisted
    dstBlacklist = os.path.join(blacklistDir, pageFilename)
    # Fetch it if required
    if not os.path.isfile(dstFilename) and not os.path.isfile(dstBlacklist):
        needsRetry = True
        while needsRetry:
            print("Fetching %s" % url)
            try:
                r = requests.get(url)
                content = r.text
            except:
                content = None
            # decide whether or not to retry fetching this file based on content
            needsRetry = shouldRetry(content)
            if needsRetry:
                writeStdout("Error getting page, sleeping %s seconds" % errorDelay)
                time.sleep(errorDelay)
        # decide whether or not to blacklist this file based on content
        if shouldBlacklist(content):
            os.makedirs(blacklistDir, exist_ok=True)
            dstFilename = dstBlacklist
            print("Blacklisting %s" % url)
        # save the page content
        f = open(dstFilename, 'w')
        f.write(content)
        f.close()
        time.sleep(goodCitizenDelay)
    else:
        if os.path.isfile(dstFilename):
            print("Reading %s" % url)
            f = open(dstFilename)
            content = f.read()
            f.close()
        elif os.path.isfile(dstBlacklist):
            print("Blacklisted %s" % url)
            f = open(dstBlacklist)
            content = f.read()
            f.close()
    return content
def shouldRetry(content):
    if content is None:
        return True
    # detect illegal strings
    mustNotInclude = [
        "504 Gateway Time-out",
    ]
    needsRetry = False
    for s in mustNotInclude:
        foundForbidden = content.find(s) > -1
        if foundForbidden:
            print("Found forbidden retry phrase: %s" % s)
        needsRetry = needsRetry or foundForbidden
    return needsRetry
# some files have no content, in which case they should be blacklisted
def shouldBlacklist(content):
    # detect illegal strings
    mustNotInclude = [
        "HTTP 301",
        "404 Not Found",
        "404 - Not Found",
        "Access Denied",
        "meta http-equiv=\"refresh\"",
        "Connection Failure",
        "Connection Timeout",
        "coldfusion.bootstrap",
    ]
    isBlacklisted = False
    for s in mustNotInclude:
        foundForbidden = content.find(s) > -1
        if foundForbidden:
            print("Found forbidden blacklist phrase: %s" % s)
        isBlacklisted = isBlacklisted or foundForbidden
    # detect missing strings
    mustInclude = [
        # "<!-- InstanceEnd -->",
    ]
    for s in mustInclude:
        missingRequired = content.find(s) == -1
        if missingRequired:
            print("Missing required blacklist phrase: %s" % s)
        isBlacklisted = isBlacklisted or missingRequired
    return isBlacklisted
def getPrevMonday(d):
    daysAfterPrevMonday = (d.isoweekday() - 1) % 7
    if daysAfterPrevMonday == 0:
        daysAfterPrevMonday = 7
    return d - datetime.timedelta(days=daysAfterPrevMonday)
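# e.g. getPrevMonday(datetime.date(2020, 1, 15)) -> datetime.date(2020, 1, 13);
# when d is already a Monday, the Monday a full week earlier is returned.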
worldPage = "xx.html"
worldContent = getPage(worldPage)
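# getPage saved the world snapshots under dated directories in dstRoot; scan
# those directories for the world page to build the country list.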
print("Getting country list")
# get all other countries from all world pages
# could use set instead of list, but want to preserve order
countryPages = []
for dirDate in os.listdir(dstRoot):
    worldDir = os.path.join(dstRoot, dirDate)
    possibleFiles = os.listdir(worldDir)
    worldFiles = [x for x in possibleFiles if x.endswith(worldPage)]
    if len(worldFiles) > 1:
        print("WARNING found multiple world files in", worldDir, worldFiles)
    if len(worldFiles) == 1:
        # get list of countries for the world page on this date
        worldFile = os.path.join(worldDir, worldFiles[0])
        countries = countryListForFile(worldFile)
        # add any new countries to countryPages
        for country in countries:
            if country not in countryPages:
                countryPages.append(country)

# don't repeat fetch for the world
countryPages.remove(worldPage)
# don't fetch Baker Island
countryPages.remove("fq.html")

# fetch the historical files for the rest of the countries
print("Parsing %s countries" % len(countryPages))
for i, countryPage in enumerate(countryPages):
    print("Parsing %s of %s" % ((i + 1), len(countryPages)))
    getPage(countryPage)