-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy paths1_download.py
66 lines (55 loc) · 2.09 KB
/
s1_download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#Name: s1_download.py
#Purpose: Download PDFs discovered during web crawling
#Invocation: python3 s1_download.py <projName>
import codecs
import csv
import os
import re
import subprocess
import sys
#Name: valid_arguments
#Purpose: Check whether the command-line arguments are valid
#         (exactly one argument, the project name, which must start with a
#         letter and contain only letters, underscores, and hyphens)
#Parameters: sys.argv (globally defined list of command-line arguments)
#Returns: True (all arguments are valid) or False (at least one argument is invalid)
def valid_arguments():
    if len(sys.argv) != 2:
        return False
    #fullmatch is used instead of search with "^...$" because "$" also matches
    #just before a trailing newline, which would let a name such as "proj\n"
    #through; the project name is later placed into paths and a command line
    return re.fullmatch(r"[a-zA-Z][a-zA-Z_-]*", sys.argv[1]) is not None
#Name: is_pdf
#Purpose: Determine whether the URL points to a PDF, judging by either a
#         ".pdf" extension (any letter case) at the end of the URL or a
#         "Content-Type:application/pdf" entry in the metadata
#Parameters: url (None or an empty string is treated as "no URL")
#            metadata (None or an empty string is treated as "no metadata")
#Returns: Regular expression match object if either check succeeds (the URL
#         match takes precedence), otherwise None
def is_pdf(url, metadata):
    #A short CSV row makes csv.DictReader supply None for the missing fields;
    #substitute an empty string so re.search does not raise TypeError
    urlMatch = re.search(r"^(\S+)\.([pP][dD][fF])$", url or "")
    if urlMatch:
        #The extension already identifies a PDF, so the metadata is not consulted
        return urlMatch
    return re.search(r"Content-Type:application/pdf", metadata or "")
#Name: download_pdf
#Purpose: Download the PDF into /<projName>/download/ using wget
#Parameters: url (URL of the PDF; passed to wget as a single argument)
#            projName (project name)
#Returns: None (a failed download is reported by wget itself and is not raised)
def download_pdf(url, projName):
    #NOTE(review): "[email protected]" looks like a placeholder left by e-mail
    #obfuscation rather than a real contact address -- confirm the intended value
    userAgent = (
        "SABLE (U.S. Census Bureau research to find alternative data sources "
        "and reduce respondent burden) https://github.com/uscensusbureau/sable/; "
        "[email protected]; For more information, go to www.census.gov/scraping/"
    )
    #Use the Linux/Unix utility wget to download the PDF.  The command is built
    #as an argument list and run without a shell, so characters such as "&",
    #";", "$", or quotes inside a crawled URL cannot split or alter the command
    command = [
        "wget",
        "--no-check-certificate",
        "-nv",
        "--user-agent=" + userAgent,
        "-P",
        "/{}/download/".format(projName),
        "--",  #end of options: the URL is never interpreted as a wget flag
        url,
    ]
    try:
        #check=False keeps the best-effort behavior: one failed download does
        #not stop the remaining ones
        subprocess.run(command, check=False)
    except OSError as err:
        #Raised when wget itself cannot be started (for example, not installed)
        print("Unable to run wget for {}: {}".format(url, err), file=sys.stderr)
    return
#Name: download_pdfs
#Purpose: Download PDFs listed in /<projName>/dump/dump.csv
#Parameters: projName (project name)
#Returns: None
def download_pdfs(projName):
    dumpPath = "/{}/dump/dump.csv".format(projName)
    #Read in the list of URLs crawled by Apache Nutch and download the PDFs.
    #The with-statement closes the file even if a row raises an exception, and
    #newline="" is what the csv module requires so that line breaks inside
    #quoted fields are parsed correctly
    #NOTE(review): the file is decoded with the platform default encoding, as
    #before -- confirm whether the dump is always UTF-8
    with open(dumpPath, "r", newline="") as f:
        for row in csv.DictReader(f):
            url = row["Url"]
            if not url:
                #Short or blank rows carry no URL, so there is nothing to download
                continue
            #DictReader fills fields missing from a short row with None
            metadata = row["Metadata"] or ""
            if is_pdf(url, metadata):
                download_pdf(url, projName)
    return
#Name: main
#Purpose: Validate the command-line arguments and, if they are valid, download
#         the PDFs for the project named by the first argument
#Parameters: sys.argv (globally defined list of command-line arguments)
#Returns: None
def main():
    #Reject bad input first so the normal path reads straight through
    if not valid_arguments():
        print("\nInvalid arguments\n")
        return
    projName = sys.argv[1]
    download_pdfs(projName)
    return
#Run main() only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()