-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy paths1_download.py
More file actions
68 lines (57 loc) · 2.21 KB
/
s1_download.py
File metadata and controls
68 lines (57 loc) · 2.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# Name: s1_download.py
# Purpose: Download PDFs discovered during web crawling
# Invocation: python3 s1_download.py <projName>
import codecs
import csv
import os
import re
import subprocess
import sys
# Name:        valid_arguments
# Purpose:     Check whether the command-line arguments are valid
# Parameters:  sys.argv (globally defined list of command-line arguments)
# Returns:     True (all arguments are valid) or False (at least one argument is invalid)
def valid_arguments():
    # Exactly one argument (the project name) must follow the script name
    if len(sys.argv) != 2:
        return False
    # The project name must start with a letter and may continue with letters,
    # underscores, or hyphens.  fullmatch is used instead of search with "$"
    # because "$" also matches just before a trailing newline, which would
    # let a name such as "proj\n" through to the path and command line built
    # from it later on.
    return re.fullmatch(r"[a-zA-Z][a-zA-Z_-]*", sys.argv[1]) is not None
# Name:        is_pdf
# Purpose:     Determine whether the URL points to a PDF, judging by either a
#              ".pdf" extension (any letter case) at the end of the URL or a
#              PDF content type in the metadata
# Parameters:  url (string)
#              metadata (string)
# Returns:     Regular expression match object if either test succeeds (the
#              URL match takes precedence), otherwise None
def is_pdf(url, metadata):
    # Both searches are always performed, so both arguments must be strings
    extension_hit = re.compile(r"^(\S+)\.([pP][dD][fF])$").search(url)
    content_type_hit = re.compile(r"Content-Type:application/pdf").search(metadata)
    if extension_hit:
        return extension_hit
    return content_type_hit
# Name:        download_pdf
# Purpose:     Download the PDF into ./<projName>/download using wget
# Parameters:  url
#              projName (project name)
# Returns:     None
def download_pdf(url, projName):
    user_agent = "SABLE (U.S. Census Bureau research to find alternative data sources and reduce respondent burden) https://github.com/uscensusbureau/sable/; census-aidcrb-support-team@census.gov; For more information, go to www.census.gov/scraping/"
    # Use the Linux/Unix utility wget to download the PDF.  The command is
    # passed as an argument list (no shell), so quotes, "$(...)", backticks,
    # or semicolons inside a crawled URL cannot be interpreted as shell syntax.
    command = [
        "wget",
        "--no-check-certificate",
        "-nv",
        "--user-agent=" + user_agent,
        "-P",
        "./{}/download".format(projName),
        "--",  # end of options: a URL beginning with "-" is not read as a flag
        url,
    ]
    try:
        # The exit status is deliberately ignored, as before: a failed
        # download must not stop the remaining downloads
        subprocess.run(command, check=False)
    except OSError as err:
        # wget could not be launched (for example, it is not installed);
        # report the problem and carry on
        print("Unable to run wget for {}: {}".format(url, err), file=sys.stderr)
    return
# Name:        download_pdfs
# Purpose:     Download PDFs listed in ./<projName>/dump/dump.csv
# Parameters:  projName (project name)
# Returns:     None
def download_pdfs(projName):
    # Read in the list of URLs crawled by Apache Nutch and download the PDFs.
    # The "with" block closes the file even if a row raises an exception, and
    # newline="" is what the csv module requires so that line breaks inside
    # quoted fields are parsed correctly.  The text encoding is left at the
    # platform default, as before.
    with open("./{}/dump/dump.csv".format(projName), "r", newline="") as f:
        for row in csv.DictReader(f):
            # A row shorter than the header yields None for the missing
            # columns; a missing column in the header still raises KeyError
            url = row["Url"]
            if not url:
                continue
            metadata = row["Metadata"] or ""
            if is_pdf(url, metadata):
                download_pdf(url, projName)
    return
# Name:        main
# Purpose:     Validate the command-line arguments and, if they are valid,
#              download the PDFs for the named project; otherwise print a
#              usage message
# Parameters:  sys.argv (globally defined list of command-line arguments)
# Returns:     None
def main():
    # Reject bad arguments first and show how the script should be invoked
    if not valid_arguments():
        print("\nInvalid arguments")
        print("Invocation: python3 s1_download.py <projName>\n")
        return
    download_pdfs(sys.argv[1])


# Run only when executed as a script, not when imported as a module
if __name__ == "__main__":
    main()