-
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathscrape_socrata.py
57 lines (51 loc) · 1.97 KB
/
scrape_socrata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import click
import httpx
import time
import pathlib
import json
@click.command()
@click.argument("directory", type=click.Path(file_okay=False, dir_okay=True))
@click.option("save_stats", "--stats", is_flag=True, help="Also write out stats")
@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
def scrape_socrata(directory, save_stats, verbose):
    "Scrape all of Socrata for dataset listings and write results to directory"
    # NOTE: the docstring above is what click prints for --help, so it is kept
    # short; the details live in these comments instead.
    #
    # Output layout (one JSON document per line):
    #   <directory>/<domain>.jsonl        - the record, minus its stats fields
    #   <directory>/<domain>.stats.jsonl  - {"id": ..., "stats": {...}}, only
    #                                       written when --stats is passed
    #
    # Parameters (supplied by click):
    #   directory  - output directory; created if it does not exist
    #   save_stats - when true, also write the per-domain stats files
    #   verbose    - forwarded to fetch_all()
    domain_files = {}
    stats_files = {}
    root = pathlib.Path(directory)
    # exist_ok avoids the check-then-create race of testing exists() first.
    root.mkdir(parents=True, exist_ok=True)
    try:
        for record in fetch_all(verbose):
            resource = record["resource"]
            stats = {"id": resource["id"], "stats": {}}
            # The stats fields are popped out of the record so that the
            # main .jsonl line is written without them.
            if "page_views" in resource:
                stats["stats"].update(resource.pop("page_views"))
            stats["stats"]["download_count"] = resource.pop("download_count")
            domain = record["metadata"]["domain"]
            # Files are opened lazily, the first time a domain is seen, and
            # kept open for the rest of the run.
            if domain not in domain_files:
                domain_files[domain] = (root / "{}.jsonl".format(domain)).open("w")
            domain_files[domain].write(json.dumps(record) + "\n")
            if save_stats:
                if domain not in stats_files:
                    stats_files[domain] = (
                        root / "{}.stats.jsonl".format(domain)
                    ).open("w")
                stats_files[domain].write(json.dumps(stats) + "\n")
    finally:
        # Close (and thereby flush) every file we opened, even if the crawl
        # raised part-way through.
        for handle in (*domain_files.values(), *stats_files.values()):
            handle.close()
def fetch_all(verbose=False):
    """Yield every result from the Socrata catalog API, page by page.

    Requests pages of up to 1000 results from the catalog endpoint. After each
    non-empty page, the ``resource.id`` of its last result is sent as the
    ``scroll_id`` of the next request. Iteration ends at the first page whose
    ``results`` list is empty.

    Args:
        verbose: When true, echo each requested URL to stderr.

    Yields:
        The items of each response's ``results`` list, in order.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an unsuccessful
        HTTP response, plus any transport error raised by httpx.
    """
    base_url = (
        "http://api.us.socrata.com/api/catalog/v1?limit=1000&only=dataset&only=calendar"
    )
    scroll_id = None
    # One client for the whole crawl so the underlying connection is reused
    # between pages instead of being re-established for every request.
    with httpx.Client(timeout=20) as client:
        while True:
            url = base_url
            if scroll_id is not None:
                url = base_url + "&scroll_id=" + scroll_id
            if verbose:
                click.echo(url, err=True)
            response = client.get(url)
            response.raise_for_status()
            results = response.json()["results"]
            if not results:
                break
            # Remember where this page ended before handing results out.
            scroll_id = results[-1]["resource"]["id"]
            yield from results
            # Brief pause between pages to go easy on the API.
            time.sleep(0.5)
# Run the click command when this file is executed as a script.
if __name__ == "__main__":
    scrape_socrata()