crawler_yearly.py
#!/usr/bin/env python
# encoding: utf-8
# Adapted from https://gist.github.com/wy36101299/e3b32c674d9e86ba581f
import csv
import json
import os
from time import sleep

import requests
from bs4 import BeautifulSoup

OVER_WRITE = False


def generate_query_year():
    return range(2005, 2018)


# https://stackoverflow.com/questions/209840/map-two-lists-into-a-dictionary-in-python
def mapping_two_list_to_dict(keys, values):
    return dict(zip(keys, values))


def crawler(url):
    """Fetch one station-year page and return {ObsTime: row-dict}."""
    json_data = {}
    resp = requests.get(url, timeout=30)
    soup = BeautifulSoup(resp.text, 'html.parser')
    trs = soup.find_all('tr')
    # The third table row holds headers such as "氣溫(℃)Temperature";
    # keep only the English name after the closing parenthesis.
    ths = trs[2].find_all('th')
    title = [th.text.split(')')[1].strip() for th in ths]
    # Fix: the time-of-1-day-max-precipitation column has no English
    # name, so its header parses to an empty string.
    title[title.index('')] = 'Precp1DayMaxTime'
    for tr in trs[3:]:
        tds = tr.find_all('td')
        row = [td.text.strip() for td in tds]
        dictionary = mapping_two_list_to_dict(title, row)
        json_data[dictionary['ObsTime']] = dictionary
    return json_data


def write_json(json_data, save_dir_path, filename):
    # makedirs also creates the parent ./CODiS-data/ directory if needed.
    os.makedirs(save_dir_path, exist_ok=True)
    with open(os.path.join(save_dir_path, filename), 'w', encoding='utf-8') as outfile:
        json.dump(json_data, outfile, indent=4, ensure_ascii=False)


if __name__ == "__main__":
    year_generated = generate_query_year()
    hostUrl = "http://e-service.cwb.gov.tw/HistoryDataQuery/YearDataController.do?command=viewMain"
    # read stations
    with open('CWB_Stations_171226.csv', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader)  # skip the header line
        for row in reader:
            station_id, station_name = row[0], row[1]
            print(station_id, station_name)
            # crawl every year for this station
            for year in year_generated:
                url = '%s&station=%s&stname=%s&datepicker=%s' % (hostUrl, station_id, station_name, year)
                print(url)
                save_dir_path = './CODiS-data/%s_%s/' % (station_id, station_name)
                filename = '%s.json' % year
                if not os.path.exists(save_dir_path + filename) or OVER_WRITE:
                    print(save_dir_path + filename)
                    try:
                        json_data = crawler(url)
                        write_json(json_data, save_dir_path, filename)
                    except Exception as e:
                        print(e)
                    sleep(0.4)  # throttle requests to the CWB server
                else:
                    print('pass')
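
For reference, a minimal sketch of the files the script expects and produces. The layout of CWB_Stations_171226.csv is an assumption inferred from how the script indexes its rows, and the station row and the 'Temperature' field below are purely illustrative:

# CWB_Stations_171226.csv is assumed to be a UTF-8 CSV with one header
# row, the station ID in column 0 and the station name in column 1:
#
#   station_id,station_name
#   466920,臺北
#
# Reading back one of the JSON files the crawler writes (hypothetical
# path; field names come from the English table headers parsed above):
import json

with open('./CODiS-data/466920_臺北/2016.json', encoding='utf-8') as f:
    year_data = json.load(f)

# Keys are the scraped ObsTime values; each maps to a dict of
# column name -> cell text (all values are strings).
for obs_time, record in sorted(year_data.items()):
    print(obs_time, record.get('Temperature'))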