#!/usr/bin/env python3
# encoding: utf-8
# Adapted from https://gist.github.com/wy36101299/e3b32c674d9e86ba581f
import requests
import pandas as pd
from time import sleep
from bs4 import BeautifulSoup
import csv
import os
import datetime
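# This script crawls daily weather observations from the Taiwan CWB
# HistoryDataQuery service: one folder per station, one CSV-formatted
# .txt file per day. The station list is read from CWB_Stations_171226.csv;
# judging from the code below, the first column is assumed to hold the
# station ID and the second the station name.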
# Build a list of "YYYY-MM-DD" date strings covering two years (2016-2017).
def date():
    month31 = [1, 3, 5, 7, 8, 10, 12]
    month30 = [4, 6, 9, 11]
    year2 = ['2016', '2017']
    day10 = ['01', '02', '03', '04', '05', '06', '07', '08', '09']
    month12 = day10 + ['10', '11', '12']
    # Zero-padded day strings for 31-, 30-, 29- and 28-day months.
    day31 = day10 + [str(d) for d in range(10, 32)]
    day30 = day10 + [str(d) for d in range(10, 31)]
    day29 = day10 + [str(d) for d in range(10, 30)]
    day28 = day10 + [str(d) for d in range(10, 29)]
    output = []
    for year in year2:
        # February needs 29 days in a leap year (e.g. 2016).
        leap = int(year) % 4 == 0 and (int(year) % 100 != 0 or int(year) % 400 == 0)
        for month, strmonth in zip(range(1, 13), month12):
            if month in month31:
                days = day31
            elif month in month30:
                days = day30
            else:
                days = day29 if leap else day28
            for day in days:
                output.append(year + '-' + strmonth + '-' + day)
    return output
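# Example: with the ranges above, date() yields 731 strings, from
# '2016-01-01' through '2017-12-31' (2016 is a leap year).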
# Crawl one day of observations for one station and save it as CSV.
def crawler(url, name, save_path):
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    form = []
    # Column titles: the first 9 <th> cells belong to the page chrome,
    # the rest are the observation columns.
    titles = soup.find_all("th")
    titles = titles[9:]
    strtitle = []
    for title in titles:
        # Each <th> holds three text pieces (name, Chinese name, unit)
        # separated by <br> tags; join them into one column title.
        title = title.contents
        title = title[0] + title[2] + title[4]
        strtitle.append(title)
    # Data rows: skip the two header rows inside <tbody>.
    soup = soup.tbody
    tmps = soup.find_all("tr")
    tmps = tmps[2:]
    for tmp in tmps:
        tmp = tmp.find_all("td")
        parameter = []
        for strtmp in tmp:
            parameter.append(strtmp.string)
        form.append(parameter)
    form = pd.DataFrame(form, columns=strtitle)
    form.to_csv(save_path + name + ".txt", encoding="utf-8")
    sleep(0.5)  # be polite to the server between requests
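# A shorter alternative sketch (assumption: pandas can locate the data
# table in the returned HTML on its own) would be pandas.read_html:
#   tables = pd.read_html(resp.text)  # list of DataFrames, one per <table>
#   tables[0].to_csv(save_path + name + ".txt", encoding="utf-8")
# The manual BeautifulSoup parsing above gives explicit control over
# which header cells and rows end up in the output.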
if __name__ == "__main__":
    # Dates to download: every day of 2017 (or call date() for 2016-2017).
    start = datetime.datetime.strptime("2017-01-01", "%Y-%m-%d")
    end = datetime.datetime.strptime("2018-01-01", "%Y-%m-%d")
    date_generated = [(start + datetime.timedelta(days=x)).strftime("%Y-%m-%d") for x in range((end - start).days)]
    download = date_generated
    # download = date()
    hostUrl = "http://e-service.cwb.gov.tw/HistoryDataQuery/DayDataController.do?"
    fixedParameter = "command=viewMain"
    with open('CWB_Stations_171226.csv', newline='', encoding='utf-8') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        firstline = True
        for row in spamreader:
            if firstline:  # skip the header line
                firstline = False
                continue
            # print(', '.join(row))
            station_id = row[0]
            station_name = row[1]
            save_path = './data/%s_%s/' % (station_id, station_name)
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            for date in download:
                url = '%s%s&station=%s&stname=%s&datepicker=%s' % (hostUrl, fixedParameter, station_id, station_name, date)
                try:
                    print(url)
                    crawler(url, date, save_path)
                except Exception:
                    # If a day fails, log its date to error.txt and move on.
                    with open(save_path + "error.txt", 'a') as f:
                        f.write(date + '\n')
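# Example of a generated query URL (the station ID and name below are
# illustrative placeholders, not read from the station list):
# http://e-service.cwb.gov.tw/HistoryDataQuery/DayDataController.do?command=viewMain&station=466920&stname=臺北&datepicker=2017-01-01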