从环境云网站爬取数据

136 阅读 0 评论 90 点赞

我是靠谱客的博主幸福发带，这篇文章主要介绍从环境云网站爬取数据，现在分享给大家，希望可以做个参考。

1、环境云网站：http://www.envicloud.cn/
网站中包含全国各地关于环境方面的数据，我们这次只读取了广东省24小时天气历史数据。
2、接口的用法请参考环境云网站的帮助页面（http://www.envicloud.cn/pages/guide.html）。
3、代码使用python语言实现
4、爬取需要key，免费申请
5、爬取后的数据保存在Oracle数据库中，所以需要安装cx_Oracle模块。
安装方式可以在 https://oracle.github.io/python-cx_Oracle/ 中查询。
我使用命令行执行python -m pip install cx_Oracle --upgrade

即可安装完毕

复制代码

#coding:utf-8 #用于显示代码中的汉字
'''
简介：
天气数据获取脚本
数据获取流程(需要条件：环境云密钥)：
1.首先在数据库中读取需要抓取气象数据的城市数据，这部分也可以直接在环境云网站爬取
2.通过环境云API接口获取天气数据
3.对第二步中产生的JSON格式数据进行处理存入到数据库
'''
import json,sys,urllib2,time,os
import pandas as pd
import cx_Oracle
import datetime
reload(sys)#调用setdefaultencoding时必须要先reload一次sys模块
sys.setdefaultencoding('utf8')
''' 获取监测站信息（即城市信息。
这里注意，以广州为例，广州分为广州、番禺、从化、增城、花都等几部分，
其中广州代表了广州城区。所以这里使用监测站更准确）
数据表字段说明
FID
VARCHAR2(20)
Y
序号
FSITEID
VARCHAR2(20)
Y
监测站号
FSITENAME
VARCHAR2(20)
Y
监测站名
FCITYTYPE
VARCHAR2(20)
Y
行政级别
FCITY
VARCHAR2(20)
Y
市
FLON
NUMBER(5,2)
Y
经度
FLAT
NUMBER(5,2)
Y
纬度
FABOVESEALEVEL
VARCHAR2(20)
Y
海拔
FADDRNO
VARCHAR2(20)
Y
地域编码
FDATASOURCE
VARCHAR2(20)
Y
数据来源（1为气候中心数据；2为环境云数据）
'''
def getsitedata(tablename,datasource):
    """Fetch the monitoring stations that belong to one data source.

    Uses the module-level cursor ``cur``.

    Args:
        tablename: Name of the station table. Identifiers cannot be bound,
            so it is interpolated into the SQL text and must come from
            trusted code only.
        datasource: Value compared with the ``fdatasource`` column.

    Returns:
        A list of ``(fsiteid, fsitename, faddrno)`` rows.
    """
    sql = ("select fsiteid,fsitename,faddrno from %s "
           "where fdatasource = :datasource" % tablename)
    # Bind the filter value instead of formatting it into the statement.
    # str() keeps the comparison textual, as the quoted literal did.
    cur.execute(sql, {'datasource': str(datasource)})
    return cur.fetchall()
''' 获取数据
参数1：监控站
参数2：开始日期
参数3：结束日期
参数4：环境云key
'''
def getapidata(sitedata,begin_date,end_date,key):
    """Download hourly weather history from EnviCloud and store it.

    For every station in ``sitedata`` and every day produced by
    ``pd.date_range(begin_date, end_date)``, the 24 hourly records are
    requested one at a time. A record is inserted when the response has
    ``rdesc == "Success"`` and ``checksitedata`` finds no row with the same
    station and ``updatetime``.

    Each hour is handled independently: a timeout or a malformed response
    is reported and only that hour is skipped.

    Args:
        sitedata: Iterable of rows whose first element is the station id
            (the rows returned by ``getsitedata``).
        begin_date: First day to fetch, in a form ``pd.date_range`` accepts.
        end_date: Last day to fetch, in a form ``pd.date_range`` accepts.
        key: EnviCloud API key; it is placed in the request URL.

    Returns:
        None. Progress messages are printed.
    """
    # Standard HTTP request header asking that no cached response be used.
    headers = {'cache-control': "no-cache"}
    # Response layout: http://www.envicloud.cn/pages/guide.html#v2dailyweatherhistory
    url_template = 'http://service.envicloud.cn:8082/v2/weatherhistory/%s/%s/%s/%s'

    for site in sitedata:
        siteid = site[0]
        for querydate in pd.date_range(begin_date,end_date):
            # The API expects the day as YYYYMMDD.
            querydate_true = querydate.strftime('%Y%m%d')
            for hour in range(24):
                print(hour)
                sHour = '%02d' % hour  # the API expects a two-digit hour
                url = url_template % (key, siteid, querydate_true, sHour)
                try:
                    request = urllib2.Request(url, headers=headers)
                    response = urllib2.urlopen(request, timeout=3)
                    try:
                        data = json.loads(response.read())
                    finally:
                        response.close()

                    # Some hours have no data on the server; those responses
                    # are not "Success" and are skipped silently.
                    if data['rdesc'] != "Success":
                        continue

                    # Duplicate check on the exact observation time. Counting
                    # rows per day would not work: a day may legitimately
                    # hold fewer than 24 records.
                    if checksitedata(siteid, data['updatetime']) > 0:
                        print('data already exists')
                        continue

                    # NOTE: values are stored as received; implausible
                    # readings from the API are not filtered here.
                    insertintooracle(siteid, data['updatetime'],
                                     data['phenomena'],
                                     data['temperature'],
                                     data['feelst'],
                                     data['airpressure'],
                                     data['humidity'],
                                     data['rain'],
                                     data['winddirect'],
                                     data['windpower'],
                                     data['windspeed'])
                    print('result is good')
                except Exception:
                    # Best effort: report the station and move on to the
                    # next hour instead of abandoning the rest of the day.
                    print(siteid)
                    print('result is bad')
        print('one city inserted')
    print('all citys inserted')
''' 查询该监测站在某个日期一共有多少条监测记录'''
def checksitedata(siteid,querydate):
    """Count the stored records of one station at one observation time.

    Uses the module-level cursor ``cur``. Both arguments are sent as bind
    variables, so their content cannot alter the statement.

    Args:
        siteid: Station id compared with ``imp_weather_data.siteid``.
        querydate: Observation time as text. ``':00'`` is appended before
            it is parsed with the Oracle mask ``yyyy-mm-dd hh24:mi:ss``.

    Returns:
        The number of matching rows.
    """
    sql = ("select count(*) from imp_weather_data "
           "where siteid = :siteid "
           "and updatetime = to_date(:querydate || ':00','yyyy-mm-dd hh24:mi:ss')")
    cur.execute(sql, {'siteid': siteid, 'querydate': querydate})
    # A read-only query needs no commit.
    return cur.fetchone()[0]
'''找出数据库表中的记录总数，用于判断插入时的id'''
def checkAllData():
    """Return the total number of rows in ``imp_weather_data``.

    Uses the module-level cursor ``cur``. ``insertintooracle`` adds one to
    this count to obtain the ``dataid`` of the next row.

    Returns:
        The row count.
    """
    cur.execute("select count(*) from imp_weather_data ")
    # A read-only query needs no commit.
    return cur.fetchone()[0]
def insertintooracle(siteid,updatetime,phenomena,temperature,feelst,airpressure,humidity,rain,winddirect,windpower,windspeed):
    """Insert one hourly observation into ``imp_weather_data`` and commit.

    Uses the module-level ``cur`` and ``conn``. Every value is passed as a
    bind variable, so quotes in the text fields cannot break the statement
    and numeric or missing (``None``) values are accepted as well as text.

    Args:
        siteid: Station id.
        updatetime: Observation time as text. ``':00'`` is appended before
            it is parsed with the Oracle mask ``YYYY-MM-DD HH24:MI:ss``.
        phenomena: Weather phenomenon text.
        temperature: Air temperature; stored through ``to_number``.
        feelst: Apparent temperature; stored through ``to_number``.
        airpressure: Air pressure; stored through ``to_number``.
        humidity: Relative humidity; stored through ``to_number``.
        rain: Rainfall; stored through ``to_number``.
        winddirect: Wind direction text.
        windpower: Wind force text.
        windspeed: Wind speed; stored through ``to_number``.
    """
    # NOTE: "row count + 1" yields a duplicate id after deletions or with
    # concurrent writers; an Oracle sequence would be the robust choice.
    table_num = checkAllData() + 1
    sql = """INSERT INTO imp_weather_data(dataid,siteid,updatetime,phenomena,temperature,
        feelst,airpressure,humidity,rain,winddirect,windpower,windspeed)
        VALUES (:dataid,:siteid,
        to_date(:updatetime || ':00','YYYY-MM-DD HH24:MI:ss'),:phenomena,
        to_number(:temperature),to_number(:feelst),
        to_number(:airpressure),to_number(:humidity),
        to_number(:rain),:winddirect,:windpower,
        to_number(:windspeed))"""
    cur.execute(sql, {
        'dataid': table_num,
        'siteid': siteid,
        'updatetime': updatetime,
        'phenomena': phenomena,
        'temperature': temperature,
        'feelst': feelst,
        'airpressure': airpressure,
        'humidity': humidity,
        'rain': rain,
        'winddirect': winddirect,
        'windpower': windpower,
        'windspeed': windspeed,
    })
    conn.commit()
if __name__=='__main__':
    # Placeholders: replace parName, parPw, parIP and ParDbName with the
    # database account, password, host address and database name.
    connect='parName/parPw@parIP/ParDbName'
    # Placeholder: replace with your own EnviCloud API key.
    key='parKey'
    conn = cx_Oracle.connect(connect)
    try:
        # ``cur`` and ``conn`` are module globals used by the functions above.
        cur = conn.cursor()
        # Data source '2' marks EnviCloud stations ('1' is the climate centre).
        sitedata = getsitedata('imp_weather_sites','2')
        sysdate = datetime.datetime.now()
        # Fetch the two most recent complete days, as YYYYMMDD.
        end_date = (sysdate - datetime.timedelta(days=1)).strftime('%Y%m%d')
        begin_date = (sysdate - datetime.timedelta(days=2)).strftime('%Y%m%d')
        getapidata(sitedata,begin_date,end_date,key)
    finally:
        # Release the connection even when the run fails part-way.
        conn.close()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
#coding:utf-8 #用于显示代码中的汉字
'''
简介：
天气数据获取脚本
数据获取流程(需要条件：环境云密钥)：
1.首先在数据库中读取需要抓取气象数据的城市数据，这部分也可以直接在环境云网站爬取
2.通过环境云API接口获取天气数据
3.对第二步中产生的JSON格式数据进行处理存入到数据库
'''
import json,sys,urllib2,time,os
import pandas as pd
import cx_Oracle
import datetime
reload(sys)#调用setdefaultencoding时必须要先reload一次sys模块
sys.setdefaultencoding('utf8')
''' 获取监测站信息（即城市信息。
这里注意，以广州为例，广州分为广州、番禺、从化、增城、花都等几部分，
其中广州代表了广州城区。所以这里使用监测站更准确）
数据表字段说明
FID
VARCHAR2(20)
Y
序号
FSITEID
VARCHAR2(20)
Y
监测站号
FSITENAME
VARCHAR2(20)
Y
监测站名
FCITYTYPE
VARCHAR2(20)
Y
行政级别
FCITY
VARCHAR2(20)
Y
市
FLON
NUMBER(5,2)
Y
经度
FLAT
NUMBER(5,2)
Y
纬度
FABOVESEALEVEL
VARCHAR2(20)
Y
海拔
FADDRNO
VARCHAR2(20)
Y
地域编码
FDATASOURCE
VARCHAR2(20)
Y
数据来源（1为气候中心数据；2为环境云数据）
'''
def getsitedata(tablename,datasource):
    """Fetch the monitoring stations that belong to one data source.

    Uses the module-level cursor ``cur``.

    Args:
        tablename: Name of the station table. Identifiers cannot be bound,
            so it is interpolated into the SQL text and must come from
            trusted code only.
        datasource: Value compared with the ``fdatasource`` column.

    Returns:
        A list of ``(fsiteid, fsitename, faddrno)`` rows.
    """
    sql = ("select fsiteid,fsitename,faddrno from %s "
           "where fdatasource = :datasource" % tablename)
    # Bind the filter value instead of formatting it into the statement.
    # str() keeps the comparison textual, as the quoted literal did.
    cur.execute(sql, {'datasource': str(datasource)})
    return cur.fetchall()
''' 获取数据
参数1：监控站
参数2：开始日期
参数3：结束日期
参数4：环境云key
'''
def getapidata(sitedata,begin_date,end_date,key):
    """Download hourly weather history from EnviCloud and store it.

    For every station in ``sitedata`` and every day produced by
    ``pd.date_range(begin_date, end_date)``, the 24 hourly records are
    requested one at a time. A record is inserted when the response has
    ``rdesc == "Success"`` and ``checksitedata`` finds no row with the same
    station and ``updatetime``.

    Each hour is handled independently: a timeout or a malformed response
    is reported and only that hour is skipped.

    Args:
        sitedata: Iterable of rows whose first element is the station id
            (the rows returned by ``getsitedata``).
        begin_date: First day to fetch, in a form ``pd.date_range`` accepts.
        end_date: Last day to fetch, in a form ``pd.date_range`` accepts.
        key: EnviCloud API key; it is placed in the request URL.

    Returns:
        None. Progress messages are printed.
    """
    # Standard HTTP request header asking that no cached response be used.
    headers = {'cache-control': "no-cache"}
    # Response layout: http://www.envicloud.cn/pages/guide.html#v2dailyweatherhistory
    url_template = 'http://service.envicloud.cn:8082/v2/weatherhistory/%s/%s/%s/%s'

    for site in sitedata:
        siteid = site[0]
        for querydate in pd.date_range(begin_date,end_date):
            # The API expects the day as YYYYMMDD.
            querydate_true = querydate.strftime('%Y%m%d')
            for hour in range(24):
                print(hour)
                sHour = '%02d' % hour  # the API expects a two-digit hour
                url = url_template % (key, siteid, querydate_true, sHour)
                try:
                    request = urllib2.Request(url, headers=headers)
                    response = urllib2.urlopen(request, timeout=3)
                    try:
                        data = json.loads(response.read())
                    finally:
                        response.close()

                    # Some hours have no data on the server; those responses
                    # are not "Success" and are skipped silently.
                    if data['rdesc'] != "Success":
                        continue

                    # Duplicate check on the exact observation time. Counting
                    # rows per day would not work: a day may legitimately
                    # hold fewer than 24 records.
                    if checksitedata(siteid, data['updatetime']) > 0:
                        print('data already exists')
                        continue

                    # NOTE: values are stored as received; implausible
                    # readings from the API are not filtered here.
                    insertintooracle(siteid, data['updatetime'],
                                     data['phenomena'],
                                     data['temperature'],
                                     data['feelst'],
                                     data['airpressure'],
                                     data['humidity'],
                                     data['rain'],
                                     data['winddirect'],
                                     data['windpower'],
                                     data['windspeed'])
                    print('result is good')
                except Exception:
                    # Best effort: report the station and move on to the
                    # next hour instead of abandoning the rest of the day.
                    print(siteid)
                    print('result is bad')
        print('one city inserted')
    print('all citys inserted')
''' 查询该监测站在某个日期一共有多少条监测记录'''
def checksitedata(siteid,querydate):
    """Count the stored records of one station at one observation time.

    Uses the module-level cursor ``cur``. Both arguments are sent as bind
    variables, so their content cannot alter the statement.

    Args:
        siteid: Station id compared with ``imp_weather_data.siteid``.
        querydate: Observation time as text. ``':00'`` is appended before
            it is parsed with the Oracle mask ``yyyy-mm-dd hh24:mi:ss``.

    Returns:
        The number of matching rows.
    """
    sql = ("select count(*) from imp_weather_data "
           "where siteid = :siteid "
           "and updatetime = to_date(:querydate || ':00','yyyy-mm-dd hh24:mi:ss')")
    cur.execute(sql, {'siteid': siteid, 'querydate': querydate})
    # A read-only query needs no commit.
    return cur.fetchone()[0]
'''找出数据库表中的记录总数，用于判断插入时的id'''
def checkAllData():
    """Return the total number of rows in ``imp_weather_data``.

    Uses the module-level cursor ``cur``. ``insertintooracle`` adds one to
    this count to obtain the ``dataid`` of the next row.

    Returns:
        The row count.
    """
    cur.execute("select count(*) from imp_weather_data ")
    # A read-only query needs no commit.
    return cur.fetchone()[0]
def insertintooracle(siteid,updatetime,phenomena,temperature,feelst,airpressure,humidity,rain,winddirect,windpower,windspeed):
    """Insert one hourly observation into ``imp_weather_data`` and commit.

    Uses the module-level ``cur`` and ``conn``. Every value is passed as a
    bind variable, so quotes in the text fields cannot break the statement
    and numeric or missing (``None``) values are accepted as well as text.

    Args:
        siteid: Station id.
        updatetime: Observation time as text. ``':00'`` is appended before
            it is parsed with the Oracle mask ``YYYY-MM-DD HH24:MI:ss``.
        phenomena: Weather phenomenon text.
        temperature: Air temperature; stored through ``to_number``.
        feelst: Apparent temperature; stored through ``to_number``.
        airpressure: Air pressure; stored through ``to_number``.
        humidity: Relative humidity; stored through ``to_number``.
        rain: Rainfall; stored through ``to_number``.
        winddirect: Wind direction text.
        windpower: Wind force text.
        windspeed: Wind speed; stored through ``to_number``.
    """
    # NOTE: "row count + 1" yields a duplicate id after deletions or with
    # concurrent writers; an Oracle sequence would be the robust choice.
    table_num = checkAllData() + 1
    sql = """INSERT INTO imp_weather_data(dataid,siteid,updatetime,phenomena,temperature,
        feelst,airpressure,humidity,rain,winddirect,windpower,windspeed)
        VALUES (:dataid,:siteid,
        to_date(:updatetime || ':00','YYYY-MM-DD HH24:MI:ss'),:phenomena,
        to_number(:temperature),to_number(:feelst),
        to_number(:airpressure),to_number(:humidity),
        to_number(:rain),:winddirect,:windpower,
        to_number(:windspeed))"""
    cur.execute(sql, {
        'dataid': table_num,
        'siteid': siteid,
        'updatetime': updatetime,
        'phenomena': phenomena,
        'temperature': temperature,
        'feelst': feelst,
        'airpressure': airpressure,
        'humidity': humidity,
        'rain': rain,
        'winddirect': winddirect,
        'windpower': windpower,
        'windspeed': windspeed,
    })
    conn.commit()
if __name__=='__main__':
    # Placeholders: replace parName, parPw, parIP and ParDbName with the
    # database account, password, host address and database name.
    connect='parName/parPw@parIP/ParDbName'
    # Placeholder: replace with your own EnviCloud API key.
    key='parKey'
    conn = cx_Oracle.connect(connect)
    try:
        # ``cur`` and ``conn`` are module globals used by the functions above.
        cur = conn.cursor()
        # Data source '2' marks EnviCloud stations ('1' is the climate centre).
        sitedata = getsitedata('imp_weather_sites','2')
        sysdate = datetime.datetime.now()
        # Fetch the two most recent complete days, as YYYYMMDD.
        end_date = (sysdate - datetime.timedelta(days=1)).strftime('%Y%m%d')
        begin_date = (sysdate - datetime.timedelta(days=2)).strftime('%Y%m%d')
        getapidata(sitedata,begin_date,end_date,key)
    finally:
        # Release the connection even when the run fails part-way.
        conn.close()