气象数据爬取（全国温室数据系统）爬虫及逻辑回归

103 阅读 0 评论 68 点赞

我是靠谱客的博主雪白小白菜，这篇文章主要介绍气象数据爬取（全国温室数据系统）爬虫及逻辑回归，现在分享给大家，希望可以做个参考。

明确爬虫需求

爬取网站：全国温室数据系统

爬取字段：平均气温、相对湿度、风速、日照时数

已知字段：代谢率 h、吸收情况 a、高度角 cos∂、单位照射 R

计算字段：温湿指数、风寒指数、着衣指数、综合指数

利用甘肃省 2000—2019 年夏季（6、7、8 月）的气温、风速、日照时数和相对湿度数据，计算温湿指数、风寒指数、着衣指数和旅游气候舒适度。

复制代码

1
2
# Place name -> five-digit code, kept as strings.
# NOTE(review): presumably these are the station numbers sent as "staNum" — confirm.
diqu = {
    "马鬃山": "52323",
    "鼎新": "52446",
    "敦煌": "52418",
    "玉门镇": "52436",
    "张掖": "52652",
    "永昌": "52674",
    "民勤": "52681",
    "环县": "53821",
    "平凉": "53915",
}

复制代码

# coding=gbk
import requests
import os

# Getfile类的代码引用自https://blog.51cto.com/eddy72/2106091?cid=732015
class Getfile:
    """Download the resource at a URL into a local file, optionally resuming.

    Attributes:
        url: address fetched by ``downfile``.
        header_flag: when True and the target file already exists,
            ``downfile`` sends a ``Range`` header and appends to that file.
    """

    def __init__(self, url):
        self.url = url
        self.header_flag = False  # set to True to resume a partial download

    def downfile(self, filename):
        """Stream ``self.url`` to ``filename`` in 1024-byte chunks.

        Args:
            filename: path of the local file to write.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status
                (this includes 416 when a requested range cannot be served).
        """
        self.headers = {}
        self.mode = 'wb'
        if os.path.exists(filename) and self.header_flag:
            # Ask only for the bytes that are not on disk yet.
            self.headers = {'Range': 'bytes=%d-' % os.path.getsize(filename)}
            self.mode = 'ab'
        self.r = requests.get(self.url, stream=True, headers=self.headers)
        try:
            # Fail loudly rather than saving an error page under the target name.
            self.r.raise_for_status()
            if self.mode == 'ab' and self.r.status_code != 206:
                # The server ignored the Range header and is sending the whole
                # body; appending it would duplicate data, so start over.
                self.mode = 'wb'
            with open(filename, self.mode) as code:
                for chunk in self.r.iter_content(chunk_size=1024):  # write to disk while downloading
                    code.write(chunk)
        finally:
            # A streamed response holds its connection until it is closed.
            self.r.close()

def single_download(paras):
    """Send one export request to the site, then download the matching workbook.

    Args:
        paras: query parameters for the export endpoint. The keys "staNum",
            "subIndex", "year" and "month" are also used to build the name of
            the file that is downloaded and written to the working directory.

    Raises:
        requests.HTTPError: if the export request gets a 4xx/5xx answer.
        KeyError: if one of the four file-name keys is missing from ``paras``.
    """
    # Export endpoint
    url2 = "http://data.sheshiyuanyi.com/WeatherData/php/downloadWeatherData.php"
    # Request headers
    header = {
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0;WOW64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 78.0.3904.108Safari / 537.36"
    }

    # NOTE(review): this request presumably makes the server produce the file
    # fetched below — confirm. Stop here if the server rejects it.
    requests.get(url2, headers=header, params=paras).raise_for_status()
    filename = "{0}_{1}_{2}_{3}.xlsx".format(paras["staNum"], paras["subIndex"], paras["year"], paras["month"])
    down_url = "http://data.sheshiyuanyi.com/WeatherData/datafile/{0}".format(filename)
    temp = Getfile(url=down_url)
    temp.downfile(filename)

# Based on the website's structure

if __name__ == "__main__":
    # Request parameters for a single download
    in_paras = {
        "action": "one",
        "staNum": "52943",
        "index": "air_temperature",
        "subIndex": "max_tem",
        "year": 2005,
        "month": 0,
    }
    single_download(paras=in_paras)
    # Report the same four fields that make up the downloaded file's name.
    name_parts = [in_paras[key] for key in ("staNum", "subIndex", "year", "month")]
    print("Completed: {0}_{1}_{2}_{3}.xlsx".format(*name_parts))

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# coding=gbk
import requests
import os


# Getfile类的代码引用自https://blog.51cto.com/eddy72/2106091?cid=732015
class Getfile:
    """Download the resource at a URL into a local file, optionally resuming.

    Attributes:
        url: address fetched by ``downfile``.
        header_flag: when True and the target file already exists,
            ``downfile`` sends a ``Range`` header and appends to that file.
    """

    def __init__(self, url):
        self.url = url
        self.header_flag = False  # set to True to resume a partial download

    def downfile(self, filename):
        """Stream ``self.url`` to ``filename`` in 1024-byte chunks.

        Args:
            filename: path of the local file to write.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status
                (this includes 416 when a requested range cannot be served).
        """
        self.headers = {}
        self.mode = 'wb'
        if os.path.exists(filename) and self.header_flag:
            # Ask only for the bytes that are not on disk yet.
            self.headers = {'Range': 'bytes=%d-' % os.path.getsize(filename)}
            self.mode = 'ab'
        self.r = requests.get(self.url, stream=True, headers=self.headers)
        try:
            # Fail loudly rather than saving an error page under the target name.
            self.r.raise_for_status()
            if self.mode == 'ab' and self.r.status_code != 206:
                # The server ignored the Range header and is sending the whole
                # body; appending it would duplicate data, so start over.
                self.mode = 'wb'
            with open(filename, self.mode) as code:
                for chunk in self.r.iter_content(chunk_size=1024):  # write to disk while downloading
                    code.write(chunk)
        finally:
            # A streamed response holds its connection until it is closed.
            self.r.close()

def single_download(paras):
    """Send one export request to the site, then download the matching workbook.

    Args:
        paras: query parameters for the export endpoint. The keys "staNum",
            "subIndex", "year" and "month" are also used to build the name of
            the file that is downloaded and written to the working directory.

    Raises:
        requests.HTTPError: if the export request gets a 4xx/5xx answer.
        KeyError: if one of the four file-name keys is missing from ``paras``.
    """
    # Export endpoint
    url2 = "http://data.sheshiyuanyi.com/WeatherData/php/downloadWeatherData.php"
    # Request headers
    header = {
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0;WOW64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 78.0.3904.108Safari / 537.36"
    }

    # NOTE(review): this request presumably makes the server produce the file
    # fetched below — confirm. Stop here if the server rejects it.
    requests.get(url2, headers=header, params=paras).raise_for_status()
    filename = "{0}_{1}_{2}_{3}.xlsx".format(paras["staNum"], paras["subIndex"], paras["year"], paras["month"])
    down_url = "http://data.sheshiyuanyi.com/WeatherData/datafile/{0}".format(filename)
    temp = Getfile(url=down_url)
    temp.downfile(filename)

# Based on the website's structure

if __name__ == "__main__":
    # Request parameters for a single download
    in_paras = {
        "action": "one",
        "staNum": "52943",
        "index": "air_temperature",
        "subIndex": "max_tem",
        "year": 2005,
        "month": 0,
    }
    single_download(paras=in_paras)
    # Report the same four fields that make up the downloaded file's name.
    name_parts = [in_paras[key] for key in ("staNum", "subIndex", "year", "month")]
    print("Completed: {0}_{1}_{2}_{3}.xlsx".format(*name_parts))