Python爬取预决算公开文件

200 阅读 0 评论 132 点赞

我是靠谱客的博主畅快曲奇，这篇文章主要介绍Python爬取预决算公开文件，现在分享给大家，希望可以做个参考。

运行环境：

python3.7

参考代码：

# 引入模块
import json  # 解析JSON后将其转为Python字典或者列表
import os  # 文件和路径操作功能
import re  # 正则表达式
import requests  # 用于网络访问
import time  # 时间库
from io import BytesIO  # 用于读写文件

# Default HTTP request headers: identify the crawler as a desktop Chrome browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/69.0.3497.100 Safari/537.36 "
    ),
}


# Fetch every department and download its files
def getDepartment(pid, startNum):
    """Download the published files of every department listed for ``pid``.

    Requests the department list from the Jiangsu Province budget /
    final-accounts disclosure platform, then calls ``getTry`` once per
    department with the target directory ``./<department name>/``. A
    department whose download raises is retried once; if the retry also
    fails, a failure line is printed and the loop moves on.

    Args:
        pid: Value sent as the ``pid`` query parameter of the department
            list request.
        startNum: Index of the first department to process. Entries with a
            smaller index are skipped; 0 starts from the beginning.

    Raises:
        Whatever ``requests.get`` / ``json.loads`` raise while fetching the
        department list itself; only per-department failures are caught.
    """
    url = "http://yjsgk.jsczt.cn/front/department/getdepartmentname.do?pid=" + str(pid)
    res = requests.get(url, headers=headers)
    departments = json.loads(res.content)
    for i, data in enumerate(departments):
        if i < startNum:
            continue
        groupid = data["iid"]  # department id
        departmentName = data["name"]
        tempPath = "./" + departmentName + "/"
        print("编号:" + str(i), "部门:" + departmentName)
        # Two attempts in total. Catch Exception rather than using a bare
        # ``except`` so that Ctrl-C (KeyboardInterrupt) and SystemExit still
        # stop the crawler instead of being treated as a failed download.
        for attempt in range(2):
            try:
                getTry(groupid, tempPath)
                break
            except Exception as err:
                if attempt == 1:
                    print("获取失败", "编号:" + str(i), "部门:" + departmentName, repr(err))


# Try to fetch the materials of one department
def getTry(groupid, path):
    """Run the enabled download steps for one department, then pause.

    Only the budget step (``getbudgetfinal`` with typeid 2) is active; the
    other categories are left commented out below and can be re-enabled by
    uncommenting the corresponding call.

    Args:
        groupid: Department id forwarded to the download function.
        path: Directory prefix forwarded to the download function.
    """
    # Department budget / final-accounts disclosure management documents
    # getbmglwj(groupid, path, 1)
    # Department budget
    getbudgetfinal(groupid, path, 2)
    # Department final accounts
    # getbudgetfinal(groupid, path, 3)
    # Department special funds
    # getbmcontent(groupid, path, 4)
    # Department asset information
    # getbmcontent(groupid, path, 5)
    # Pause between departments (the original note reads "stay alive" --
    # presumably to avoid being blocked by the server; confirm).
    time.sleep(2)


# Department budget / final-accounts disclosure management documents
def getbmglwj(groupID, path, typeid):
    """Download the management documents listed for one department.

    Creates ``<path>部门预决算公开管理文件/``, requests the first page
    (``page_num=1``) of the document listing and hands every entry to
    ``download``, saving it under its title with a ``.pdf`` suffix.

    Args:
        groupID: Department id sent as the ``groupid`` query parameter.
        path: Directory prefix (expected to end with ``/``).
        typeid: Category id sent as the ``typeid`` query parameter.
    """
    target_dir = path + "部门预决算公开管理文件" + "/"
    createDir(target_dir)
    # Example listing URL:
    # http://yjsgk.jsczt.cn/front/bmglwj/bmgkgkwj_page.do?page_num=1&groupid=4972&typeid=1
    listing_url = (
        "http://yjsgk.jsczt.cn/front/bmglwj/bmgkgkwj_page.do?page_num=1"
        "&groupid={}&typeid={}".format(groupID, typeid)
    )
    response = requests.get(listing_url, headers=headers)
    listing = json.loads(response.content)
    for entry in listing["depPublicServices"]:
        document_uuid = entry["file_uuid"]  # unique id of the file
        pdf_name = entry["g_title"] + ".pdf"
        print("----" + pdf_name)
        download(
            "http://yjsgk.jsczt.cn/front/glwj/download.do?uuid=" + document_uuid,
            target_dir + pdf_name,
        )


# Department budget / final-accounts disclosure
def getbudgetfinal(groupID, path, typeid):
    """Download the files of the first two budget/final-accounts entries.

    For typeid 2 the files go under ``<path>部门预算公开/``, for typeid 3
    under ``<path>部门决算公开/``; any other typeid leaves ``path`` as is.
    The first page of the listing is requested and, for each of the first
    two entries, ``getbudgetUrl`` is called with a sub-directory named
    after the entry title.

    Args:
        groupID: Department id sent as the ``groupid`` query parameter.
        path: Directory prefix (expected to end with ``/``).
        typeid: Category id sent as the ``typeid`` query parameter.
    """
    # Example listing URLs:
    # http://yjsgk.jsczt.cn/front/budgetfinal/itemsandpag.do?page_num=1&groupid=4972&typeid=2
    # http://yjsgk.jsczt.cn/front/budgetfinal/itemsandpag.do?page_num=1&groupid=4972&typeid=3
    if typeid == 2:
        path += "部门预算公开/"
    elif typeid == 3:
        path += "部门决算公开/"
    createDir(path)
    listing_url = (
        "http://yjsgk.jsczt.cn/front/budgetfinal/itemsandpag.do?page_num=1"
        "&groupid={}&typeid={}".format(groupID, typeid)
    )
    listing = json.loads(requests.get(listing_url, headers=headers).content)
    for position, template in enumerate(listing["budgetTemplates"]):
        template_uuid = template["uuid"]
        title = template["b_title"]
        print("----" + title)
        getbudgetUrl(
            "http://yjsgk.jsczt.cn/front/budgetfinal/getTemporaryFiles.do?uuid=" + template_uuid,
            path + title + "/",
        )
        # Stop once the second entry (index 1) has been handled.
        if position == 1:
            break


def getbudgetUrl(url, path):
    """Download every file listed by the JSON document at ``url``.

    Creates the directory ``path``, fetches the file list and passes each
    entry to ``download``, saving it under its ``t_oldname`` value.

    Args:
        url: Address returning a JSON array of file descriptors.
        path: Target directory prefix (expected to end with ``/``).
    """
    createDir(path)
    response = requests.get(url, headers=headers)
    for entry in json.loads(response.content):
        file_id = entry["iid"]
        original_name = entry["t_oldname"]
        source = "http://yjsgk.jsczt.cn/front/budgetfinal/download.do?iid=" + str(file_id)
        download(source, path + original_name)


def _html_to_text(html):
    """Reduce an HTML fragment to plain text, one line per closed paragraph."""
    text = html.replace("</p>", "\n")
    text = text.replace("&nbsp;", " ")
    # Drop every remaining tag (non-greedy, so each tag is removed on its own).
    return re.sub(r"<.+?>", "", text)


# Department special funds / asset information disclosure
def getbmcontent(groupID, path, typeid):
    """Download the content entries of one department for a category.

    For typeid 4 the files go under ``<path>部门专项资金公开/``, for typeid
    5 under ``<path>部门资产信息公开/``; any other typeid leaves ``path``
    unchanged. The first page of the listing is requested. For each entry
    the attachment list is fetched: entries without attachments have their
    HTML body converted to text and saved as ``<title>.txt``, otherwise
    every attachment is handed to ``download``.

    Args:
        groupID: Department id sent as the ``groupid`` query parameter.
        path: Directory prefix (expected to end with ``/``).
        typeid: Category id sent as the ``typeid`` query parameter.
    """
    # Example listing URL:
    # http://yjsgk.jsczt.cn/front/bmcontent/bmgcontent_page.do?page_num=1&groupid=4972&typeid=4
    if typeid == 4:
        path = path + "部门专项资金公开" + "/"
    if typeid == 5:
        path = path + "部门资产信息公开" + "/"
    createDir(path)
    url = "http://yjsgk.jsczt.cn/front/bmcontent/bmgcontent_page.do?page_num=1&groupid=" + str(
        groupID) + "&typeid=" + str(typeid)
    # ``headers`` must be passed by keyword: the second positional argument
    # of requests.get is ``params``, which would put the User-Agent into the
    # query string instead of the request headers.
    res = requests.get(url, headers=headers)
    listing = json.loads(res.content)
    for data in listing["departmentContent"]:
        subjectiid = data["iid"]
        title = data["d_title"]
        print("----" + title)
        file_content = data["d_content"]
        # Example: http://yjsgk.jsczt.cn/front/bmcontent/file_list.do?subjectiid=45810
        files_url = "http://yjsgk.jsczt.cn/front/bmcontent/file_list.do?subjectiid=" + str(subjectiid)
        files_body = requests.get(files_url, headers=headers).content
        if len(files_body) == 2:
            # A two-byte body is treated as "no attachments" (presumably an
            # empty JSON array "[]"): save the entry text itself instead.
            # Explicit UTF-8 so the write does not depend on the platform
            # default encoding.
            with open(path + title + ".txt", "w", encoding="utf-8") as f:
                f.write(_html_to_text(file_content))
        else:
            # Attachments present: download each one under its original name.
            for attachment in json.loads(files_body):
                file_iid = attachment["iid"]
                filepath = path + attachment["file_oldname"]
                # Example: http://yjsgk.jsczt.cn/front/bmcontent/download.do?iid=4228
                file_url = "http://yjsgk.jsczt.cn/front/bmcontent/download.do?iid=" + str(file_iid)
                download(file_url, filepath)


# Download one file
def download(url, filepath, timeout=30):
    """Fetch ``url`` and save the response body to ``filepath``.

    The body is written only when the server answers with a success status;
    on an HTTP error status a message is printed and no file is created, so
    error pages are not stored as if they were the requested document.
    After every call the function sleeps for one second.

    Args:
        url: Address of the file to fetch.
        filepath: Destination path; the file is opened in binary write mode.
        timeout: Seconds passed to ``requests.get`` as ``timeout`` so that a
            stalled connection raises instead of blocking indefinitely.

    Raises:
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    if res.ok:
        with open(filepath, "wb") as f:
            f.write(res.content)
    else:
        print("下载失败", res.status_code, url)

    # Pause after every download (the original note reads "stay alive" --
    # presumably to avoid being blocked by the server).
    time.sleep(1)


# Create a directory
def createDir(path):
    """Create directory ``path`` (including missing parents) if needed.

    Calling it for a directory that already exists is a no-op.

    Args:
        path: Directory to create.
    """
    # exist_ok avoids the separate existence check, so there is no window
    # between "check" and "create" in which the directory can appear.
    os.makedirs(path, exist_ok=True)


pid = 121  # Nanjing City
# Index of the first department to process; 0 downloads from the beginning
startNum = 0

# Script entry: start the crawl with the settings above.
getDepartment(pid, startNum)