概述
运行环境:
python3.7
参考代码:
# 引入模块
import json # 解析JSON后将其转为Python字典或者列表
import os # 文件和路径操作功能
import re # 正则表达式
import requests # 用于网络访问
import time # 时间库
from io import BytesIO # 用于读写文件
# 定义头部
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 "
"Safari/537.36 "
} # Chrome浏览器
# 获取所有部门
def getDepartment(pid, startNum):
# 江苏省预决算公开统一平台
url = "http://yjsgk.jsczt.cn/front/department/getdepartmentname.do?pid=" + str(pid)
res = requests.get(url, headers=headers)
res = res.content
json_res = json.loads(res) # 将json格式数据转换为字典
for i in range(len(json_res)):
if i < startNum:
continue
data = json_res[i]
# 部门id
groupid = data["iid"]
# 部门名称
departmentName = data["name"]
tempPath = "./" + departmentName + "/"
print("编号:" + str(i), "部门:" + departmentName)
# 尝试获取资料
try:
getTry(groupid, tempPath)
except:
# 二次尝试
try:
getTry(groupid, tempPath)
except:
print("获取失败", "编号:" + str(i), "部门:" + departmentName)
# 尝试获取资料
def getTry(groupid, path):
# 部门预决算公开管理文件
# getbmglwj(groupid, path, 1)
# 部门预算
getbudgetfinal(groupid, path, 2)
# 部门决算
# getbudgetfinal(groupid, path, 3)
# 部门专项资金
# getbmcontent(groupid, path, 4)
# 部门资产信息
# getbmcontent(groupid, path, 5)
# 保命
time.sleep(2)
# 部门预决算公开管理文件
def getbmglwj(groupID, path, typeid):
path = path + "部门预决算公开管理文件" + "/"
createDir(path) # 创建存放目录
# http://yjsgk.jsczt.cn/front/bmglwj/bmgkgkwj_page.do?page_num=1&groupid=4972&typeid=1
url = "http://yjsgk.jsczt.cn/front/bmglwj/bmgkgkwj_page.do?page_num=1&groupid=" + str(groupID) + "&typeid=" + str(
typeid)
res = requests.get(url, headers=headers)
res = res.content
json_res = json.loads(res) # 将json格式数据转换为字典
# print(json_res)
# 字典迭代
for data in json_res["depPublicServices"]:
file_uuid = data["file_uuid"] # 文件唯一标识
file_name = data["g_title"] + ".pdf" # 文件名
print("----" + file_name)
file_url = "http://yjsgk.jsczt.cn/front/glwj/download.do?uuid=" + file_uuid # 文件路径
filepath = path + file_name
download(file_url, filepath) # 下载文件
# 部门预决算公开
def getbudgetfinal(groupID, path, typeid):
# http://yjsgk.jsczt.cn/front/budgetfinal/itemsandpag.do?page_num=1&groupid=4972&typeid=2
# http://yjsgk.jsczt.cn/front/budgetfinal/itemsandpag.do?page_num=1&groupid=4972&typeid=3
if typeid == 2:
path = path + "部门预算公开" + "/"
if typeid == 3:
path = path + "部门决算公开" + "/"
createDir(path)
url = "http://yjsgk.jsczt.cn/front/budgetfinal/itemsandpag.do?page_num=1&groupid=" + str(
groupID) + "&typeid=" + str(typeid)
res = requests.get(url, headers=headers)
res = res.content
json_res = json.loads(res)
i = 0
for data in json_res["budgetTemplates"]:
uuid = data["uuid"]
name = data["b_title"]
print("----" + name)
url = "http://yjsgk.jsczt.cn/front/budgetfinal/getTemporaryFiles.do?uuid=" + uuid
# print(url)
tempPath = path + name + "/"
getbudgetUrl(url, tempPath)
if i == 1:
break
i = i + 1
def getbudgetUrl(url, path):
createDir(path)
res = requests.get(url, headers=headers)
res = res.content
json_res = json.loads(res)
for data in json_res:
iid = data["iid"]
file_name = data["t_oldname"]
file_url = "http://yjsgk.jsczt.cn/front/budgetfinal/download.do?iid=" + str(iid)
filepath = path + file_name
download(file_url, filepath)
# 部门专项资金公开
def getbmcontent(groupID, path, typeid):
# http://yjsgk.jsczt.cn/front/bmcontent/bmgcontent_page.do?page_num=1&groupid=4972&typeid=4
if typeid == 4:
path = path + "部门专项资金公开" + "/"
if typeid == 5:
path = path + "部门资产信息公开" + "/"
createDir(path)
url = "http://yjsgk.jsczt.cn/front/bmcontent/bmgcontent_page.do?page_num=1&groupid=" + str(
groupID) + "&typeid=" + str(typeid)
res = requests.get(url, headers)
res = res.content
json_res = json.loads(res)
for data in json_res["departmentContent"]:
subjectiid = data["iid"]
file_name = data["d_title"]
print("----" + file_name)
file_content = data["d_content"]
# http://yjsgk.jsczt.cn/front/bmcontent/file_list.do?subjectiid=45810
url = "http://yjsgk.jsczt.cn/front/bmcontent/file_list.do?subjectiid=" + str(subjectiid)
res = requests.get(url, headers=headers)
res = res.content
if len(res) == 2:
# 无附件
# 直接读取文本,保存为.txt
file_content = file_content.replace("</p>", "n")
file_content = file_content.replace(" ", " ")
file_content = re.sub("<.+?>", "", file_content) # 正则替换
filepath = path + file_name + ".txt"
with open(filepath, "w")as f: # 只写模式打开file
f.write(file_content)
else:
# 有附件
json_res = json.loads(res)
for j_data in json_res:
file_iid = j_data["iid"]
file_name = j_data["file_oldname"]
filepath = path + file_name
# http://yjsgk.jsczt.cn/front/bmcontent/download.do?iid=4228
file_url = "http://yjsgk.jsczt.cn/front/bmcontent/download.do?iid=" + str(file_iid)
download(file_url, filepath)
# 下载
def download(url, filepath):
res = requests.get(url, headers=headers)
data = res.content
with open(filepath, "wb") as f: # 以二进制写模式打开
f.write(data)
time.sleep(1) # 保命....
# 创建目录
def createDir(path):
if not os.path.exists(path):
os.makedirs(path) # 创建目录
pid = 121 # 南京市
# 编号,0为从头开始下载
startNum = 0
getDepartment(pid, startNum)
运行结果:
最后
以上就是畅快曲奇为你收集整理的Python爬取预决算公开文件的全部内容,希望文章能够帮你解决Python爬取预决算公开文件所遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
发表评论 取消回复