Overview
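The first example is a complete Scrapy spider that POSTs a JSON payload to a list endpoint, walks the records in the JSON response, fires one detail request per record, and yields an item from the second callback: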
~~~
# -*- coding: utf-8 -*-
import json

import scrapy
from scrapy.http import Request

from kunnanyuan.spider.spider.common import deal_date, transfrom, get_id
from ..items import XkItem


class XkSdl10822Spider(scrapy.Spider):
    name = 'XK-FJM-0102'
    url = 'http://222.76.243.118:8090/publicity/get_double_publicity_record_list'
    # Request headers. Postman, the browser devtools, or ApiPost can all capture
    # the request and export it straight to Python format in a few seconds.
    headers = {
        'Origin': 'http://222.76.243.118:8090',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
        # This header is required -- without it the server will not return the
        # JSON that the callbacks below parse.
        'Content-Type': 'application/json; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'http://222.76.243.118:8090/page/double_publicity/allow.html',
        'Connection': 'keep-alive',
    }

    # Build the initial request.
    def start_requests(self):
        # No pagination loop to limit each request: grab everything in one go,
        # mainly with later incremental crawls in mind.
        data = {
            'listSql': '',
            'linesPerPage': "6704",
            'currentPage': "1",
            'deptId': '',
            'searchKeyword': '',
            'tag': 'ALLOW'
        }
        yield scrapy.Request(url=self.url, body=json.dumps(data), method='POST',
                             headers=self.headers, callback=self.parse_list)

    # A paginated variant would split the data into fixed-size pages:
    # def parse_page(self, response):
    #     self.parse_list(response)
    #     if self.page == 1:
    #         ...  # rest omitted

    def parse_list(self, response):
        # The response is JSON; convert it into a Python dict.
        tr1 = json.loads(response.text)
        # Treat tr1 as one big dict and pull values out by key.
        if tr1.get("message") == "请求成功":  # "request successful"
            data = tr1.get('data')  # could also be written data = tr1['data'], and likewise below
            records = data.get('list')
            # Iterate over every record in the JSON payload.
            for i in records:
                if i['legalPersonDocNumber'] is not None:
                    identifier = i['legalPersonDocNumber']
                else:
                    identifier = i['naturalPersonDocNumber']
                if i['jgFr'] is not None:
                    organization = i['jgFr']
                else:
                    organization = i['jgZr']
                businessId = i['businessId']
                id = i['id']
                objectType = i['objectType']
                createdAt = deal_date(i['businessCreateDate'].split('000000')[0])
                source_url = ("http://222.76.243.118:8090/page/double_publicity/publicity_detail.html"
                              "?id={}&businessId={}&tag=ALLOW&objectType={}".format(str(id), str(businessId), str(objectType)))
                prPrincipal = i['objectName']
                data = {
                    "businessId": businessId,
                    "id": id,
                    'objectType': objectType,
                    'tag': "ALLOW",
                    'pictureMinHeight': '628',
                    'pictureMinWidth': '1200'
                }
                url = "http://222.76.243.118:8090/publicity/get_publicity_detail_picture"
                yield Request(url, callback=self.parse4, body=json.dumps(data), method='POST', headers=self.headers,
                              meta={"identifier": identifier, "organization": organization, "businessId": businessId,
                                    "createdAt": createdAt, "source_url": source_url, "prPrincipal": prPrincipal})

    # Parse the detail response and yield the item.
    def parse4(self, response):
        item = XkItem()
        item['identifier'] = response.meta["identifier"]
        item['organization'] = response.meta["organization"]
        # item['businessId'] = response.meta["businessId"]
        item['createdAt'] = response.meta["createdAt"]
        item['source_url'] = response.meta['source_url']
        item['prPrincipal'] = response.meta['prPrincipal']
        item['type'] = transfrom(str(item['organization']))
        item['fileType'] = "jpg"
        item['pid'] = get_id(str(item['identifier']))
        item['idMethod'] = '2'
        tr2 = json.loads(response.text)
        if tr2.get("message") == "请求成功":  # "request successful"
            data = tr2.get('data')
            path = data.get('path')
            item['images'] = "http://222.76.243.118:8090/" + path
            yield item
~~~
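The spider imports `XkItem` and three helpers (`deal_date`, `transfrom`, `get_id`) that the post doesn't show. A minimal sketch of what they might look like -- the field list matches what the spider assigns, but the helper bodies are assumptions (a date normalizer, an organization-to-type mapper, and a stable-id generator):
~~~
# items.py -- hypothetical reconstruction; field names taken from what the spider assigns.
import scrapy


class XkItem(scrapy.Item):
    identifier = scrapy.Field()
    organization = scrapy.Field()
    createdAt = scrapy.Field()
    source_url = scrapy.Field()
    prPrincipal = scrapy.Field()
    type = scrapy.Field()
    fileType = scrapy.Field()
    pid = scrapy.Field()
    idMethod = scrapy.Field()
    images = scrapy.Field()


# common.py -- hypothetical stand-ins for the unshown helpers.
import hashlib


def deal_date(s):
    # Assumption: input like '20190501' (after splitting off the '000000' tail).
    return '{}-{}-{}'.format(s[:4], s[4:6], s[6:8])


def transfrom(organization):
    # Assumption: maps the issuing organ to a type label; identity here.
    return organization


def get_id(identifier):
    # Assumption: derives a stable pid by hashing the document number.
    return hashlib.md5(identifier.encode('utf-8')).hexdigest()
~~~
With stand-ins like these in place, the spider runs as `scrapy crawl XK-FJM-0102`.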
Or, as a minimal version of the same idea, a spider that parses a JSON response directly in its parse callback:
~~~
# coding=utf-8
import json

import scrapy


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.test.com/test/get_data"
    ]

    def parse(self, response):
        # response.text decodes the body to unicode; older Scrapy versions
        # spelled this response.body_as_unicode().
        sites = json.loads(response.text)
        # print(sites['k'])
        numbers = sites['k'].split(',')
        print(numbers)
~~~
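On Scrapy 2.2 and newer, the `json.loads` step can be dropped altogether: `TextResponse` exposes a `json()` helper that deserializes the body for you. A minimal sketch, reusing the same hypothetical endpoint and `'k'` payload as above:
~~~
import scrapy


class JsonApiSpider(scrapy.Spider):
    # Hypothetical name, reusing the test endpoint from the example above.
    name = "json_api"
    start_urls = ["http://www.test.com/test/get_data"]

    def parse(self, response):
        # Scrapy >= 2.2: response.json() parses a JSON body directly,
        # replacing json.loads(response.text).
        sites = response.json()
        yield {"numbers": sites["k"].split(",")}
~~~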