Overview
Scrapy Advanced
Item
- An Item is a container for the scraped data. It is used much like a dict, but unlike a plain dict it adds an extra layer of protection that catches typos and references to undefined fields.
import scrapy


class Minimp4SpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # keys under which the scraped data is stored
    title = scrapy.Field()
    introduce = scrapy.Field()
    link = scrapy.Field()
    passwd = scrapy.Field()
Using the Item in the spider file
from ..items import Minimp4SpiderItem

# In the callback, instantiate the item, assign to it dict-style, and finally yield the item instance
item = Minimp4SpiderItem()
item["title"] = title
item["introduce"] = response.meta
item["link"] = link
item["passwd"] = passwd
yield item
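For orientation, here is a minimal sketch of how that snippet sits inside a spider. The spider name, the start URL, and all CSS selectors are placeholders rather than values from the original project:

import scrapy
from ..items import Minimp4SpiderItem


class Minimp4Spider(scrapy.Spider):
    # hypothetical spider; name and start_urls are placeholders
    name = "minimp4"
    start_urls = ["https://example.com/movies"]

    def parse(self, response):
        for movie in response.css("div.movie"):  # placeholder selector
            item = Minimp4SpiderItem()
            item["title"] = movie.css("h2::text").get()        # placeholder selector
            item["introduce"] = movie.css("p::text").get()     # placeholder selector
            item["link"] = movie.css("a::attr(href)").get()    # placeholder selector
            item["passwd"] = movie.css("span.pwd::text").get() # placeholder selector
            yield item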
Saving data with a Pipeline
- Scrapy provides the item pipeline mechanism for saving scraped data. A newly created Scrapy project already contains a pipelines.py file with a default pipeline class.
process_item:
    This method handles the data: filtering, cleaning, saving, and so on.
    item   - the data yielded by the spider file
    spider - the spider that produced the data
Notes (a minimal pipeline skeleton is sketched right after these notes):
    1. Do not remove "return item".
       Purpose: later pipelines in the chain still need to receive the item.
    2. A finished pipeline must be activated in the settings file.
       Activation:
       ITEM_PIPELINES = {
           'tencent.pipelines.TencentPipeline': 300,
           'tencent.pipelines.TencentPipeline2': 299,
       }
       tencent.pipelines.TencentPipeline : import path of the pipeline
       300: priority of the activated pipeline; lower numbers run first
       valid range: 0-1000
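As mentioned above, a minimal pipeline skeleton following these notes could look like the following; the filter on the title field is purely illustrative:

from scrapy.exceptions import DropItem


class TencentPipeline(object):
    def process_item(self, item, spider):
        # handle the data here: filter, clean, save, etc.
        if not item.get("title"):
            # drop incomplete items (illustrative filter only)
            raise DropItem("missing title")
        # keep the return so later pipelines still receive the item
        return item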
Settings configuration
To use a pipeline, it must be activated via ITEM_PIPELINES in settings.py.
# activate the pipeline
ITEM_PIPELINES = {
    'minimp4spider.pipelines.Minimp4SpiderPipeline': 300,
}
# MySQL connection info
MYSQL_HOST = '127.0.0.1'
MYSQL_KEY = '1234'
MYSQL_USER = 'root'
MYSQL_DB = 'scrapy'
# directory where downloaded images are stored
IMAGES_STORE = r'F:\python0828\2019-01-19-Scrapy进阶\Code\WuZhuLiangYuan\Pictures'
Saving to a MySQL database
import pymysql


class Minimp4SpiderPipeline(object):
    # save items to MySQL
    def __init__(self, host, user, passwd, db):
        self.host = host
        self.user = user
        self.passwd = passwd
        self.db = db

    @classmethod
    def from_crawler(cls, crawler):
        # read the connection info from settings.py
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            user=crawler.settings.get('MYSQL_USER'),
            passwd=crawler.settings.get('MYSQL_KEY'),
            db=crawler.settings.get('MYSQL_DB'),
        )

    def open_spider(self, spider):
        # open the database connection when the spider starts
        self.mysql = pymysql.connect(host=self.host, user=self.user, passwd=self.passwd,
                                     port=3306, db=self.db, charset="utf8")

    def process_item(self, item, spider):
        data = dict(item)
        cursor = self.mysql.cursor()
        # a parameterized query avoids quoting problems in the scraped text
        sql = "insert into minimp4(title, introduce, link, passwd) values (%s, %s, %s, %s)"
        cursor.execute(sql, (data['title'], data['introduce'], data['link'], data['passwd']))
        self.mysql.commit()
        cursor.close()
        return item

    def close_spider(self, spider):
        # close the connection when the spider finishes
        self.mysql.close()
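The pipeline above assumes a minimp4 table already exists in the scrapy database. A one-off setup script such as the following could create it; only the column names come from the insert statement above, the column types are assumptions:

import pymysql

# one-off setup: create the table the pipeline inserts into
# (column names follow the pipeline; the types are assumptions)
conn = pymysql.connect(host="127.0.0.1", user="root", passwd="1234",
                       port=3306, db="scrapy", charset="utf8")
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS minimp4 (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            introduce TEXT,
            link VARCHAR(500),
            passwd VARCHAR(50)
        ) CHARACTER SET utf8
    """)
conn.commit()
conn.close()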
Saving to Redis
import json

import redis


class FirstspiderPipeline2(object):
    # save items to Redis
    def __init__(self):
        self.myredis = redis.StrictRedis(host='127.0.0.1', port=6379, db=2, password="your_redis_password")

    def process_item(self, item, spider):
        data = dict(item)
        # Redis cannot store a dict directly, so serialize it to JSON first
        self.myredis.lpush("liangyuan", json.dumps(data))
        return item
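Because the items are stored as JSON strings, reading them back is simply the reverse of the serialization. A quick check from a Python shell might look like this (the key name matches the one used above; the password is a placeholder):

import json

import redis

r = redis.StrictRedis(host="127.0.0.1", port=6379, db=2,
                      password="your_redis_password", decode_responses=True)
# pop the most recently pushed item and deserialize it
raw = r.lpop("liangyuan")
if raw is not None:
    data = json.loads(raw)
    print(data["title"], data["link"])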
Downloading and saving images
- [Image files]
- First subclass ImagesPipeline
- Yield a request for each image URL
- Activate the pipeline in the settings file
- Specify the image storage path (IMAGES_STORE) in the settings file
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class ImagePipeline(ImagesPipeline):
    # download the images
    def get_media_requests(self, item, info):
        data = dict(item)
        # get the image link
        avatar = data['avatar']
        yield scrapy.Request(avatar)

    def file_path(self, request, response=None, info=None):
        # derive the file name from the URL
        imgname = request.url.split('/')[-1]
        if imgname.endswith(('.jpg', '.png')):
            name = imgname
        else:
            name = imgname + '.jpg'
        # path of the saved image, relative to IMAGES_STORE
        return './%s' % name
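If the stored path should end up on the item itself, ImagesPipeline also provides an item_completed hook. A minimal sketch follows; the avatar_path field is hypothetical and would have to be added to the item definition:

from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class ImagePathPipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        # results is a list of (success, file_info) tuples
        paths = [file_info["path"] for ok, file_info in results if ok]
        if not paths:
            raise DropItem("image download failed")
        # 'avatar_path' is a hypothetical field, not part of the original item
        item["avatar_path"] = paths[0]
        return item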
Downloader Middlewares
'''
Downloader middleware:
    [Every request passes through the downloader middleware]
    [Every response passes through the downloader middleware]
    Requests:
        1. engine --> request --> [downloader middleware] --> request --> downloader
        Things typically set on the request here:
            1. randomly rotating the request headers
            2. swapping the proxy IP
            3. using a cookie pool
        applied just before the request reaches the downloader
    Responses:
        2. downloader --> response --> [downloader middleware] --> response --> engine
    [process_request]
        every request passes through this method
        purpose:
            to process the request; depending on what it returns:
            1. None      --> the request continues on to the downloader
            2. Response  --> the download is skipped; goes through process_response --> engine --> spider file
            3. Request   --> the returned request is rescheduled by the engine
            4. exception --> process_exception handles the error
    [process_response]
        every response passes through this method
'''
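The return-value rules above can be illustrated with a bare-bones middleware skeleton; it does nothing useful and is only meant to show the hook signatures:

class ExampleDownloaderMiddleware(object):
    # illustration of the hook signatures and return-value rules only

    def process_request(self, request, spider):
        # return None       -> the request continues to the downloader
        # return a Response -> the download is skipped, goes straight to process_response
        # return a Request  -> the returned request is rescheduled by the engine
        # raise an exception -> process_exception is called
        return None

    def process_response(self, request, response, spider):
        # every response passes through here on its way back to the engine
        return response

    def process_exception(self, request, exception, spider):
        # return None to let other middlewares / the default handling proceed
        return None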
'''
Custom middleware:
    1. A middleware you define must be activated.
    2. Do not set the activation order arbitrarily.
    3. The order value should sit between the middlewares it is meant to run between (see the defaults below).
    4. Preferably set the built-in middleware being replaced to None.
DOWNLOADER_MIDDLEWARES_BASE = {
# Engine side
'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300,
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 400,
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 500,
'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550,
'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': 560,
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590,
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
'scrapy.downloadermiddlewares.stats.DownloaderStats': 850,
'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
# Downloader side
}
'''
'''
[Resuming a crawl]
After Scrapy is stopped, the next run can continue from where the previous one left off:
scrapy crawl spiderName -s JOBDIR=DIR
'''
# configured in the middlewares.py file
from fake_useragent import UserAgent
import requests


# rotate the request headers randomly
class UserAgentMiddleware(object):
    def __init__(self):
        self.UA = UserAgent()

    def process_request(self, request, spider):
        request.headers["User-Agent"] = self.UA.random


# rotate the proxy IP randomly
# a local proxy-pool service is used here
class ProxyMiddleware(object):
    def __init__(self):
        self.pool_url = "http://127.0.0.1:8080/get/"

    def process_request(self, request, spider):
        # fetch a fresh proxy from the pool for every request
        ip = requests.get(self.pool_url).text.strip()
        # Scrapy reads the proxy from request.meta, not from a header
        request.meta["proxy"] = "https://{}".format(ip)
Activate DOWNLOADER_MIDDLEWARES in settings.py
DOWNLOADER_MIDDLEWARES = {
    # 'minimp4spider.middlewares.Minimp4SpiderDownloaderMiddleware': 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
    'minimp4spider.middlewares.UserAgentMiddleware': 500,
    # do not pick arbitrary values; follow the defaults in DOWNLOADER_MIDDLEWARES_BASE above
    'minimp4spider.middlewares.ProxyMiddleware': 750,
}