概述
import requests as reqs
import threading
import time
# Pool of User-Agent strings to make requests look like a regular browser.
# BUG FIX: the original wrote three 'User-Agent' keys into ONE dict literal;
# duplicate dict keys overwrite each other, so only the last agent survived.
USER_AGENTS = [
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)',
]
# Default request headers. USER_AGENTS[-1] preserves the value the original
# dict effectively held (last duplicate key won).
hds = {'User-Agent': USER_AGENTS[-1]}
from scrapy import Selector
def html_parse(html_str, xpath_expr):
    """Evaluate an XPath expression against an HTML string.

    Args:
        html_str: raw HTML text to parse.
        xpath_expr: XPath expression to evaluate.

    Returns:
        list[str]: every match extracted by scrapy's Selector.
    """
    # BUG FIX: the original body referenced `htmltext` and `xpathstr`,
    # neither of which exists -- the parameters are html_str / xpath_expr,
    # so every call raised NameError.
    sel = Selector(text=html_str)
    return sel.xpath(xpath_expr).extract()
class myThread_detail(threading.Thread):
    """Worker thread: fetch one book's detail page and append its rating
    count to a shared result list."""

    # One lock shared by ALL worker threads.
    # BUG FIX: the original created a fresh Lock() inside run(), so every
    # thread held its own private lock -- that provided no mutual exclusion
    # at all over the shared people_nums list.
    _lock = threading.Lock()

    def __init__(self, url, people_nums):
        super(myThread_detail, self).__init__()
        self.url = url                    # detail-page URL to fetch
        self.people_nums = people_nums    # shared list, mutated in run()

    def run(self):
        data = reqs.get(self.url, headers=hds).content.decode()
        # Append the number of raters; append " " instead when the page has
        # no rating element, so the list stays aligned with the book list.
        with self._lock:
            try:
                self.people_nums.append(
                    html_parse(data, '//a[@class="rating_people"]//span/text()')[0])
            except IndexError:
                self.people_nums.append(" ")
def book_spider(book_tag):
    """Crawl the douban listing pages for one tag.

    Walks 10 listing pages (15 books each) for *book_tag* and, for every
    page, spawns one thread per book to fetch the rating count from the
    book's detail page.

    Yields:
        tuple[list, list, list]: (booknames, descs, people_nums) per page.
    """
    # BUG FIX: removed the unused `page_num` counter from the original.
    # Test URL: http://www.douban.com/tag/小说/book?start=0
    base_url = r'http://www.douban.com/tag/' + book_tag + '/book?start='
    for start in range(0, 15 * 10, 15):
        page_url = base_url + str(start)
        print(page_url)
        data = reqs.get(page_url, headers=hds).content.decode()
        # --- the three scraped fields; extend here to capture more ---
        # book titles
        booknames = html_parse(data, '//a[@class="title"]/text()')
        # short descriptions
        descs = html_parse(data, '//div[@class="desc"]/text()')
        # links into each book's detail page
        detail_links = html_parse(data, '''//a[@class="title"]/@href''')
        # Rating counts come from the detail pages, one thread per book.
        people_nums = []
        workers = []
        for link in detail_links:
            t = myThread_detail(link, people_nums)
            t.start()
            time.sleep(0.1)  # throttle launches to reduce the risk of an IP ban
            workers.append(t)
        for t in workers:
            t.join()
        yield (booknames, descs, people_nums)
# Standalone helper: fetch the rating count from a single detail page.
# e.g. https://book.douban.com/subject/6082808/?from=tag_all for testing.
def get_people_num(url):
    """Download *url* and return the list of rating-count strings found."""
    page_text = reqs.get(url, headers=hds).content.decode()
    return html_parse(page_text, '//a[@class="rating_people"]//span/text()')
# Crawl several tags (takes a list of tag names).
def do_spider(book_tag_lists):
    """Run book_spider over every tag in *book_tag_lists*, printing each
    scraped page tuple as it arrives.  Always returns 0 -- the data is only
    printed, never collected (the `return book_lists` idea was left as a
    comment by the original author).
    """
    for tag in book_tag_lists:
        for page in book_spider(tag):
            # Dump the (booknames, descs, people_nums) tuple for this page.
            print(page)
    return 0
#把数据写到电子表格中。
def print_book_lists_excel(book_lists, book_tag_lists):
    """Write the scraped data to a spreadsheet: one sheet per tag, each sheet
    holding that tag's book list.  Intentionally unimplemented by the author
    ("write it yourself if interested, or ask me!").
    """
    pass
if __name__ == '__main__':
    # Other tag lists the author experimented with -- swap one in to crawl it:
    # ['心理','判断与决策','算法','数据结构','经济','历史']
    # ['传记','哲学','编程','创业','理财','社会学','佛教']
    # ['思想','科技','科学','web','股票','爱情','两性']
    # ['计算机','机器学习','linux','android','数据库','互联网']
    # ['数学']
    # ['摄影','设计','音乐','旅行','教育','成长','情感','育儿','健康','养生']
    # ['商业','理财','管理']
    # ['名著']
    # ['科普','经典','生活','心灵','文学']
    # ['科幻','思维','金融']
    # ['小说','时间管理','投资','文化','宗教']
    book_tag_lists = ['小说']
    book_lists = do_spider(book_tag_lists)
    print_book_lists_excel(book_lists, book_tag_lists)
###########################################################################
# 运行一次就好,如果你运行第二次,立马会被封ip ,当然你可以换一个ip;继续运行。 ##
参考地址:
https://github.com/lanbing510/DouBanSpider/blob/master/doubanSpider.py
最后
以上就是完美大地为你收集整理的 python 多线程爬取豆瓣网的书单的全部内容,希望文章能够帮你解决 python 多线程爬取豆瓣网的书单所遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
发表评论 取消回复