Overview
What this chapter covers: crawling all of the novels on a website and saving them locally.
Target site: www.cuiweijuxs.com
Analyzing the pages shows that the job takes four steps: from the home page, enter a section and open its paginated list; open every link on each listing page; open each work's page; open each chapter's content.
The implementation steps are therefore as follows (a pagination sketch follows the list):
1. Enter the section page, www.cuiweijuxs.com/jingpinxiaoshuo/, find the maximum page number (122 at the time of writing), and loop over every page; each page's URL has the form href="http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html", where ? stands for the page number.
2. Find all the links on the current page and open each one in turn; the anchors can be located under div id="newscontent", then div class="l", and each link's text is a work's title.
3. Open a work's page and find its chapter list (links such as "第一章", i.e. Chapter 1).
4. Open each chapter link and read out the content.
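A minimal sketch of the pagination from step 1, assuming the ? placeholder is simply replaced with the page number:

two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
max_page = 122  # the value observed on the site at the time of writing

for page in range(1, max_page + 1):
    # http://www.cuiweijuxs.com/jingpinxiaoshuo/5_1.html ... 5_122.html
    print(two_page_url.replace("?", str(page)))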
Step 1: create the class, initialize its parameters, and factor out a helper that fetches a page and returns it parsed by BeautifulSoup.
# -*- coding: UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import os

'''
Scrape web pages with BeautifulSoup
'''

class Capture():

    def __init__(self):
        self.index_page_url = 'http://www.cuiweijuxs.com/'
        self.one_page_url = 'http://www.cuiweijuxs.com/jingpinxiaoshuo/'
        self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
        self.folder_path = '小说/'  # output folder ("novels")
        self.head = {}
        # Set the User-Agent header
        self.head[
            'User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'

    # Fetch a URL and return it parsed by BeautifulSoup
    def getSoup(self, query_url):
        req = request.Request(query_url, headers=self.head)
        webpage = request.urlopen(req)
        html = webpage.read()
        # soup = BeautifulSoup(html, 'html.parser')
        soup = BeautifulSoup(html, 'html5lib')
        return soup
    # end getSoup
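A quick sanity check of getSoup, as a sketch; it assumes the site is reachable and that the html5lib parser is installed (pip install html5lib):

capture = Capture()
soup = capture.getSoup(capture.one_page_url)
print(soup.title.string)  # the section page's <title>, if the request succeeded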
Step 2: enter the section page, find the maximum page number, and loop over every listing page.
# Read the update list
def readPageOne(self):
    soup = self.getSoup(self.one_page_url)
    last = soup.find("a", "last")
    itemSize = int(last.string)
    page_url = str(self.two_page_url)
    for item in range(itemSize):
        print(item)
        new_page_url = page_url.replace("?", str(item + 1))
        self.readPageTwo(new_page_url)
# end readPageOne
Use the getSoup method to fetch the parsed HTML page, use find to locate the <a> tag whose class is "last", and take its text as the maximum page number. Then loop over the pages, starting from 1.
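The find("a", "last") call passes the class name as the second positional argument. A minimal illustration against hand-written markup (an assumption standing in for the live pager, not a copy of it):

from bs4 import BeautifulSoup

html = '<div class="pagelink"><a href="5_1.html">1</a><a class="last" href="5_122.html">122</a></div>'
soup = BeautifulSoup(html, 'html5lib')
last = soup.find("a", "last")  # second positional argument matches the class attribute
print(int(last.string))        # -> 122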
Step 3: read the links on a single listing page.
# Read the links on one listing page
def readPageTwo(self, page_url):
    soup = self.getSoup(page_url)
    con_div = soup.find('div', {'id': 'newscontent'}).find('div', {'class': 'l'})
    a_list = con_div.find_all('span', {'class': 's2'})[0].find_all('a')
    print(a_list)
    for a_href in a_list:
        href = a_href.get('href')
        folder_name = a_href.get_text()
        print('a_href', href, '---folder_name', folder_name)
        path = self.folder_path + folder_name
        self.createFolder(path)
        self.readPageThree(href, path)
    # end for
# end readPageTwo
Locate the div whose id is newscontent, then the div with class "l" under it, then the span elements with class "s2" (the code takes the first one), and collect the <a> tags beneath it. Loop over those links, opening each one and using the link text (a_href.get_text()) as the folder name.
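To see that chain of lookups in isolation, here is the same code run against a hand-written stand-in for the listing markup (the structure is an assumption based on the selectors above, not a copy of the live page):

from bs4 import BeautifulSoup

html = '''
<div id="newscontent">
  <div class="l">
    <span class="s2"><a href="/4_1/">第一本书</a></span>
    <span class="s2"><a href="/4_2/">第二本书</a></span>
  </div>
</div>'''
soup = BeautifulSoup(html, 'html5lib')
con_div = soup.find('div', {'id': 'newscontent'}).find('div', {'class': 'l'})
spans = con_div.find_all('span', {'class': 's2'})
# [0] only looks inside the first matching span; if every row carries its
# own span.s2 (as in this stand-in), iterate over all of them instead:
for span in spans:
    for a in span.find_all('a'):
        print(a.get('href'), a.get_text())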
Step 4: open a work's page, loop over its chapter links, and build a file name for each chapter.
# Open a work's page
def readPageThree(self, page_url, path):
    soup = self.getSoup(page_url)
    print('readPageThree--', page_url)
    a_list = soup.find('div', {'id': 'list'}).find_all('a')
    idx = 0
    for a_href in a_list:
        idx = idx + 1
        href = self.index_page_url + a_href.get('href')
        txt_name = path + '/' + str(idx) + '_' + a_href.get_text() + '.txt'
        print('a_href', href, '---path', txt_name)
        isExists = os.path.exists(txt_name)
        if isExists:
            print(txt_name, 'already exists')  # skip chapters downloaded on an earlier run
        else:
            self.readPageFour(href, txt_name)
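One fragile spot: href = self.index_page_url + a_href.get('href') assumes every chapter href is relative to the site root. If the site ever emits absolute links, urllib.parse.urljoin handles both cases; a small sketch (the example paths are made up):

from urllib.parse import urljoin

base = 'http://www.cuiweijuxs.com/'
print(urljoin(base, '4_4508/652310.html'))  # relative href -> joined with the base
print(urljoin(base, 'http://www.cuiweijuxs.com/4_4508/652310.html'))  # absolute href -> unchanged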
Step 5: open a chapter link, read everything under the div with id=content, and write it to the file.
# Read one chapter's content and write it out
def readPageFour(self, page_url, path):
    soup = self.getSoup(page_url)
    con_div = soup.find('div', {'id': 'content'})
    content = con_div.get_text().replace('<br/>', '\n').replace('&nbsp;', ' ')
    self.writeTxt(path, content)
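For reference, here is what the extraction does on a hand-written stand-in for a chapter page (the markup is an assumption). Note that get_text() already drops tags and decodes &nbsp; into the character '\xa0', so replacing '\xa0' is usually the more reliable cleanup:

from bs4 import BeautifulSoup

html = '<div id="content">&nbsp;&nbsp;第一行<br/>&nbsp;&nbsp;第二行</div>'
soup = BeautifulSoup(html, 'html5lib')
text = soup.find('div', {'id': 'content'}).get_text()
print(repr(text))                 # '\xa0\xa0第一行\xa0\xa0第二行' -- no tags, no entities
print(text.replace('\xa0', ' '))  # '  第一行  第二行'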
The complete implementation follows:
# -*- coding: UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import os

'''
Scrape web pages with BeautifulSoup
'''

class Capture():

    def __init__(self):
        self.index_page_url = 'http://www.cuiweijuxs.com/'
        self.one_page_url = 'http://www.cuiweijuxs.com/jingpinxiaoshuo/'
        self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
        self.folder_path = '小说/'
        self.head = {}
        # Set the User-Agent header
        self.head[
            'User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'

    # Fetch a URL and return it parsed by BeautifulSoup
    def getSoup(self, query_url):
        req = request.Request(query_url, headers=self.head)
        webpage = request.urlopen(req)
        html = webpage.read()
        # soup = BeautifulSoup(html, 'html.parser')
        soup = BeautifulSoup(html, 'html5lib')
        return soup
    # end getSoup

    # Read the update list
    def readPageOne(self):
        soup = self.getSoup(self.one_page_url)
        last = soup.find("a", "last")
        itemSize = int(last.string)
        page_url = str(self.two_page_url)
        for item in range(itemSize):
            print(item)
            new_page_url = page_url.replace("?", str(item + 1))
            self.readPageTwo(new_page_url)
    # end readPageOne

    # Read the links on one listing page
    def readPageTwo(self, page_url):
        soup = self.getSoup(page_url)
        con_div = soup.find('div', {'id': 'newscontent'}).find('div', {'class': 'l'})
        a_list = con_div.find_all('span', {'class': 's2'})[0].find_all('a')
        print(a_list)
        for a_href in a_list:
            href = a_href.get('href')
            folder_name = a_href.get_text()
            print('a_href', href, '---folder_name', folder_name)
            path = self.folder_path + folder_name
            self.createFolder(path)
            self.readPageThree(href, path)
        # end for
    # end readPageTwo

    # Open a work's page and walk its chapter list
    def readPageThree(self, page_url, path):
        soup = self.getSoup(page_url)
        print('readPageThree--', page_url)
        a_list = soup.find('div', {'id': 'list'}).find_all('a')
        idx = 0
        for a_href in a_list:
            idx = idx + 1
            href = self.index_page_url + a_href.get('href')
            txt_name = path + '/' + str(idx) + '_' + a_href.get_text() + '.txt'
            print('a_href', href, '---path', txt_name)
            isExists = os.path.exists(txt_name)
            if isExists:
                print(txt_name, 'already exists')
            else:
                self.readPageFour(href, txt_name)

    # Read one chapter's content and write it out
    def readPageFour(self, page_url, path):
        soup = self.getSoup(page_url)
        con_div = soup.find('div', {'id': 'content'})
        content = con_div.get_text().replace('<br/>', '\n').replace('&nbsp;', ' ')
        self.writeTxt(path, content)

    # Unused variant kept from the original listing
    def readPageHtml(self, page_url, path):
        soup = self.getSoup(page_url)
        con_div = soup.find('div', {'id': 'content'})
        content = con_div.get_text().replace('<br/>', '\n').replace('&nbsp;', ' ')

    def createFolder(self, path):
        path = path.strip()
        # Strip any trailing backslash
        path = path.rstrip("\\")
        isExists = os.path.exists(path)
        # Create the folder if it does not exist yet
        if not isExists:
            os.makedirs(path)
            print(path + ' created')
        else:
            print(path + ' directory already exists')
    # end createFolder

    def writeTxt(self, file_name, content):
        isExists = os.path.exists(file_name)
        if isExists:
            print(file_name, 'already exists')
        else:
            file_object = open(file_name, 'w', encoding='utf-8')
            file_object.write(content)
            file_object.close()

    def run(self):
        try:
            self.readPageOne()
        except BaseException as error:
            print('error--', error)


Capture().run()
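To try it, save the script (for example as capture_novel.py, a name chosen here for illustration) and run it with Python 3. Crawling every page back-to-back is hard on a small site, so a short pause between requests is a common courtesy; a sketch of an optional drop-in variant of getSoup:

import time
from urllib import request
from bs4 import BeautifulSoup

def polite_get_soup(query_url, headers, delay=0.5):
    # Same fetch-and-parse steps as Capture.getSoup, plus a fixed delay
    time.sleep(delay)
    req = request.Request(query_url, headers=headers)
    html = request.urlopen(req).read()
    return BeautifulSoup(html, 'html5lib')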