Overview
What this chapter covers: crawling all of the novels on a website and saving them locally.
Target site: www.cuiweijuxs.com
Analyzing the pages shows that the job takes four steps: from the home page, enter a section and open its paginated list; open every link on each listing page; open each work's page; open each chapter's content.
The implementation steps are therefore as follows (a pagination sketch follows the list):
1. Enter the section page, www.cuiweijuxs.com/jingpinxiaoshuo/, find the maximum page number (122 at the time of writing), and loop over every page; each page's URL has the form href="http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html", where ? stands for the page number.
2. Find all the links on the current page and open each one in turn; the anchors can be located under div id="newscontent", then div class="l", and each link's text is a work's title.
3. Open a work's page and find its chapter list (links such as "第一章", i.e. Chapter 1).
4. Open each chapter link and read out the content.
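A minimal sketch of the pagination from step 1, assuming the ? placeholder is simply replaced with the page number:

two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
max_page = 122  # the value observed on the site at the time of writing

for page in range(1, max_page + 1):
    # http://www.cuiweijuxs.com/jingpinxiaoshuo/5_1.html ... 5_122.html
    print(two_page_url.replace("?", str(page)))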
Step 1: create the class, initialize its parameters, and factor out a helper that fetches a page and returns it parsed by BeautifulSoup.
# -*- coding: UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import os

'''
Scrape web pages with BeautifulSoup
'''

class Capture():

    def __init__(self):
        self.index_page_url = 'http://www.cuiweijuxs.com/'
        self.one_page_url = 'http://www.cuiweijuxs.com/jingpinxiaoshuo/'
        self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
        self.folder_path = '小说/'  # output folder ("novels")
        self.head = {}
        # Set the User-Agent header
        self.head[
            'User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'

    # Fetch a URL and return it parsed by BeautifulSoup
    def getSoup(self, query_url):
        req = request.Request(query_url, headers=self.head)
        webpage = request.urlopen(req)
        html = webpage.read()
        # soup = BeautifulSoup(html, 'html.parser')
        soup = BeautifulSoup(html, 'html5lib')
        return soup
    # end getSoup
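A quick sanity check of getSoup, as a sketch; it assumes the site is reachable and that the html5lib parser is installed (pip install html5lib):

capture = Capture()
soup = capture.getSoup(capture.one_page_url)
print(soup.title.string)  # the section page's <title>, if the request succeeded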
Step 2: enter the section page, find the maximum page number, and loop over every listing page.
# Read the update list
def readPageOne(self):
    soup = self.getSoup(self.one_page_url)
    last = soup.find("a", "last")
    itemSize = int(last.string)
    page_url = str(self.two_page_url)
    for item in range(itemSize):
        print(item)
        new_page_url = page_url.replace("?", str(item + 1))
        self.readPageTwo(new_page_url)
# end readPageOne
Use the getSoup method to fetch the parsed HTML page, use find to locate the <a> tag whose class is "last", and take its text as the maximum page number. Then loop over the pages, starting from 1.
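The find("a", "last") call passes the class name as the second positional argument. A minimal illustration against hand-written markup (an assumption standing in for the live pager, not a copy of it):

from bs4 import BeautifulSoup

html = '<div class="pagelink"><a href="5_1.html">1</a><a class="last" href="5_122.html">122</a></div>'
soup = BeautifulSoup(html, 'html5lib')
last = soup.find("a", "last")  # second positional argument matches the class attribute
print(int(last.string))        # -> 122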
Step 3: read the links on a single listing page.
# Read the links on one listing page
def readPageTwo(self, page_url):
    soup = self.getSoup(page_url)
    con_div = soup.find('div', {'id': 'newscontent'}).find('div', {'class': 'l'})
    a_list = con_div.find_all('span', {'class': 's2'})[0].find_all('a')
    print(a_list)
    for a_href in a_list:
        href = a_href.get('href')
        folder_name = a_href.get_text()
        print('a_href', href, '---folder_name', folder_name)
        path = self.folder_path + folder_name
        self.createFolder(path)
        self.readPageThree(href, path)
    # end for
# end readPageTwo
Locate the div whose id is newscontent, then the div with class "l" under it, then the span elements with class "s2" (the code takes the first one), and collect the <a> tags beneath it. Loop over those links, opening each one and using the link text (a_href.get_text()) as the folder name.
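To see that chain of lookups in isolation, here is the same code run against a hand-written stand-in for the listing markup (the structure is an assumption based on the selectors above, not a copy of the live page):

from bs4 import BeautifulSoup

html = '''
<div id="newscontent">
  <div class="l">
    <span class="s2"><a href="/4_1/">第一本书</a></span>
    <span class="s2"><a href="/4_2/">第二本书</a></span>
  </div>
</div>'''
soup = BeautifulSoup(html, 'html5lib')
con_div = soup.find('div', {'id': 'newscontent'}).find('div', {'class': 'l'})
spans = con_div.find_all('span', {'class': 's2'})
# [0] only looks inside the first matching span; if every row carries its
# own span.s2 (as in this stand-in), iterate over all of them instead:
for span in spans:
    for a in span.find_all('a'):
        print(a.get('href'), a.get_text())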
Step 4: open a work's page, loop over its chapter links, and build a file name for each chapter.
# Open a work's page
def readPageThree(self, page_url, path):
    soup = self.getSoup(page_url)
    print('readPageThree--', page_url)
    a_list = soup.find('div', {'id': 'list'}).find_all('a')
    idx = 0
    for a_href in a_list:
        idx = idx + 1
        href = self.index_page_url + a_href.get('href')
        txt_name = path + '/' + str(idx) + '_' + a_href.get_text() + '.txt'
        print('a_href', href, '---path', txt_name)
        isExists = os.path.exists(txt_name)
        if isExists:
            print(txt_name, 'already exists')  # skip chapters downloaded on an earlier run
        else:
            self.readPageFour(href, txt_name)
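One fragile spot: href = self.index_page_url + a_href.get('href') assumes every chapter href is relative to the site root. If the site ever emits absolute links, urllib.parse.urljoin handles both cases; a small sketch (the example paths are made up):

from urllib.parse import urljoin

base = 'http://www.cuiweijuxs.com/'
print(urljoin(base, '4_4508/652310.html'))  # relative href -> joined with the base
print(urljoin(base, 'http://www.cuiweijuxs.com/4_4508/652310.html'))  # absolute href -> unchanged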
Step 5: open a chapter link, read everything under the div with id=content, and write it to the file.
# Read one chapter's content and write it out
def readPageFour(self, page_url, path):
    soup = self.getSoup(page_url)
    con_div = soup.find('div', {'id': 'content'})
    content = con_div.get_text().replace('<br/>', '\n').replace('&nbsp;', ' ')
    self.writeTxt(path, content)
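For reference, here is what the extraction does on a hand-written stand-in for a chapter page (the markup is an assumption). Note that get_text() already drops tags and decodes &nbsp; into the character '\xa0', so replacing '\xa0' is usually the more reliable cleanup:

from bs4 import BeautifulSoup

html = '<div id="content">&nbsp;&nbsp;第一行<br/>&nbsp;&nbsp;第二行</div>'
soup = BeautifulSoup(html, 'html5lib')
text = soup.find('div', {'id': 'content'}).get_text()
print(repr(text))                 # '\xa0\xa0第一行\xa0\xa0第二行' -- no tags, no entities
print(text.replace('\xa0', ' '))  # '  第一行  第二行'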
The complete implementation follows:
# -*- coding: UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import os

'''
Scrape web pages with BeautifulSoup
'''

class Capture():

    def __init__(self):
        self.index_page_url = 'http://www.cuiweijuxs.com/'
        self.one_page_url = 'http://www.cuiweijuxs.com/jingpinxiaoshuo/'
        self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
        self.folder_path = '小说/'
        self.head = {}
        # Set the User-Agent header
        self.head[
            'User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'

    # Fetch a URL and return it parsed by BeautifulSoup
    def getSoup(self, query_url):
        req = request.Request(query_url, headers=self.head)
        webpage = request.urlopen(req)
        html = webpage.read()
        # soup = BeautifulSoup(html, 'html.parser')
        soup = BeautifulSoup(html, 'html5lib')
        return soup
    # end getSoup

    # Read the update list
    def readPageOne(self):
        soup = self.getSoup(self.one_page_url)
        last = soup.find("a", "last")
        itemSize = int(last.string)
        page_url = str(self.two_page_url)
        for item in range(itemSize):
            print(item)
            new_page_url = page_url.replace("?", str(item + 1))
            self.readPageTwo(new_page_url)
    # end readPageOne

    # Read the links on one listing page
    def readPageTwo(self, page_url):
        soup = self.getSoup(page_url)
        con_div = soup.find('div', {'id': 'newscontent'}).find('div', {'class': 'l'})
        a_list = con_div.find_all('span', {'class': 's2'})[0].find_all('a')
        print(a_list)
        for a_href in a_list:
            href = a_href.get('href')
            folder_name = a_href.get_text()
            print('a_href', href, '---folder_name', folder_name)
            path = self.folder_path + folder_name
            self.createFolder(path)
            self.readPageThree(href, path)
        # end for
    # end readPageTwo

    # Open a work's page and walk its chapter list
    def readPageThree(self, page_url, path):
        soup = self.getSoup(page_url)
        print('readPageThree--', page_url)
        a_list = soup.find('div', {'id': 'list'}).find_all('a')
        idx = 0
        for a_href in a_list:
            idx = idx + 1
            href = self.index_page_url + a_href.get('href')
            txt_name = path + '/' + str(idx) + '_' + a_href.get_text() + '.txt'
            print('a_href', href, '---path', txt_name)
            isExists = os.path.exists(txt_name)
            if isExists:
                print(txt_name, 'already exists')
            else:
                self.readPageFour(href, txt_name)

    # Read one chapter's content and write it out
    def readPageFour(self, page_url, path):
        soup = self.getSoup(page_url)
        con_div = soup.find('div', {'id': 'content'})
        content = con_div.get_text().replace('<br/>', '\n').replace('&nbsp;', ' ')
        self.writeTxt(path, content)

    # Unused variant kept from the original listing
    def readPageHtml(self, page_url, path):
        soup = self.getSoup(page_url)
        con_div = soup.find('div', {'id': 'content'})
        content = con_div.get_text().replace('<br/>', '\n').replace('&nbsp;', ' ')

    def createFolder(self, path):
        path = path.strip()
        # Strip any trailing backslash
        path = path.rstrip("\\")
        isExists = os.path.exists(path)
        # Create the folder if it does not exist yet
        if not isExists:
            os.makedirs(path)
            print(path + ' created')
        else:
            print(path + ' directory already exists')
    # end createFolder

    def writeTxt(self, file_name, content):
        isExists = os.path.exists(file_name)
        if isExists:
            print(file_name, 'already exists')
        else:
            file_object = open(file_name, 'w', encoding='utf-8')
            file_object.write(content)
            file_object.close()

    def run(self):
        try:
            self.readPageOne()
        except BaseException as error:
            print('error--', error)


Capture().run()
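To try it, save the script (for example as capture_novel.py, a name chosen here for illustration) and run it with Python 3. Crawling every page back-to-back is hard on a small site, so a short pause between requests is a common courtesy; a sketch of an optional drop-in variant of getSoup:

import time
from urllib import request
from bs4 import BeautifulSoup

def polite_get_soup(query_url, headers, delay=0.5):
    # Same fetch-and-parse steps as Capture.getSoup, plus a fixed delay
    time.sleep(delay)
    req = request.Request(query_url, headers=headers)
    html = request.urlopen(req).read()
    return BeautifulSoup(html, 'html5lib')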