# url -> page -> content (analysis)
import requests
from lxml import html
import math  # used to work out how many result pages to fetch
from book import Book
import xlwt  # used to write the scraped data to an Excel file
'''
To do (see the styled sketch after savevalues_excel below):
1 - add a header row (book title, price, author)
2 - adjust the width of each column
3 - adjust the height of each row
4 - set font styles for the header and/or the table body
'''
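# The Book class comes from book.py, which is not included in this post. A
# minimal sketch of what it is assumed to look like (the field names
# bname/bprice/bauthor are taken from their use in savevalues_excel below):
#
# class Book:
#     def __init__(self, bname, bprice, bauthor):
#         self.bname = bname      # title
#         self.bprice = bprice    # current price
#         self.bauthor = bauthor  # author
#     def __str__(self):
#         return "%s | %s | %s" % (self.bname, self.bprice, self.bauthor)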
# 1 - download the page source
def download(url):
    # request the given URL and collect the response
    response = requests.get(url)
    # grab the page text (HTML source)
    content = response.text  # str
    # parse the text into an lxml HTMLElement tree
    html_content = html.fromstring(content)
    return html_content
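# Example use (a sketch only; it assumes the same search URL format that
# getvaluespages builds below, and that the page markup is unchanged):
# page = download("http://search.dangdang.com/?key=python&act=input&page_index=1")
# print(len(page.xpath('//ul[@class="bigimg"]/li')))  # up to 60 result items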
# 2 - pull the useful fields out of the source
#     count = total number of items wanted, num = running item counter
def getvalues(html_content, count, num):
    books_page = []  # books collected from this page
    # up to 60 items per search page
    # alternative selector: html_content.cssselect('ul.bigimg>li')
    list_book = html_content.xpath(
        '//ul[@class="bigimg"]/li')
    # walk the result entries and pull out the details of each one
    for index, li in enumerate(list_book):
        # the current book lives inside this <li>
        # e.g. //*[@id="p23214590"]/p[1]/a
        bookname = li.xpath('p[@class="name"]/a/@title')
        newprice = li.xpath('p[contains(@class,"price")]/span[@class="search_now_price"]/text()')
        if len(newprice) == 0:
            # e-books keep their price in a different block
            newprice = li.xpath('div[@class="ebook_buy"]/p[@class="price e_price"]/span/text()')
        bookauthor = li.xpath('p[@class="search_book_author"]/span[1]/a[1]/@title')
        if len(bookauthor) == 0:
            bookauthor = li.xpath('p[@class="search_book_author"]/span[1]/text()')
        if len(bookauthor) == 0:
            bookauthor.append("author not listed")
        book = Book(bookname[0], newprice[0], bookauthor[0])
        print(index, book)
        books_page.append(book)
        # one more item collected; advance the running counter
        num += 1
        if num > count:
            # the last page may hold fewer than 60 wanted books
            return books_page
    return books_page  # a full page of 60 books
# 3 - download several pages of results
#     bname = search keyword, count = total number of items wanted
def getvaluespages(bname, count):
    books = []  # books collected across all pages
    # work out how many pages to fetch from the total item count (60 per page)
    page = math.ceil(count / 60)
    # loop over the pages and scrape each one
    for index in range(1, page + 1):
        print("---- downloading page %d of book listings" % index)
        # item number at which this page starts (used to decide when to stop)
        num = 1 + 60 * (index - 1)
        url = "http://search.dangdang.com/?key=%s&act=input&page_index=%d" % (bname, index)
        # download and parse the current page
        html_code = download(url)
        # extract the book details from it
        books_page = getvalues(html_code, count, num)
        books.extend(books_page)
    return books  # list
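# For example, the call in __main__ below, getvaluespages("python", 130),
# fetches math.ceil(130 / 60) = 3 pages: pages 1 and 2 contribute up to 60
# books each, and page 3 stops once the running counter passes 130, so at
# most 130 Book objects come back in total.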
# 4 - save to Excel
def savevalues_excel(books):
    # 4-0 the xlwt module (xl = excel, wt = write) is imported at the top
    # 4-1 create a new workbook
    work = xlwt.Workbook(encoding='utf-8')
    # 4-2 add a worksheet; cell_overwrite_ok=True lets cells be rewritten
    sheet = work.add_sheet('sheet1', cell_overwrite_ok=True)
    # 4-3 write the rows: (row, column, value)
    for index, book in enumerate(books):
        sheet.write(index, 0, book.bname)
        sheet.write(index, 1, book.bprice)
        sheet.write(index, 2, book.bauthor)
    # 4-4 write the workbook out to disk
    work.save("当当书籍信息.xls")
    # report whether anything was actually saved
    return bool(books)
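# Below is a sketch of the four to-do items listed in the docstring at the top
# (header row, column widths, row heights, font styles). The name
# savevalues_excel_styled is hypothetical and the original script never calls
# it; it only illustrates the relevant xlwt calls.
def savevalues_excel_styled(books):
    work = xlwt.Workbook(encoding='utf-8')
    sheet = work.add_sheet('sheet1', cell_overwrite_ok=True)
    # 1 - header row: 书名 (title), 售价 (price), 作者 (author)
    header_style = xlwt.easyxf('font: bold on, height 240; align: horiz center')
    for col, title in enumerate(["书名", "售价", "作者"]):
        sheet.write(0, col, title, header_style)
    # 2 - column widths; the unit is 1/256 of the width of one character
    sheet.col(0).width = 256 * 50   # the title column needs the most room
    sheet.col(1).width = 256 * 15
    sheet.col(2).width = 256 * 30
    # 3 - row heights; the unit is 1/20 of a point
    sheet.row(0).height_mismatch = True
    sheet.row(0).height = 20 * 25
    # 4 - a plain font for the table body
    body_style = xlwt.easyxf('font: height 200')
    for index, book in enumerate(books, start=1):  # data starts under the header
        sheet.write(index, 0, book.bname, body_style)
        sheet.write(index, 1, book.bprice, body_style)
        sheet.write(index, 2, book.bauthor, body_style)
        sheet.row(index).height_mismatch = True
        sheet.row(index).height = 20 * 18
    work.save("当当书籍信息.xls")
    return bool(books)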
if __name__ == "__main__":
    books = getvaluespages("python", 130)
    flag = savevalues_excel(books)
    if flag:
        print("download succeeded")
    else:
        print("download failed")
    # for index, book in enumerate(books):
    #     print(index + 1, book)
#https://search.yhd.com/c0-0/mbname-b/a-s1-v4-p2-price-d0-f0b-m1-rt0-pid-mid0-color-size-kpython/
#https://search.yhd.com/c0-0/mbname-b/a-s1-v4-p4-price-d0-f0b-m1-rt0-pid-mid0-color-size-kpython/
#https://search.yhd.com/c0-0/mbname-b/a-s1-v4-p6-price-d0-f0b-m1-rt0-pid-mid0-color-size-kpython/