Python3中利用Beautiful Soup爬取豆瓣图片

200 阅读 0 评论 132 点赞

我是靠谱客的博主寒冷滑板，这篇文章主要介绍Python3中利用Beautiful Soup爬取豆瓣图片，现在分享给大家，希望可以做个参考。

#!/usr/bin/python
# -*- coding: UTF-8 -*-
# 设置编码格式为utf-8，为了可以打印出中文字符
import os
import sys
# urllib is a package: its submodules must be imported explicitly, because a
# bare "import urllib" does not guarantee that urllib.request is available.
import urllib
import urllib.parse
import urllib.request

# BeautifulSoup (third-party, must be installed) parses the downloaded HTML.
from bs4 import BeautifulSoup

# Start page of the crawl.
base_url = 'https://www.dbmeinv.com/dbgroup/show.htm?cid=4'
# Directory the images are written to; the trailing separator is kept so that
# plain string concatenation with a file name still yields a valid path.
# NOTE(review): the published literal ('..知乎美女\') was not valid Python -- its
# path separators appear to have been lost. Presumably a folder inside the
# parent directory was intended; confirm.
save_path = os.path.join('..', '知乎美女', '')
# Link text of the pager anchor that leads to the following page.
NEXT_PAGE_LABEL = '下一页 →'
# Seconds to wait on one HTTP request so a stalled server cannot hang the crawl.
REQUEST_TIMEOUT = 30
# Characters rejected in Windows file names (covers both path separators).
INVALID_FILENAME_CHARS = '\\/:*?"<>|'


def sanitize_filename(name, fallback='unnamed'):
    """Return *name* made safe to use as a single file-name component.

    Path separators, characters invalid on Windows and control characters are
    replaced by ``_``; leading/trailing spaces and dots are stripped. The
    name comes from scraped ``alt`` text, so it must not be able to escape the
    target directory. *fallback* is returned when nothing usable remains.
    """
    cleaned = ''.join(
        '_' if ch in INVALID_FILENAME_CHARS or ord(ch) < 32 else ch
        for ch in name
    ).strip(' .')
    return cleaned or fallback


def image_path(directory, name):
    """Return the ``.jpg`` path inside *directory* for an image called *name*."""
    return os.path.join(directory, sanitize_filename(name) + '.jpg')


def fetch_bytes(url):
    """Download *url* and return the response body as bytes.

    The response is closed before returning. Raises ``OSError`` (for example
    ``urllib.error.URLError``) on network failure and ``ValueError`` for a
    malformed URL.
    """
    with urllib.request.urlopen(url, timeout=REQUEST_TIMEOUT) as response:
        return response.read()


def extract_images(soup, page_url):
    """Yield ``(name, url)`` for every thumbnail found in *soup*.

    Looks at each ``<li>`` inside the element with class ``thumbnails`` and
    takes the first ``<img>`` inside its ``img_single`` element: ``alt`` is the
    name, ``src`` the address (resolved against *page_url*). Entries missing
    any of these parts are skipped instead of raising.
    """
    container = soup.find(class_='thumbnails')
    if container is None:
        return
    for item in container.find_all('li'):
        holder = item.find(class_='img_single')
        image = holder.find('img') if holder is not None else None
        if image is None:
            continue
        name = image.get('alt')
        src = image.get('src')
        if not name or not src:
            continue
        yield name, urllib.parse.urljoin(page_url, src)


def next_page_url(soup, page_url):
    """Return the absolute URL of the following page, or ``None`` on the last one.

    The pager element (class ``next next_page``) counts only when it contains
    an anchor whose text is exactly ``NEXT_PAGE_LABEL`` and which has an href.
    """
    pager = soup.find(class_='next next_page')
    anchor = pager.find('a') if pager is not None else None
    if anchor is None or anchor.text != NEXT_PAGE_LABEL:
        return None
    href = anchor.get('href')
    if not href:
        return None
    return urllib.parse.urljoin(page_url, href)


def save_image(url, name, directory):
    """Download *url* into *directory* as ``<name>.jpg``.

    Returns ``True`` on success. A download or file-system failure is reported
    on stdout and ``False`` is returned, so one bad image does not abort the
    whole crawl.
    """
    target = image_path(directory, name)
    try:
        os.makedirs(directory, exist_ok=True)
        # Download before opening the target so that a failed request does
        # not leave an empty file behind.
        data = fetch_bytes(url)
        with open(target, 'wb') as out:
            out.write(data)
    except (OSError, ValueError) as err:
        print('保存失败，已跳过：' + str(err))
        return False
    print('保存成功')
    return True


def crawl(start_url, directory):
    """Save every thumbnail from *start_url* and all following pages.

    Pages are followed through the pager link until there is none left or a
    page would be visited a second time (guards against a pager loop).
    """
    visited = set()
    page_url = start_url
    while page_url is not None and page_url not in visited:
        visited.add(page_url)
        html = fetch_bytes(page_url)
        if page_url == start_url:
            # The raw bytes are dumped for the first page only, as before.
            print(html)
        soup = BeautifulSoup(html, 'html.parser')
        print(soup)
        for pic_name, pic_url in extract_images(soup, page_url):
            print("正在爬取的信息如下：图片名称：" + pic_name + "，图片地址：" + pic_url)
            save_image(pic_url, pic_name, directory)
        page_url = next_page_url(soup, page_url)


if __name__ == '__main__':
    crawl(base_url, save_path)