Beautifulsoup-基础知识

89 阅读 0 评论 59 点赞

我是靠谱客的博主美丽皮皮虾，最近开发中收集的这篇文章主要介绍Beautifulsoup-基础知识，觉得挺不错的，现在分享给大家，希望可以做个参考。

概述

soup = BeautifulSoup(html_doc,features='lxml')
tag1 = soup.find(name='a')
#找到第一个a标签，返回一个Tag对象（未找到时返回None）
tag2 = soup.find_all(name='a')
#找到所有a标签，返回一个列表（ResultSet），列表中所有元素为Tag对象
tag3 = soup.select_one('#link2')
#找到id=link2的第一个标签（select返回列表，select_one返回单个标签）
name = tag3.name
#获取标签名
attrs = tag3.attrs
#获取属性，返回字典类型
tag3.attrs = {'href': 'www.baidu.com'}
#修改或添加标签属性
del tag3.attrs['href']
#删除标签属性
#判断是标签对象还是文本：
from bs4.element import Tag
tags = soup.find('body').children
for tag in tags:
    if type(tag) == Tag:
        print(tag)
    else:
        print('文本。。。')
children
#body中所有儿子标签
descendants
#body中所有子子孙孙标签
body = soup.find('body')
v = body.descendants
clear
#将标签的所有子标签全部清空（保留body标签名）
soup.find('body').clear()
print(soup)
decompose
#将标签及其所有子标签全部删除（包括body标签本身）
soup.find('body').decompose()
print(soup)
extract
#将标签及其所有子标签全部删除（包括body标签本身），并返回被删除的标签（类似pop）
find_all
v = soup.find_all(name=['a','div'])
#找到所有a标签和div标签
v = soup.find_all(id=['link1','link2'])
#找到所有id=link1或id=link2的标签
import re
rep = re.compile('^p')
v = soup.find_all(name=rep)
#找以p开头的所有标签
rep = re.compile('^sister.*')
# .* 匹配除换行符以外的任意字符，重复零次或多次
v = soup.find_all(class_=rep)
#找class属性值以sister开头的所有标签
rep = re.compile('http://www.baidu.com/static/.*')
v = soup.find_all(href=rep)
#一般用于匹配页码
get
#获取标签属性
tag = soup.find('a')
v = tag.get('id')
#获取a标签中的id键值
has_attr
#判断是否含有某属性
tag = soup.find('a')
v = tag.has_attr('id')
#判断a标签是否含有id属性
get_text
#获取标签内部文本内容
tag = soup.find('a')
v = tag.get_text()
#获取a标签内部文本内容
index
#标签在某标签中的索引位置
tag = soup.find('body')
v = tag.index(tag.find('div'))
#找div标签在body中的索引位置
is_empty_element
#检查是否是空标签或自闭合标签
判断如下标签：br、hr、input、img、meta、spacer、link、frame、base
当前标签的关联标签
soup.next --->soup.find_next(...)
soup.next_element --->soup.find_next(...)
soup.next_elements
soup.next_sibling
soup.next_siblings
tag.previous
tag.previous_element
tag.previous_elements
tag.previous_sibling
tag.previous_siblings
tag.parent
tag.parents
select,select_one
#CSS选择器
append
#追加标签到最后
insert
#插入标签到指定位置
wrap
#包裹
unwrap
#解包裹