介绍:
最近在学Python爬虫,在这里对数据解析模块bs4做个学习笔记。
用途:
bs4(Beautiful Soup 4)用于解析HTML和XML文档并从中提取数据(HTML并不是XML的一种,bs4通过不同的解析器分别支持二者)
bs4 官方文档地址:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
学习笔记:
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc,'html.parser') #创建一个BeautifulSoup对象,并显式指定Python内置的html.parser解析器;若不指定,bs4会自动选用当前环境中已安装的最佳解析器并发出警告,不同环境下的解析结果可能不同
print(soup.prettify()) #美化输出
print(soup.get_text()) #输出文档中的全部文本内容(不含标签本身;原文档中的换行符\n会被保留)
print('')
print(type(soup.title))
print(dir(soup.title))
print(soup.title) #获取html标题
<title>The Dormouse's story</title>
print(soup.title.text) #获取html标题内容
"The Dormouse's story"
print(soup.a) #获取a标签(第一个)
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print(soup.a.attrs) #获取第一个a标签的所有属性,组成一个字典
{'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'}
print(soup.a.attrs['href']) #获取第一个a标签的href属性
'http://example.com/elsie'
print(soup.a.has_attr('class')) #判断class属性是否存在
True
print(soup.p) #获取p标签(第一个)
<p class="title"><b>The Dormouse's story</b></p>
print(soup.p.children) #获取第一个p标签下的所有子节点
<list_iterator object at 0x7fe8185261d0>
print(list(soup.p.children))
[<b>The Dormouse's story</b>]
print(list(soup.p.children)[0])
<b>The Dormouse's story</b>
print(list(soup.p.children)[0].text)
"The Dormouse's story"
print(soup.find_all('a')) #获取所有的a标签
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
for a in soup.find_all('a'): #遍历所有的a标签
    print(a.attrs['href'])
print(soup.find(id='link3')) #获取id=link3的标签
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
print('#'*150)
#支持CSS选择器
#查找类名为story的节点
print(soup.select('.story'))
print('')
print(soup.select('.story a'))
print('')
#查找id=link1的节点
print(soup.select('#link1'))