Beautiful Soup is a Python library for extracting data from HTML and XML files. Working with the parser of your choice, it provides idiomatic ways of navigating, searching, and modifying the parse tree.
html_doc = """ <html> <head> <title>The Dormouse's story</title> </head> <body> <p class="title"> <b>The Dormouse's story</b> <span>eng</span> <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a> </p> <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a> and they lived at the bottom of a well. <p class="story">...</p> """标签选择器 选择元素
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')

print(soup.title)  # returns a Tag object
# <title>The Dormouse's story</title>

print(soup.head)
# <head>
# <title>The Dormouse's story</title>
# </head>

print(soup.p)  # the first <p> in the document
# <p class="title">
# <b>The Dormouse's story</b>
# <span>eng</span>
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# </p>

print(soup.a)  # the first <a> in the document
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
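The examples in this post use the lxml parser; if lxml is not installed, Python's built-in html.parser works as a drop-in alternative. A minimal sketch:

from bs4 import BeautifulSoup

# 'html.parser' ships with the standard library, so no extra install is needed;
# its handling of malformed markup can differ slightly from lxml.
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.title)
# <title>The Dormouse's story</title>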
Getting the tag name
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
print(soup.title.name)
# title
Getting tag attributes
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
print(soup.a['href'])
# http://example.com/elsie
print(soup.a.attrs['href'])
# http://example.com/elsie
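Two details worth noting here: multi-valued attributes such as class come back as a list rather than a string, and .get() can be used when an attribute might be missing. A small sketch against the same html_doc:

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
print(soup.a['class'])  # multi-valued attributes like class are returned as a list
# ['sister']
print(soup.a.get('nonexistent'))  # .get() returns None instead of raising KeyError
# None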
Getting the text content of a tag
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
print(soup.title.string)
# The Dormouse's story
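Note that .string only works cleanly when a tag has a single string child; for tags with several children it returns None, and get_text() is the usual fallback. A quick sketch of the contrast:

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
print(soup.p.string)      # None: <p class="title"> has several children, so .string is ambiguous
print(soup.b.string)      # The Dormouse's story: <b> has exactly one string child
print(soup.p.get_text())  # get_text() concatenates all of the text inside the tag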
Nested selection
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
print(soup.head.title.string)  # each selection returns a Tag object, so you can keep selecting on it
# The Dormouse's story
Child nodes
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
print(soup.p.contents)  # returns a list of all direct child nodes
# ['\n', <b>The Dormouse's story</b>, '\n', <span>eng</span>, '\n', <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, '\n']

print(soup.p.children)  # returns an iterator over the direct child nodes
# <list_iterator object at 0x037F8FE8>
for i, child in enumerate(soup.p.children):
    print(i, child)
# 0
# 1 <b>The Dormouse's story</b>
# 2
# 3 <span>eng</span>
# 4
# 5 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# 6
Descendant nodes
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
print(soup.p.contents)  # .contents still lists only the direct children
# ['\n', <b>The Dormouse's story</b>, '\n', <span>eng</span>, '\n', <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, '\n']

print(soup.p.descendants)  # returns a generator over all descendants, including nested text nodes
# <generator object Tag.descendants at 0x0130B808>
for i, child in enumerate(soup.p.descendants):
    print(i, child)
# 0
# 1 <b>The Dormouse's story</b>
# 2 The Dormouse's story
# 3
# 4 <span>eng</span>
# 5 eng
# 6
# 7 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# 8 Elsie
# 9
Sibling nodes
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
print(soup.a.next_sibling)    # the next sibling node (often a whitespace string)
print(soup.a.next_siblings)   # a generator over all following siblings
print(list(enumerate(soup.a.next_siblings)))

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
print(soup.a.previous_sibling)    # the previous sibling node
print(soup.a.previous_siblings)   # a generator over all preceding siblings
print(list(enumerate(soup.a.previous_siblings)))
Standard selectors: find_all(name, attrs, recursive, string, **kwargs)

name
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
print(soup.find_all("a"))  # returns a list of Tag objects for every <a> tag
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.find_all('a')[0])
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
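The name argument also accepts a list or a compiled regular expression, which lets one call match several tag names at once. A minimal sketch against the same html_doc:

import re
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
print(soup.find_all(["b", "span"]))      # match any tag name in the list
# [<b>The Dormouse's story</b>, <span>eng</span>]
print(soup.find_all(re.compile("^b$")))  # match tag names against a regular expression
# [<b>The Dormouse's story</b>]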
attrs
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
print(soup.find_all(attrs={'id': 'link2'}))  # pass a dict to search for tags with a given attribute
print(soup.find_all(id='link2'))             # id can be passed directly as a keyword; for class use class_ (class is a Python keyword)
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
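As a concrete illustration of the class_ shorthand mentioned above, a small sketch using the same html_doc:

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
# class is a reserved word in Python, so Beautiful Soup uses class_ instead
print(soup.find_all("a", class_="sister"))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]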
string
import re
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
print(soup.find_all("a", string="Elsie"))  # returns tags whose string matches the string argument
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
print(soup.find_all(string=["Tillie", "Elsie", "Lacie"]))  # a list of strings
# ['Elsie', 'Lacie', 'Tillie']
print(soup.find_all(string=re.compile("Dormouse")))  # a regular expression
# ["The Dormouse's story", "The Dormouse's story"]

# find(name, attrs, recursive, string, **kwargs) returns only the first match
# Related methods:
# find_parents() and find_parent()
# find_next_siblings() and find_next_sibling()
# find_previous_siblings() and find_previous_sibling()
# find_all_next() and find_next()
# find_all_previous() and find_previous()
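For comparison, a short sketch of find() and one of the sibling variants listed above, again against the same html_doc:

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
print(soup.find("a"))  # find() returns the first matching Tag (or None), not a list
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
link2 = soup.find(id="link2")
print(link2.find_next_sibling("a"))  # the next <a> among the following siblings
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>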
CSS selectors
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
print(soup.select("title"))  # pass a CSS selector string; a list is returned
# [<title>The Dormouse's story</title>]
print(soup.select("p:nth-of-type(2)"))  # the second <p> in the document
# [<p class="story">...</p>]
Searching level by level through tags
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
print(soup.select("p a"))  # <a> tags anywhere inside a <p>
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
Finding the direct child tags of a tag
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
print(soup.select("p > a"))  # only <a> tags that are direct children of a <p>
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
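CSS class and id selectors work the same way. A brief sketch, not shown in the original examples, using the same html_doc:

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
print(soup.select(".sister"))  # select by CSS class
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.select("#link1"))   # select by id
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]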
Getting attributes
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
print(soup.select('a[href]'))  # attribute selector: every <a> that has an href attribute
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
for a in soup.select('a'):
    print(a['id'])  # a.attrs['id'] is an equivalent way to write this
# link1
# link2
# link3
Getting text content

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
for a in soup.select('a'):
    print(a.get_text())
# Elsie
# Lacie
# Tillie

Beautiful Soup official documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/#id37