Beautiful Soup

Beautiful Soup 是一个可以从HTML或XML文件中提取数据的Python库。它能够通过你喜欢的转换器实现惯用的文档导航、查找、修改文档的方式。

 

html_doc = """
<html>
  <head>
    <title>The Dormouse's story</title>
  </head>
  <body>
    <p class="title">
      <b>The Dormouse's story</b> 
        <span>eng</span>
      <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
    </p>
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
    and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
    and they lived at the bottom of a well.
    

    <p class="story">...</p>
"""
  标签选择器 选择元素
from bs4 import BeautifulSoup

# Parse the sample document with the lxml parser.
soup = BeautifulSoup(html_doc, 'lxml')

# Accessing a tag name as an attribute returns the first matching Tag object.
# It is printed explicitly: a bare expression shows nothing when run as a script.
print(soup.title)
# <title>The Dormouse's story</title>
print(soup.head)
# <head>
# <title>The Dormouse's story</title>
# </head>
print(soup.p)
# Expected output (indentation whitespace omitted); the first <p> also
# contains the <span> element:
# <p class="title">
# <b>The Dormouse's story</b>
# <span>eng</span>
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# </p>
print(soup.a)
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

 

获取标签名称
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
# .name gives the name of the selected tag.
print(soup.title.name)
# title

 

获取标签属性
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
# An attribute can be read by indexing the tag like a dict ...
print(soup.a['href'])
# http://example.com/elsie
# ... or through the tag's .attrs mapping; both print the same value.
print(soup.a.attrs['href'])
# http://example.com/elsie

 

获取标签文本内容
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
# .string is the text held by the tag.
print(soup.title.string)
# The Dormouse's story

 

嵌套选择
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
# soup.head is itself a Tag object, so further selection can be chained on it.
print(soup.head.title.string)
# The Dormouse's story

 

子节点
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')

# .contents is a list of the tag's direct children (text nodes included).
# Whitespace-only text nodes are shown as '\n' in the expected output.
print(soup.p.contents)
# ['\n', <b>The Dormouse's story</b>, '\n', <span>eng</span>, '\n', <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, '\n']

# .children iterates over the same direct children instead of returning a list.
# The exact repr (and address) depends on the bs4 version.
print(soup.p.children)
# <list_iterator object at 0x037F8FE8>

for i, child in enumerate(soup.p.children):
    print(i, child)
# Even indexes are the whitespace text nodes between the tags:
# 0
# 1 <b>The Dormouse's story</b>
# 2
# 3 <span>eng</span>
# 4
# 5 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# 6

 

子孙节点
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
# NOTE: .contents lists only the DIRECT children of <p>, not all descendants;
# it is printed here for comparison with .descendants below.
print(soup.p.contents)
#['\n', <b>The Dormouse's story</b>, '\n', <span>eng</span>, '\n', <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, '\n']

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
# .descendants is a generator over every nested node: each child tag is
# followed by its own contents (e.g. <b> and then the text inside it).
print(soup.p.descendants)
#<generator object Tag.descendants at 0x0130B808>
for i, child in enumerate(soup.p.descendants):
    print(i,child)

# Expected output (the empty entries are whitespace text nodes):
#0

#1 <b>The Dormouse's story</b>
#2 The Dormouse's story
#3

#4 <span>eng</span>
#5 eng
#6

#7 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
#8 Elsie
#9

 

兄弟节点
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
# The node that directly follows the first <a> under the same parent.
print(soup.a.next_sibling)
# All following siblings, as an iterable; printing it directly shows only the
# iterator object, so it is materialized with list() on the next line.
print(soup.a.next_siblings)
print(list(enumerate(soup.a.next_siblings)))

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
# The node that directly precedes the first <a> under the same parent.
print(soup.a.previous_sibling)
# All preceding siblings, as an iterable; materialized with list() below.
print(soup.a.previous_siblings)
print(list(enumerate(soup.a.previous_siblings)))

 

标准选择器
find_all( name , attrs , recursive , string , **kwargs )

name
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
# find_all("a") returns a list with the Tag object of every <a> in the document.
print(soup.find_all("a"))
#[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
# The result is a list, so a single match can be picked by index.
print(soup.find_all('a')[0])
#<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

 

attrs
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
# Returns every <a> tag.
print(soup.find_all("a"))
# attrs= takes a dict of attribute names and values used to search for tags.
print(soup.find_all(attrs = {'id':'link2'}))
# id and class_ can be passed as keyword shorthand; class needs the trailing
# underscore because `class` is a reserved word in Python.
print(soup.find_all(id = 'link2'))
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

 

string
import re  # required by the regular-expression search below

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')

# Combined with a tag name, string= keeps only the tags whose text equals the value.
print(soup.find_all("a", string="Elsie"))
# [<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>]

# A list matches any of the given strings; without a tag name the matching
# text nodes themselves are returned.
print(soup.find_all(string=["Tillie", "Elsie", "Lacie"]))
# ['Elsie', 'Lacie', 'Tillie']

# A compiled regular expression matches every string containing the pattern.
print(soup.find_all(string=re.compile("Dormouse")))
# ["The Dormouse's story", "The Dormouse's story"]

#find( name , attrs , recursive , string , **kwargs )#返回第一个
#find_parents() 和 find_parent()
#find_next_siblings() 和 find_next_sibling()
#find_previous_siblings() 和 find_previous_sibling()
#find_all_next() 和 find_next()
#find_all_previous() 和 find_previous()

 

CSS选择器
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')

# select() takes a CSS selector string and always returns a list.
print(soup.select("title"))
# [<title>The Dormouse's story</title>]

# html_doc has only two sibling <p> elements, so the "story" paragraph is the
# second <p> of its parent; nth-of-type(3) would return an empty list.
print(soup.select("p:nth-of-type(2)"))
# [<p class="story">...</p>]

 

通过tag标签逐层查找
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
# "p a" is the descendant combinator: <a> tags located anywhere inside a <p>.
# In html_doc only the first link sits inside a <p>.
print(soup.select("p a"))
#[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

 

找到某个tag标签下的直接子标签
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
# "p > a" is the child combinator: only <a> tags that are direct children of a <p>.
print(soup.select("p > a"))
#[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

 

获取属性
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')

# [href] selects every <a> that carries an href attribute.
print(soup.select('a[href]'))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

for a in soup.select('a'):
    print(a['id'])
    print(a.attrs['id'])  # equivalent spelling through the .attrs mapping

# Each id is printed twice, once per access style:
# link1
# link1
# link2
# link2
# link3
# link3

 

获取文本内容
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
for a in soup.select('a'):
    print(a.get_text())

#Elsie
#Lacie
#Tillie

BeautifulSoup官方文档:https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/#id37

 

上一篇:Locust 快速开始:一个简单的登录、页面浏览脚本【三】


下一篇:你要偷偷的学Python,然后惊呆所有人(第八天)