[Python] BeautifulSoup模块用法演示

from bs4 import BeautifulSoup, element

# Sample HTML document used by every lookup in this demo.
# It contains one <title>, three <p> tags and three <a class="sister"> links;
# the first link (id="link1") holds only an HTML comment instead of text.
html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
        <p class="story">Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
            and they lived at the bottom of a well.
        </p>
        <p class="story">...</p>
    </body>
</html>
"""

# Parse the sample document with the standard-library backed parser.
soup = BeautifulSoup(html, "html.parser")
# Tip: print(soup.prettify()) shows the parsed tree with indentation.

# --- Three equivalent ways to reach the text of the first <p> -------------
first_p = soup.find('p')
print(f'1a: {first_p.text}')
print(f'1b: {soup.p.text}')      # soup.p is shorthand for soup.find('p')
print(f'1c: {soup.p.string}')    # .string works here: <p> has a single <b> child

# --- Tag objects, tag names and attribute dictionaries --------------------
print(f'2: {soup.title}')
print(f'3: {soup.head}')
print(f'4: {soup.p}')
print(f'5: {soup.head.name}')
print(f'6: {soup.head.text.strip()}')
print(f'7: {soup.p.attrs}')

# --- Modifying and reading an attribute -----------------------------------
# "class" is parsed as a list (['title']); assigning a plain string replaces
# it, so both lookups below print the bare string.
soup.p['class'] = "newClass"
print(f"8a: {soup.p['class']}")
print(f"8b: {soup.p.get('class')}")

print(f'9a: {soup.a.attrs["href"]}')
print(f'9b: {soup.a["href"]}')

# --- Searching by CSS class ------------------------------------------------
# class_ (trailing underscore) is used because "class" is a Python keyword.
story = soup.find(class_='story')
print(f'10a: {story}')
print(f'10b: {story.text}')  # unlike .string, .text joins all descendant strings

# --- Searching for several tag names at once ------------------------------
print('-' * 60)
tags = soup.find_all(['a', 'p'])
print(len(tags))
for tag in tags:
    print(tag)

# --- Searching by string content -------------------------------------------
print('+' * 60)
# `string=` replaces the deprecated `text=` keyword (Beautiful Soup 4.4.0+).
# Only "Lacie" matches: the comment node's content is " Elsie " (with spaces).
print(soup.find_all(string=["Elsie", "Lacie"]))

# --- Detecting an HTML comment ---------------------------------------------
print('!' * 60)
link1_matches = soup.find_all(id='link1')
print(len(link1_matches))
print(link1_matches)
print(f'11a: {link1_matches[0].text}')     # empty: .text skips comments
print(f'11b: {link1_matches[0].string}')   # the Comment node's content
# isinstance is the idiomatic type check (Comment subclasses NavigableString).
if isinstance(link1_matches[0].string, element.Comment):
    print('这是注释')

# --- Attribute filters and CSS selectors -----------------------------------
print(soup.find_all(attrs={"href": "http://example.com/lacie"}))
print(soup.select("head > title"))   # direct child combinator
print(soup.select('p #link1'))       # descendant combinator: #link1 inside a <p>

输出结果

"D:\Program Files\Python\python.exe" C:/Users/issuser/PycharmProjects/pythonProject/10/soup001.py
1a: The Dormouse's story
1b: The Dormouse's story
1c: The Dormouse's story
2: <title>The Dormouse's story</title>
3: <head>
<title>The Dormouse's story</title>
</head>
4: <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
5: head
6: The Dormouse's story
7: {'class': ['title'], 'name': 'dromouse'}
8a: newClass
8b: newClass
9a: http://example.com/elsie
9b: http://example.com/elsie
10a: <p class="story">Once upon a time there were three little sisters; and their names were
        <a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>,
        <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
        <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
        and they lived at the bottom of a well.</p>
10b: Once upon a time there were three little sisters; and their names were
        ,
        Lacie and
        Tillie;
        and they lived at the bottom of a well.
------------------------------------------------------------
6
<p class="newClass" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
        <a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>,
        <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
        <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
        and they lived at the bottom of a well.</p>
<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
<p class="story">...</p>
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
['Lacie']
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1
[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]
11a: 
11b:  Elsie 
这是注释
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
[<title>The Dormouse's story</title>]
[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]

上一篇:学习笔记:BeautifulSoup的3种遍历方式


下一篇:一图搞懂Web应用的单点登录