from bs4 import BeautifulSoup, element
# HTML text used for the demo
html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
</body>
</html>
"""
soup = BeautifulSoup(html, "html.parser")
# print(soup.prettify())  # uncomment to dump the parsed tree, re-indented

# --- 1: first <p> tag, reached via find() and via attribute access ---
p1 = soup.find('p')
print(f'1a: {p1.text}')
print(f'1b: {soup.p.text}')
print(f'1c: {soup.p.string}')

# --- 2-6: whole tags, tag name, and stripped text ---
print(f'2: {soup.title}')
print(f'3: {soup.head}')
print(f'4: {soup.p}')
print(f'5: {soup.head.name}')
print(f'6: {soup.head.text.strip()}')

# --- 7-9: reading and overwriting attributes ---
print(f'7: {soup.p.attrs}')
# Overwrite the class attribute of the first <p>; this mutates the parsed
# tree, so every later lookup of that tag sees "newClass".
soup.p['class'] = "newClass"
print(f"8a: {soup.p['class']}")
print(f"8b: {soup.p.get('class')}")
print(f'9a: {soup.a.attrs["href"]}')
print(f'9b: {soup.a["href"]}')

# --- 10: search by CSS class (class_ because `class` is a Python keyword) ---
e1 = soup.find(class_='story')
print(f'10a: {e1}')
print(f'10b: {e1.text}')  # not the same as .string
print('-' * 60)

# --- find_all with a list of tag names: matches every <a> and every <p> ---
ap_list = soup.find_all(['a', 'p'])
print(len(ap_list))
for ap in ap_list:
    print(ap)
print('+' * 60)

# --- find_all on text nodes ---
# `string=` is the current name of the keyword formerly spelled `text=`.
# NOTE(review): the match looks exact, so the comment node " Elsie " (with
# surrounding spaces) is presumably why "Elsie" yields no hit here.
print(soup.find_all(string=["Elsie", "Lacie"]))
print('!' * 60)

# --- 11: an <a> whose only child is an HTML comment ---
li = soup.find_all(id='link1')
print(len(li))
print(li)
print(f'11a: {li[0].text}')
print(f'11b: {li[0].string}')
# isinstance() is the idiomatic type check and also accepts Comment subclasses.
if isinstance(li[0].string, element.Comment):
    print('这是注释')

# --- search by arbitrary attribute, then by CSS selectors ---
print(soup.find_all(attrs={"href": "http://example.com/lacie"}))
print(soup.select("head > title"))  # direct-child combinator
print(soup.select('p #link1'))  # descendant combinator
输出结果
"D:\Program Files\Python\python.exe" C:/Users/issuser/PycharmProjects/pythonProject/10/soup001.py
1a: The Dormouse's story
1b: The Dormouse's story
1c: The Dormouse's story
2: <title>The Dormouse's story</title>
3: <head>
<title>The Dormouse's story</title>
</head>
4: <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
5: head
6: The Dormouse's story
7: {'class': ['title'], 'name': 'dromouse'}
8a: newClass
8b: newClass
9a: http://example.com/elsie
9b: http://example.com/elsie
10a: <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
10b: Once upon a time there were three little sisters; and their names were
,
Lacie and
Tillie;
and they lived at the bottom of a well.
------------------------------------------------------------
6
<p class="newClass" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
<p class="story">...</p>
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
['Lacie']
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1
[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]
11a:
11b: Elsie
这是注释
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
[<title>The Dormouse's story</title>]
[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]