# HTML text extraction demo: parse a small inline HTML document with
# BeautifulSoup and print pieces of it selected by tag, id and class.
from bs4 import BeautifulSoup

# The trailing backslashes continue the string literal across source lines,
# so the resulting value contains no newline characters.
html_sample = '\
<html> \
<body> \
<h1 id = "title">Hello world</h1>\
<a href = "#www.baidu.com" class = "link"> This is link1</a>\
<a href = "#link2" class = "link"> This is link2</a> \
</body> \
</html>'

soup = BeautifulSoup(html_sample,'html.parser')

# All text in the document with the markup stripped.
print(soup.text)

# select() returns a list of matching elements. As a bare expression this
# has no visible effect in a script; an interactive session would echo it.
soup.select('h1')

# Select by tag name and index into the result list.
print(soup.select('h1')[0].text)
print(soup.select('a')[0].text)
print(soup.select('a')[1].text)

# Print the text of every <a> element.
for alink in soup.select('a'):
    print(alink.text)

# '#title' matches by id attribute, '.link' matches by class attribute.
print(soup.select('#title')[0].text)
print(soup.select('.link')[0].text)

# Attributes of an element are read with dict-style indexing.
alinks = soup.select('a')
for link in alinks:
    print(link['href'])
# Demo 2:
# Fetch a news page, collect the title and URL of each matched headline
# link, and write the collected rows to an Excel file.
import pandas
import requests
from bs4 import BeautifulSoup

# A timeout keeps the script from hanging forever if the server never
# responds; requests applies no timeout unless one is given.
res = requests.get('http://news.qq.com/', timeout=10)
soup = BeautifulSoup(res.text,'html.parser')

newsary = []
# '.Q-tpWrap .text' matches elements with class "text" nested inside an
# element with class "Q-tpWrap".
# NOTE(review): presumably each match wraps a headline link - confirm
# against the current page markup.
for news in soup.select('.Q-tpWrap .text'):
    # Look the first <a> up once; raises IndexError if the match has no link.
    anchor = news.select('a')[0]
    newsary.append({'title': anchor.text, 'url': anchor['href']})

# One row per collected headline, with 'title' and 'url' columns.
newsdf = pandas.DataFrame(newsary)
newsdf.to_excel('news.xlsx')
# Tip: Jupyter Notebook is recommended for practicing these examples; it is very convenient.