爬取个人随笔内容——练手,待补充

import requests,lxml
from bs4 import BeautifulSoup
# Scrape one page of a cnblogs post index and print the paragraph text of
# every post linked from it.
url = 'https://www.cnblogs.com/wjlv/default.html?page=2'  # index page to crawl

# requests has no default timeout; without one a stalled server would hang
# the script forever.
TIMEOUT = 10  # seconds, applied to every HTTP request below

index_response = requests.get(url, timeout=TIMEOUT)
# Fail loudly on an HTTP error instead of parsing an error page as if it
# were an (empty) post index.
index_response.raise_for_status()
html_index = index_response.text  # raw HTML of the index page

soap = BeautifulSoup(html_index, "lxml")
# Every post title on the index page is an <a class="postTitle2"> element.
a_list = soap.find_all('a', {"class": "postTitle2"})

for h in a_list:
    # Read the link straight from the tag; serialising it and re-parsing it
    # with a new BeautifulSoup object adds nothing.
    href = h.get('href')
    if not href:
        # An anchor without a target cannot be followed.
        continue
    # print('{}:{}'.format(h.text, href))  # debug: title and URL of each post

    article = requests.get(href, timeout=TIMEOUT).text  # HTML of a single post
    soap_a = BeautifulSoup(article, 'lxml')
    p_lab = soap_a.find_all('p')  # all <p> tags on the post page
    for txt in p_lab:
        print(txt.text)  # text content of the paragraph
上一篇:使用python和lxml从表中提取文本


下一篇:python – 获取lxml中特定名称的所有节点?