import requests
from bs4 import BeautifulSoup
if __name__ == '__main__':
    # Fetch the table-of-contents page of the book and list its chapter links.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    # A timeout keeps the script from hanging forever if the site is down
    # (the original author noted the site appeared to be unreachable).
    response = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly on an HTTP error status instead of silently parsing an
    # error page into an empty result list.
    response.raise_for_status()
    response.encoding = 'utf-8'
    page_text = response.text

    # Load the page source into a BeautifulSoup object for parsing.
    soup = BeautifulSoup(page_text, 'lxml')
    # 'div ul li a' is a CSS selector, so it must go through select().
    # find_all() interprets its first argument as a tag name and therefore
    # returned an empty list for this string.
    li_list = soup.select('div ul li a')
    print(li_list)

    # --- Disabled draft: download every chapter into ./sanguo.txt ---
    # Left commented out, as in the original, until the chapter list above is
    # confirmed to be non-empty.
    # NOTE(review): the '.book-mulu' and 'chapter_content' class names come
    # from the original draft; verify them against the live page markup.
    #
    # li_list = soup.select('.book-mulu > ul > li')
    # with open('./sanguo.txt', 'w', encoding='utf-8') as fp:
    #     for li in li_list:
    #         title = li.a.string
    #         # li.a is the <a> child tag; ['href'] reads its attribute.
    #         detail_url = 'http://www.shicimingju.com' + li.a['href']
    #         # Request the chapter page and parse out the chapter text.
    #         detail_response = requests.get(url=detail_url, headers=headers, timeout=10)
    #         detail_response.encoding = 'utf-8'
    #         detail_page_text = detail_response.text
    #         # BeautifulSoup takes the markup (or an open file) plus a parser name.
    #         # Per the original author's note, the chapter text sits directly
    #         # inside the div, with <br> tags as line breaks.
    #         detail_soup = BeautifulSoup(detail_page_text, 'lxml')
    #         # class_ has a trailing underscore because `class` is a Python keyword.
    #         div_tag = detail_soup.find('div', class_='chapter_content')
    #         content = div_tag.text
    #         fp.write(title + ':' + content + '\n')
    #         print(title, '爬取成功!')  # message: "scraped successfully!"
# Author's note: parsing with bs4 did not work; select() always returned an empty list.