from lxml import html
import requests
import os
class DaoMuNote:
    """Helper that bundles the fetch, parse and save steps used by the scraper.

    Attributes:
        url: The start URL given to the constructor (may be ``None``).
        file_name: The directory most recently passed to ``create_files``;
            only set after that method has been called.
    """

    def __init__(self, url=None):
        self.url = url

    # Fetch the response text
    def get_response_text(self, url, timeout=10):
        """Issue a GET request to *url* and return the decoded response body.

        Args:
            url: Address to request.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout a stalled connection would block the scraper
                indefinitely.

        Returns:
            The response body as text (``response.text``).
        """
        response = requests.request('get', url=url, timeout=timeout)
        return response.text

    # Build the etree object
    def create_etree(self, text):
        """Parse *text* with ``html.etree.HTML`` and return the result."""
        etree = html.etree.HTML(text)
        return etree

    # Create the output directory
    def create_files(self, file_name):
        """Make sure the directory *file_name* exists and return its path.

        The path is also remembered on ``self.file_name``.
        """
        self.file_name = file_name
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(file_name, exist_ok=True)
        return file_name

    # Write the text
    def writr_text(self, file_name, files, text):
        """Write *text* to ``<file_name>/<files>.txt``, replacing any old content.

        Args:
            file_name: Directory that receives the file.
            files: File name without the ``.txt`` extension.
            text: Content to write.

        Failures are reported on stdout instead of being raised, so one bad
        file does not stop the caller.
        """
        try:
            # Explicit UTF-8 so Chinese text is written correctly regardless
            # of the platform's default encoding.
            with open('{}/{}.txt'.format(file_name, files), 'w+',
                      encoding='utf-8') as f:
                f.write(text)
        except (OSError, UnicodeError) as exc:
            print("编写出现错误", exc)
url = "http://www.daomubiji.com/"
# Initialise the DaoMuNote helper
note = DaoMuNote(url=url)
# Collect every book link and title from the menu on the first page
first_content = note.get_response_text(note.url)
first_etree = note.create_etree(first_content)
first_links = first_etree.xpath("//ul[@class='sub-menu']/li/a/@href")
first_title = first_etree.xpath("//ul[@class='sub-menu']/li/a/text()")
# Top-level error boundary: any failure is reported and ends the crawl
try:
    # Visit each book page found on the first page and parse it with XPath
    for book_link, book_title in zip(first_links, first_title):
        print(book_link)
        second_content = note.get_response_text(book_link)
        second_etree = note.create_etree(second_content)
        second_links = second_etree.xpath("//article[@class='excerpt excerpt-c3']/a/@href")
        second_titles = second_etree.xpath("//article[@class='excerpt excerpt-c3']/a/text()")
        # Create a directory named after the book
        file_name = note.create_files(book_title)
        for chapter_link, chapter_title in zip(second_links, second_titles):
            third_content = note.get_response_text(chapter_link)
            third_etree = note.create_etree(third_content)
            book_content = third_etree.xpath('//article[@class="article-content"]/p/text()')
            print(book_content)
            if not book_content:
                # Nothing to save for this chapter
                continue
            # writr_text truncates the file on every call, so the whole
            # chapter is assembled first and written once; writing paragraph
            # by paragraph would keep only the last one. The file is named
            # after the chapter's own title, not the outer book index.
            chapter_text = "".join(paragraph + '\n' for paragraph in book_content)
            note.writr_text(file_name, chapter_title, chapter_text)
except Exception as e:
    print("出现异常",e)