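'''
Fetch the full text of any book on the Quanshu (全书网) site. Each chapter is
saved as its own .txt file (if the book has too many chapters, fetching only
the first 5 is acceptable), and all of these files are placed in a folder
named after the book.
'''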
import os
import re
from urllib.parse import quote, urljoin

import requests
class QuanShu:
    """Search quanshuxs.com for a novel and save its chapters as text files.

    Typical use is ``QuanShu(name).get_search_html()``, which lists the search
    hits, asks the user to pick one and then downloads that work chapter by
    chapter into a directory named after the work.
    """

    # Seconds to wait on a single HTTP request before giving up, so that a
    # stalled connection cannot hang the whole download.
    REQUEST_TIMEOUT = 30

    def __init__(self, name=''):
        """Build the search URL for the keyword *name*.

        The keyword is encoded as GB2312 bytes and percent-quoted before it is
        placed in the query string.

        Raises:
            UnicodeEncodeError: if *name* contains characters that GB2312
                cannot represent.
        """
        name = quote(name.encode('gb2312'))
        self.host = 'http://www.quanshuxs.com/'
        self.url = f'http://www.quanshuxs.com/search.asp?key={name}&x=0&y=0'

    def _fetch(self, url):
        """Download *url* and return the body decoded with the detected charset."""
        resp = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        resp.encoding = resp.apparent_encoding
        return resp.text

    @staticmethod
    def _safe_filename(name):
        """Return *name* with characters that are unsafe in file names replaced."""
        cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', name).strip().rstrip('.')
        return cleaned or 'untitled'

    def get_search_html(self):
        """List the search hits, let the user choose one and download it.

        Each hit is printed with the index the user has to type. Result tables
        that do not contain a work link plus the four expected anchor texts
        are skipped. When nothing usable is found a message is printed and the
        method returns without prompting.
        """
        html = self._fetch(self.url)
        tables = re.findall(
            r'<table cellspacing="0" cellpadding="0" width="962" border="0" align="center" class="m9">(.*?)</table>',
            html, re.S)
        url_list = []
        for table in tables:
            # [^"]* keeps the capture inside one href attribute; a greedy
            # ".*" with re.S could run on to the last matching anchor.
            works_url = re.findall(r'<a href="([^"]*)" target="_blank">', table)
            works_info = re.findall(r'<a href=".*?">(.*?)</a>', table, re.S)
            works_status = re.findall(r'状态: </font>\r\n(.*?) \|', table, re.S)
            if not works_url or len(works_info) < 4:
                continue
            works_name = str(works_info[0]).replace("<font color='red'>", "").replace("</font>", "")
            new_chapter = works_info[1]
            works_author = works_info[2]
            works_type = works_info[3]
            works_status = works_status[0] if works_status else ''
            # The printed index is the position in url_list, so it stays
            # valid even when some tables were skipped.
            i = len(url_list)
            url_list.append(urljoin(self.host, works_url[0]))
            print(
                f'序号:{i:3}作品名称:{works_name} 最新章节:{new_chapter} 作者: {works_author} 类型: {works_type} 状态: {works_status}',
                end='\n\n')
        if not url_list:
            print("没有搜索到对应作品!")
            return
        # Keep asking until the answer is a valid index into url_list.
        while True:
            url_num = input("请选择需要下载的作品序号(enter):")
            try:
                index = int(url_num)
            except ValueError:
                index = -1
            if 0 <= index < len(url_list):
                break
            print("序号无效,请重新输入!")
        self.get_works_html(url_list[index])

    def get_works_html(self, url):
        """Open the work page at *url* and download the work from its first chapter.

        Stores the work title in ``self.title``, creates the output directory
        and hands the first chapter link to :meth:`get_chapter_html`.

        Raises:
            IndexError: if the page does not contain the expected chapter
                table, title or chapter link.
        """
        html = self._fetch(url)
        table = re.findall(r'class="mread">(.*?)</table>', html, re.S)[0]
        self.title = re.findall(r'<font color="#7B352B">(.*?)全文阅读</font>', table, re.S)[0]
        os.makedirs(self._safe_filename(self.title), exist_ok=True)
        first_href, _first_name = re.findall(
            r'<div class="bai"><a href="(.*?)">(.*?)</a>', table, re.S)[0]
        # The link may be relative to the work page; resolve it before fetching.
        self.get_chapter_html(urljoin(url, first_href))

    # Download chapters one after another
    def get_chapter_html(self, url, max_chapters=None):
        """Download the chapter at *url* and every chapter linked after it.

        Each chapter is written as UTF-8 to ``<title>/<chapter name>.txt``,
        where the title is read from ``self.title``. The "next chapter" link
        is followed iteratively, so long books do not exhaust the recursion
        limit.

        Args:
            url: Absolute URL of the first chapter to download.
            max_chapters: Optional upper bound on the number of chapters to
                download; ``None`` (the default) downloads until no next
                chapter link is found.

        Raises:
            IndexError: if a page lacks the chapter heading or content cell.
        """
        book_dir = self._safe_filename(self.title)
        os.makedirs(book_dir, exist_ok=True)
        visited = set()
        downloaded = 0
        while url and url not in visited:
            if max_chapters is not None and downloaded >= max_chapters:
                break
            # Remember visited pages so a link cycle cannot loop forever.
            visited.add(url)
            html = self._fetch(url)
            chapter_name = re.findall(r"<strong>(.*?)</strong>", html, re.S)[0]
            print(f'正在下载---->{chapter_name}')
            next_links = re.findall(
                r"<a href='([^']*)'><font color='#7B352B'>下一章</font></a> \( → \)", html)
            content = re.findall(r'<td colspan="2" class="content">(.*?)</td>', html, re.S)
            # NOTE(review): the last replace() looks like a no-op as written;
            # possibly an HTML entity was intended as the search string - confirm.
            content = content[0].replace('<br><br>', '\n').replace('<img src="image/', '').replace('.jpg">', ' ').replace(
                '—', '—')
            path = os.path.join(book_dir, f'{self._safe_filename(chapter_name)}.txt')
            # Explicit UTF-8 so the result does not depend on the platform's
            # default encoding.
            with open(path, 'w', encoding='utf-8') as f:
                f.write(" ")
                f.write(content)
            print(f'已下载---->{chapter_name},{url}')
            downloaded += 1
            # Resolve the next link against the current page; stop when absent.
            url = urljoin(url, next_links[0]) if next_links else None
if __name__ == '__main__':
    # Ask for a title, then search; get_search_html() itself starts the
    # download of the chosen work, so no further call is needed here.
    name = input("请输入小说名称:")
    quanshu = QuanShu(name)
    quanshu.get_search_html()