最近迷上了《三体》这部小说。网上的小说基本上都是分章节发布的,一篇一篇地人肉 Ctrl+C / Ctrl+V 实在是太 low 了。干脆自己写个脚本吧,一劳永逸。
基本思路:
- 1、获取小说首页所有的章节名称和链接
- 2、使用异步请求并发抓取所有章节的网页
- 3、根据网页内容使用xpath提取章节文本,再分章节存储
实现如下:
"""
===================================
    -*- coding:utf-8 -*-
    Author     :GadyPu
    E_mail     :Gadypy@gmail.com
    Time       :2020/10/7 0007 11:59 AM
    FileName   :spider.py
===================================

Download every chapter of a novel from its index page and store each chapter
as a UTF-8 text file.

Pipeline:
    1. read the index page and collect (title, link) for every chapter;
    2. fetch the chapter pages concurrently with aiohttp;
    3. extract the chapter text with XPath in worker threads and write one
       ``.txt`` file per chapter.
"""
import asyncio
import os
import re
import sys
import threading
from queue import Queue

import aiohttp
import requests
from lxml import etree


class GetNovels(object):
    """Scrape one novel: collect chapter links, fetch pages, save text files.

    Attributes:
        headers: HTTP headers sent with every request.
        novel_url: URL of the novel's index (table of contents) page.
        novel_name: name of the output directory created under the CWD.
        htmlQ: queue of ``(title, link)`` tuples waiting to be fetched.
        chapterQ: thread-safe queue of ``(title, html_text)`` tuples waiting
            to be parsed; ``None`` is the stop sentinel for a parser thread.
    """

    # Characters replaced by '-' when building file and directory names.
    # A raw string avoids the invalid '\/' escape of a plain literal, and the
    # backslash is included so a title can never introduce a path separator.
    _UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|]')
    # Number of concurrent fetch coroutines; caps the crawl rate.
    _FETCH_WORKERS = 20
    # Number of parser/writer threads.
    _PARSE_WORKERS = 3

    def __init__(self, url, name):
        """Store the index URL and output name and create the work queues.

        Args:
            url: URL of the novel's index page.
            name: directory name (relative to the CWD) for the output files.
        """
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
        }
        self.novel_url = url
        # Only put_nowait()/get_nowait() are used on this queue, so it never
        # creates futures and is not tied to a particular event loop.
        self.htmlQ = asyncio.Queue()
        self.chapterQ = Queue()
        self.novel_name = name

    def get_chapter_urls(self):
        """Fetch the index page and enqueue ``(title, link)`` for each chapter.

        Title and link are read from the same ``<a>`` element so they cannot
        get out of step; anchors lacking either one are skipped.

        Raises:
            SystemExit: with status 1 when the index page cannot be
                downloaded, decoded or parsed.
        """
        try:
            # A timeout prevents hanging forever on a stalled connection.
            response = requests.get(
                url=self.novel_url, headers=self.headers, timeout=30)
            # Report HTTP errors instead of parsing an error page to nothing.
            response.raise_for_status()
            html = etree.HTML(response.content.decode('utf-8'))
            anchors = html.xpath(
                r'//div[@class="book-list clearfix"]/ul/li/a')
        except Exception as e:
            print(e, '\n', 'network error cannot parse chapter url')
            sys.exit(1)

        for anchor in anchors:
            title = ''.join(anchor.xpath('text()'))
            link = anchor.get('href')
            if title and link:
                self.htmlQ.put_nowait((title, link))

    async def fetch(self):
        """Download queued chapter pages until ``htmlQ`` is empty.

        Each successfully downloaded page is pushed to ``chapterQ`` as
        ``(title, html_text)``. A failed or non-200 request is reported and
        skipped so that one bad chapter does not abort the whole download.
        """
        # NOTE(review): ssl=False disables TLS certificate verification.
        # Kept as in the original; confirm this is really intended.
        connector = aiohttp.TCPConnector(ssl=False)
        async with aiohttp.ClientSession(connector=connector) as session:
            while True:
                try:
                    title, link = self.htmlQ.get_nowait()
                except asyncio.QueueEmpty:
                    break
                try:
                    async with session.get(
                            url=link, headers=self.headers) as response:
                        if response.status == 200:
                            body = await response.read()
                            self.chapterQ.put((title, body.decode('utf-8')))
                        else:
                            print(f'skip {title}: HTTP {response.status}')
                except (aiohttp.ClientError, asyncio.TimeoutError,
                        UnicodeDecodeError) as e:
                    print(e, '\n', 'network error cannot fetch chapters...')
                # Throttle each worker between requests.
                await asyncio.sleep(0.3)

    async def _fetch_all(self):
        """Run all fetch workers concurrently and wait for them to finish."""
        await asyncio.gather(
            *(self.fetch() for _ in range(self._FETCH_WORKERS)))
        # Give closed SSL connections a moment to shut down before the event
        # loop is closed (see aiohttp's "graceful shutdown" notes).
        await asyncio.sleep(0.25)

    def _save_chapter(self, path, worker_id, title, page):
        """Extract the text of one chapter page and write it to a file.

        Args:
            path: root output directory of the novel.
            worker_id: number of the calling parser thread (for logging).
            title: chapter title; also used (sanitized) as the file name.
            page: HTML source of the chapter page.
        """
        html = etree.HTML(page)
        # All text nodes below the <p> tags of the content container.
        content = html.xpath(r'//*[@id="nr1"]/p//text()')
        # Fifth breadcrumb entry names the sub-directory (presumably the
        # volume/part of the novel -- confirm against the site markup).
        crumbs = html.xpath(r'//*[@id="bcrumb"]/span[5]/a/text()')
        # Without a breadcrumb the file is stored directly under `path`.
        section = self._UNSAFE_CHARS.sub('-', crumbs[0]) if crumbs else ''
        chapter_dir = os.path.join(path, section)
        # exist_ok avoids a check-then-create race between parser threads.
        os.makedirs(chapter_dir, exist_ok=True)
        chapter_name = os.path.join(
            chapter_dir, self._UNSAFE_CHARS.sub('-', title))
        print(f'thread:{worker_id} is parsing: ' + title)
        with open(chapter_name + '.txt', 'w', encoding='utf-8') as wf:
            wf.write(title + '\n\n')
            wf.writelines(str(cont) + '\n' for cont in content)

    def parse_chapter(self, path, id):
        """Parser-thread loop: save chapters from ``chapterQ`` until stopped.

        Args:
            path: root output directory of the novel.
            id: number of this worker thread, used only in log messages.
                (The name shadows the builtin but is kept for compatibility.)
        """
        while True:
            data = self.chapterQ.get()
            if data is None:
                # Stop sentinel sent by run().
                break
            title, page = data
            try:
                self._save_chapter(path, id, title, page)
            except Exception as e:
                # Keep the worker alive so the remaining chapters are still
                # written when a single page is malformed or unwritable.
                print(e, '\n', f'thread:{id} failed to save: {title}')

    def run(self):
        """Run the whole pipeline: index, concurrent fetch, threaded save."""
        self.get_chapter_urls()

        path = os.path.join(os.getcwd(), self.novel_name)
        os.makedirs(path, exist_ok=True)

        thread_lists = []
        for i in range(self._PARSE_WORKERS):
            t = threading.Thread(
                target=self.parse_chapter, args=(path, i + 1), daemon=True)
            thread_lists.append(t)
            t.start()

        # A dedicated loop plus gather() inside a coroutine replaces
        # get_event_loop() + asyncio.wait(coroutines), which fails on
        # Python 3.11+.
        loop = asyncio.new_event_loop()
        try:
            loop.run_until_complete(self._fetch_all())
        finally:
            loop.close()

        # One stop sentinel per parser thread, then wait for them to drain.
        for _ in thread_lists:
            self.chapterQ.put_nowait(None)
        for t in thread_lists:
            t.join()


if __name__ == '__main__':
    url = 'https://www.luoxia.com/santi/'
    name = '三体'
    d = GetNovels(url, name)
    d.run()