思路:
1.获取第一章内容
2.判断请求方式
3.对URL存在回车进行处理
4.正则匹配
5.写入文件中
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/8/27 20:34
# @Author : Lhtester
# @Site :
# @File : book.py
# @Software: PyCharm
import random
import re
import time

import requests


class Book_deaill:
    """Crawl novel chapters from m.xyshuge.com and append them to a text file.

    Note: the class name is kept as-is (a misspelling of "detail") because it
    is the public entry point used by existing callers.
    """

    def __init__(self):
        # Starting chapter URL; crawling follows next-page/next-chapter links from here.
        self.url = 'https://m.xyshuge.com/k3nl5/19/19364/55418253.html'
        # self.url = 'https://m.xyshuge.com/k3nl5/19/19364/55728792_2.html'
        # Browser-like User-Agent so the site serves normal pages.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36"}

    def data_get(self, url=None):
        """Fetch pages starting at *url* (default: ``self.url``) and append each
        page's chapter title and body text to ``../image/book2.txt``, following
        next-page/next-chapter links until none remain.

        Iterative rather than recursive: the original recursed once per page and
        needed ``sys.setrecursionlimit(16000)``; a ``while`` loop removes any
        depth limit for long books.
        """
        if url is None:
            url = self.url
        while url:
            result = requests.get(url=url, headers=self.headers).text
            # Chapter title(s) on this page.
            titles = re.findall(r'<div class="nr_title" id="nr_title">(.*?)</div>', result)
            print(titles)
            # Body paragraphs.
            paragraphs = re.findall(r"<p class='c_detail'>(.*?)</p>", result)
            # One open covers both title and body writes (original opened the file twice).
            with open('../image/book2.txt', 'a+', encoding='utf-8') as f:
                for t in titles:
                    f.write(t)
                    f.write('\n')
                print('write title complete')
                for p in paragraphs:
                    p = p.replace(" ", "")
                    # Strip the site's embedded watermark text.
                    p = p.replace("阅书阁『wWw.xyshuge.Com』,全文免费阅读.", "")
                    f.write('\n')
                    f.write(p)
            # Random pause between pages to avoid tripping anti-scraping checks.
            time.sleep(random.randint(1, 5))
            print('write text complete')
            url = self.start_analysis(result)

    def start_analysis(self, result):
        """Extract the next-page/next-chapter link from *result* (page HTML).

        Returns the absolute URL of the next page, or ``None`` when the crawl
        has reached the last chapter.
        """
        base_url = 'https://m.xyshuge.com/k3nl5'
        # NOTE(review): the literal \n in the pattern assumes a newline appears
        # inside the href attribute in the site's HTML — confirm against live pages.
        get_next_page = re.findall(r'<a id="pb_next" href="/k3nl5\n(.*?)">↓一页</a>', result)
        if len(get_next_page) == 0:
            # No next page within the chapter; try the next-chapter link instead.
            get_next_page = re.findall(r'<a id="pb_next" href="/k3nl5\n(.*?)">↓一章</a>', result)
            print('下一章:', get_next_page)
        if len(get_next_page) == 0:  # still nothing: last chapter reached
            print('爬虫结束')
            return None
        new_url = base_url + get_next_page[0]  # join relative path to site base
        print(new_url)
        return new_url

    def start_get_data(self):
        """Kick off the crawl from the configured starting URL."""
        print('start get data ')
        self.data_get()


if __name__ == '__main__':
    data = Book_deaill()
    data.start_get_data()