python 爬虫小说

 

思路:

1.获取第一章内容

2.判断请求方式

3.对URL存在回车进行处理

4.正则匹配

5.写入文件中

 

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/8/27 20:34
# @Author : Lhtester
# @Site : 
# @File : book.py
# @Software: PyCharm
import requests
import re
import time
import random
import sys

sys.setrecursionlimit(16000)#设置递归深度
class Book_deaill():
    """Crawler that downloads a serialized novel from m.xyshuge.com page by
    page and appends chapter titles and body text to a local text file.

    NOTE(review): ``data_get`` and ``start_analysis`` are mutually recursive
    (one call per page), which is why the module raises the interpreter's
    recursion limit at import time.
    """

    def __init__(self):
        # First chapter page to start crawling from.
        self.url = 'https://m.xyshuge.com/k3nl5/19/19364/55418253.html'
        # self.url = 'https://m.xyshuge.com/k3nl5/19/19364/55728792_2.html'
        # Pretend to be a desktop Chrome browser so the site serves normal HTML.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36"}

    def data_get(self, url=None):
        """Fetch one page, append its chapter title and body paragraphs to
        the output file, then continue to the next page via start_analysis.

        :param url: absolute page URL; defaults to ``self.url`` (first chapter).
        """
        if url is None:
            url = self.url
        result = requests.get(url=url, headers=self.headers).text
        # print(result)
        # Chapter title, e.g. <div class="nr_title" id="nr_title">第一章 ...</div>
        title = re.findall(r'<div class="nr_title" id="nr_title">(.*?)</div>', result)
        print(title)
        with open('../image/book2.txt', 'a+', encoding='utf-8') as f:
            for t in title:
                f.write(t)
                f.write('\n')
        print('write title complete')
        # Body paragraphs of the chapter.
        text = re.findall(r"<p class='c_detail'>(.*?)</p>", result)
        with open('../image/book2.txt', 'a+', encoding='utf-8') as f:
            for n in text:
                n = n.replace("&nbsp;", "")
                # Strip the site's self-promotion line injected into the text.
                n = n.replace("阅书阁『wWw.xyshuge.Com』,全文免费阅读.", "")
                f.write('\n')
                f.write(n)
        # Random sleep to avoid triggering the site's anti-crawler detection.
        time.sleep(random.randint(1, 5))
        print('write text complete')

        self.start_analysis(result)

    def start_analysis(self, result):
        """Locate the next-page / next-chapter link in ``result`` and keep
        crawling; print a finish message when neither link exists.

        :param result: raw HTML text of the page just downloaded.
        """
        new_url = 'https://m.xyshuge.com/k3nl5'
        # The site emits a newline inside the href attribute, hence the
        # literal \n in the pattern (see the outline's "URL contains a
        # carriage return" note).
        get_next_page = re.findall(r'<a id="pb_next" href="/k3nl5\n(.*?)">↓一页</a>', result)
        if not get_next_page:
            # No "next page" link inside this chapter: look for the
            # "next chapter" link instead.
            get_next_page = re.findall(r'<a id="pb_next" href="/k3nl5\n(.*?)">↓一章</a>', result)
            print('下一章:', get_next_page)
        if not get_next_page:  # last page of the last chapter
            print('爬虫结束')
        else:
            new_url = new_url + get_next_page[0]  # rebuild the absolute URL
            print(new_url)
            self.data_get(new_url)

    def start_get_data(self):
        """Public entry point: start crawling from the configured first page."""
        print('start get data ')
        self.data_get()



if __name__ == '__main__':
    # Run the crawler when executed as a script.
    crawler = Book_deaill()
    crawler.start_get_data()

 

python 爬虫小说

上一篇:go语言 判断一个字符串是否为空


下一篇:javaweb-05:Maven环境搭建