2021-05-13

爬取斗罗大陆小说的最基本爬虫

from urllib.parse import urljoin

import requests
from lxml import etree

# Page whose chapter links are crawled.
url = 'https://www.soshuw.com/DouLuoDaLu/'
# Request headers that present the client as a desktop browser (the original
# note says this is for the site's browser detection).
# The HTTP header name is "User-Agent" with a hyphen; the underscore spelling
# would be sent as an unrelated header and the browser string would not be
# seen as the user agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.56',
}

class Douluo:
    """Download every chapter linked from a novel's index page into one file.

    Attributes:
        url: Address of the page that lists the chapter links.
        headers: HTTP headers sent with every request.
    """

    def __init__(self, url, headers):
        self.url = url
        self.headers = headers

    def _chapter_url(self, href):
        """Resolve a chapter link found on the index page to an absolute URL."""
        # urljoin copes with both root-relative ('/book/1.html') and relative
        # ('1.html') links and never produces a doubled slash.
        return urljoin(self.url, href)

    def first_page(self):
        """Fetch the index page, then fetch, save and print each chapter.

        Each chapter's title and text are appended to
        './斗罗大陆小说爬取.txt' (the file is overwritten on every run) and
        echoed to stdout.
        """
        # Use the instance's own url/headers so the constructor arguments
        # actually take effect instead of the module-level globals.
        page_text = requests.get(url=self.url, headers=self.headers).text
        html = etree.HTML(page_text)
        # One <dd> per chapter inside the chapter list.
        dd_list = html.xpath('//*[@id="novel15238"]/dl/dd')
        # The context manager guarantees the file is flushed and closed even
        # if a request or parse step raises midway through the crawl.
        with open('./斗罗大陆小说爬取.txt', 'w', encoding='utf-8') as fp:
            for dd in dd_list:
                titles = dd.xpath('./a/text()')
                hrefs = dd.xpath('./a/@href')
                if not titles or not hrefs:
                    # Skip list entries that carry no usable link rather than
                    # aborting the whole crawl with an IndexError.
                    continue
                title = titles[0]
                detail_url = self._chapter_url(hrefs[0])
                detail_page = requests.get(url=detail_url, headers=self.headers).text
                tree = etree.HTML(detail_page)
                # xpath returns a list of text nodes; join them into one string.
                detail_data = ''.join(tree.xpath('//div[@class="content"]//text()'))
                fp.write('\t\t\t\t\t\t' + title + '\n' + detail_data + '\n\n\n')
                print(title + '\t\t\t\t\t\n\n\n' + detail_data)
        print('爬取结束!!!')



# Build the scraper from the module-level settings and start the crawl.
douluo = Douluo(url, headers)
douluo.first_page()

上一篇:非常有趣的免费API接口


下一篇:vue(一)