# Basic crawler that scrapes the "Douluo Dalu" (斗罗大陆) novel chapter by chapter
import requests
from lxml import etree
# Index page listing every chapter of the novel.
url = 'https://www.soshuw.com/DouLuoDaLu/'
headers = {
    # Fix: the HTTP header is spelled 'User-Agent' (hyphen). The original key
    # 'User_Agent' was sent as an unrecognized custom header, so the browser
    # User-Agent spoofing never actually took effect.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.56',
}
class Douluo:
    """Crawler that downloads every chapter of the novel indexed at *url*
    and appends them, with titles, to a local text file.
    """

    def __init__(self, url, headers):
        self.url = url          # chapter-index page URL
        self.headers = headers  # request headers (browser UA to avoid blocking)

    def first_page(self):
        """Fetch the chapter index, then download and save every chapter.

        Writes all chapters to ./斗罗大陆小说爬取.txt (UTF-8) and prints each
        chapter as it is saved.
        """
        # Fix: use the instance state set in __init__ instead of silently
        # reading the module-level globals `url`/`headers`.
        page_text = requests.get(url=self.url, headers=self.headers).text
        html = etree.HTML(page_text)
        # One <dd> per chapter link in the index page.
        dd_list = html.xpath('//*[@id="novel15238"]/dl/dd')
        # Fix: context manager guarantees the file is flushed and closed;
        # the original opened the file and never closed it.
        with open('./斗罗大陆小说爬取.txt', 'w', encoding='utf-8') as fp:
            for dd in dd_list:
                title = dd.xpath('./a/text()')[0]
                # Chapter hrefs are relative to the site root.
                detail_url = 'https://www.soshuw.com/' + dd.xpath('./a/@href')[0]
                detail_page = requests.get(url=detail_url, headers=self.headers).text
                tree = etree.HTML(detail_page)
                detail_data = tree.xpath('//div[@class="content"]//text()')
                detail_data = ''.join(detail_data)  # join text nodes into one string
                fp.write('\t\t\t\t\t\t' + title + '\n' + detail_data + '\n\n\n')
                print(title + '\t\t\t\t\t\n\n\n' + detail_data)
        print('爬取结束!!!')
# Guard the crawl behind the standard entry-point check so importing this
# module (e.g. to reuse the Douluo class) does not trigger a full download.
if __name__ == '__main__':
    douluo = Douluo(url=url, headers=headers)
    douluo.first_page()