import re
import urllib.request
from bs4 import BeautifulSoup
import time

# URL of the first chapter; the driver loop below starts crawling from here.
url = input("第一章网址:")

# Patterns are compiled once at import time instead of on every call.
_PARAGRAPH_RE = re.compile(r'<p>([\s\S]*?)</p>')
_TAG_RE = re.compile(r'</?\w+[^>]*>')
_CHAPTER_NAME_RE = re.compile(r'<h3 class="j_chapterName">(.*?)</h3>')
# Link to the next chapter on an ordinary chapter page.
_NEXT_CHAPTER_RE = re.compile(
    r'<a data-eid="qd_R109" href="(.*?)" id="j_chapterNext">')
# Link carried by the "next" button on the last chapter page.
_LAST_CHAPTER_RE = re.compile(
    r'<a data-eid="qd_R118" href="(.*?)" id="j_chapterNext">')


def gethtml(url):
    """Download *url* and return it parsed as a BeautifulSoup document.

    The response body is decoded as UTF-8 and parsed with the built-in
    ``html.parser`` backend.
    """
    # The context manager closes the HTTP response even if read/decode fails.
    with urllib.request.urlopen(url) as page:
        html = page.read().decode('utf-8')  # html is a str, not a list
    return BeautifulSoup(html, 'html.parser')


def getcontent(soup, load):
    """Append the chapter found in *soup* to the text file at path *load*.

    The chapter body is the first ``<p>...</p>`` match inside the
    ``read-content j_readContent`` div, with any remaining HTML tags removed.
    A paragraph break is inserted after every Chinese full stop.  The chapter
    title is taken from the ``<h3 class="j_chapterName">`` element and written
    as a dashed header line before the body.

    Raises:
        IndexError: if the chapter body or the chapter title cannot be found.
    """
    content = soup.find_all("div", {"class": "read-content j_readContent"})
    paragraphs = _PARAGRAPH_RE.findall(str(content))
    if not paragraphs:
        raise IndexError("chapter body not found in page")
    text = _TAG_RE.sub('', paragraphs[0])  # strip leftover HTML tags
    # After each full stop: two newlines plus a three-space indent.  The
    # original literal used '\0\0\0' (NUL characters) where spaces were meant.
    text = text.replace('。', '。\n\n   ')

    names = _CHAPTER_NAME_RE.findall(str(soup))
    if not names:
        raise IndexError("chapter name not found in page")

    book = ("----------------------------------------------------------------"
            + names[0]
            + "------------------------------------------------------------\n\n\n"
            + text)
    # Explicit UTF-8 so the write does not depend on the platform's locale
    # encoding (e.g. GBK on Windows cannot encode every character).
    with open(load, 'a', encoding='utf-8') as f:
        f.write(book)


def nextcontent(soup):
    """Return the absolute URL behind the "next chapter" button of *soup*.

    The regular next-chapter link (``qd_R109``) is tried first; if it is
    absent, the link used on the last chapter page (``qd_R118``) is used.
    The scheme-relative href is prefixed with ``http:``.

    Raises:
        IndexError: if neither link is present.
    """
    content = str(soup.find_all("div", {"class": "chapter-control dib-wrap"}))
    for pattern in (_NEXT_CHAPTER_RE, _LAST_CHAPTER_RE):
        hrefs = pattern.findall(content)
        if hrefs:
            return "http:" + hrefs[0]
    raise IndexError("next-chapter link not found in page")


def panduan(soup):
    """Return the list of regular next-chapter hrefs found in *soup*.

    An empty list means the page has no ``qd_R109`` link, i.e. there is no
    further chapter to download.
    """
    content = soup.find_all("div", {"class": "chapter-control dib-wrap"})
    return _NEXT_CHAPTER_RE.findall(str(content))
# -------------------------------------------------------------------------
# Driver: download every chapter, starting from the URL entered by the user,
# and append them all to one text file named after the book.
# -------------------------------------------------------------------------
soup = gethtml(url)
bookname = re.findall(r'<h1>(.*?)</h1>', str(soup))  # book title
# Replace characters that are not allowed in Windows file names.
# NOTE: the d:/88 directory must already exist; open() will not create it.
load = "d:/88/%s.txt" % re.sub(r'[\\/:*?"<>|]', '_', bookname[0])
i = 0
while True:
    # `soup` already holds the current chapter (the first page is reused
    # instead of being downloaded a second time).
    getcontent(soup, load)
    # An empty result means this page has no regular next-chapter link.
    content1 = panduan(soup)
    i += 1
    print("第%d章下载完成" % i)
    if not content1:
        break
    # Resolve the next URL only when a next chapter exists, so a missing
    # link on the final page cannot raise after the chapter has been saved.
    url = nextcontent(soup)
    time.sleep(0.2)  # short pause between requests
    soup = gethtml(url)
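# Next article: building on this one, a crawler that downloads every novel
# listed on a given page.
# (This article is for technical reference only; do not use it for any
# illegal purpose.)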