Due to copyright concerns, the specific website is not named here.
Issues encountered
- When requesting every chapter page one after another, call response.close() on each response; otherwise too many connections to the server stay open and new connections cannot be made.
- When saving images with urllib.request (urlretrieve), the image URL must not contain Chinese characters; escape it first with request.quote (see the sketch after this list):
request.quote(url, safe=";/?:@&=+$,", encoding="utf-8")
For details see https://blog.csdn.net/a12355556/article/details/113726856
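Both issues boil down to two small habits. Here is a minimal sketch of them; the helper names save_image and fetch_pages are illustrative only and not part of the scripts below:
from urllib import request
import requests

def save_image(img_url, filename):
    # urlretrieve can fail when the URL contains Chinese characters, so
    # percent-encode everything except the URL delimiters before retrieving.
    safe_url = request.quote(img_url, safe=";/?:@&=+$,", encoding="utf-8")
    request.urlretrieve(safe_url, filename)

def fetch_pages(page_urls):
    # Close every response explicitly when looping over many chapter pages,
    # otherwise open connections pile up and the server refuses new ones.
    pages = []
    for page_url in page_urls:
        resp = requests.get(page_url)
        pages.append(resp.content.decode('gbk'))
        resp.close()
    return pages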
Single-threaded scraping
Use regular expressions to pull each chapter's name, its URL, and all of its image URLs from the table-of-contents page, then save the images with urllib.request.
Code
import requests, re, os
from urllib import request

novel_name = ''
def get_chapters(url):
    # Download the table-of-contents page; the site serves GBK-encoded HTML.
    r = requests.get(url).content.decode('gbk')
    novel_name = re.findall(r"<td colspan='2'>(.*?)</td>", r)[0]
    if not os.path.exists(novel_name):
        os.mkdir(novel_name)
    # Every chapter link appears several times on the page, so keep every 4th match.
    chapters_urls = ['http://comic.kkkkdm.com' + i
                     for i in re.findall(r'(/comiclist/\d+/\d+/\d+\.htm)', r)][::4]
    names = [i for i in re.findall(r"<A.*?target='_blank'>(.*?)</A>", r) if len(i) > 2]
    for i in range(len(names)):
        num = 0
        ch_name = novel_name + '\\' + names[i]
        if not os.path.exists(ch_name):
            print(names[i] + ' downloading')
            os.mkdir(ch_name)
            try:
                imgs = one_chap_urls(chapters_urls[i])
                for img_url in imgs:
                    print(img_url)
                    save(ch_name + '\\{}.png'.format(str(num)), img_url)
                    num += 1
                print(ch_name + ' finished')
            except:
                print(names[i] + ' failed')
                continue
def one_chap_urls(url):
    # Collect the image URL of every page in one chapter.
    imgs = []
    r = requests.get(url).content.decode('gbk')
    num = int(re.search(r'共(\d+)页', r).group(1))  # the page states "共N页" (N pages in total)
    urls = [re.sub(r'\d+\.htm', '', url) + '{}.htm'.format(str(i)) for i in range(1, num + 1)]
    for url in urls:
        try:
            r = requests.get(url).content.decode('gbk')
            # The page splices the image path with a JS variable; substitute it back.
            img = 'http://ss1.kkkkdm.com' + re.findall(r'IMG SRC=(.*?)>', r)[0].replace('"+m201304d+"', '/').replace("'", '')
            imgs.append(img)
        except:
            continue
    return imgs
def save(name, url):
    try:
        if not os.path.exists(name):
            request.urlretrieve(url, name)
    except:
        # Retry with the URL percent-encoded in case it contains Chinese characters.
        url = request.quote(url, safe=";/?:@&=+$,", encoding="utf-8")
        if not os.path.exists(name):
            request.urlretrieve(url, name)
if __name__ == '__main__':
    url = input("Enter the URL of the comic's table of contents: ")
    get_chapters(url)
Multi-threaded scraping
First, every chapter URL is put into the chapter_queue queue. Producer threads take a chapter URL from it, parse the chapter pages to collect the image URLs that need downloading, and push those into the img_queue queue; consumer threads take URLs out of img_queue and download them as image files. A stripped-down sketch of this producer-consumer layout follows, with the full script after it.
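In the sketch below, parse_chapter and download are placeholders standing in for the regex parsing and urlretrieve calls in the real script, and the thread counts are illustrative:
import threading
from queue import Queue, Empty

chapter_queue = Queue()   # filled with (chapter_url, chapter_name) tuples by main
img_queue = Queue()       # filled with (img_url, filename) tuples by the producers

def parse_chapter(chapter):
    return []             # placeholder for the regex parsing in the script below

def download(img):
    pass                  # placeholder for the urlretrieve call in the script below

def producer():
    while True:
        try:
            chapter = chapter_queue.get(block=False)
        except Empty:
            return                        # no chapters left to parse
        for img in parse_chapter(chapter):
            img_queue.put(img)

def consumer():
    while True:
        try:
            img = img_queue.get(timeout=20)
        except Empty:
            if chapter_queue.empty():
                return                    # producers finished and queue drained
            continue
        download(img)

for _ in range(4):
    threading.Thread(target=producer).start()
for _ in range(8):
    threading.Thread(target=consumer).start()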
Code
from urllib import request
import os, re, time, requests, threading
from queue import Queue, Empty
class Producer(threading.Thread):
    def __init__(self, chapter_queue, img_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.chapter_queue = chapter_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            try:
                url, name = self.chapter_queue.get(block=False)
            except Empty:
                break  # no chapters left to parse
            self.parse_chapter(url, name)

    def parse_chapter(self, url, name):
        # Resolve every image URL of one chapter and queue it for the consumers.
        n = 0
        ch_name = os.getcwd() + '\\' + novel_name + '\\' + name + '\\'
        if not os.path.exists(ch_name):
            os.mkdir(ch_name)
        try:
            imgs = self.one_chap_urls(url)
            for img_url in imgs:
                img_name = ch_name + '{}.png'.format(str(n))
                self.img_queue.put((img_url, img_name))
                n += 1
        except Exception as e:
            print(e)
    def one_chap_urls(self, url):
        # Collect the image URL of every page in one chapter.
        imgs = []
        response = requests.get(url)
        r = response.content.decode('gbk')
        response.close()  # release the connection right away
        num = int(re.search(r'共(\d+)页', r).group(1))  # the page states "共N页" (N pages in total)
        urls = [re.sub(r'\d+\.htm', '', url) + '{}.htm'.format(str(i)) for i in range(1, num + 1)]
        for url in urls:
            try:
                r1 = requests.get(url)
                page = r1.content.decode('gbk')
                r1.close()
                # The page splices the image path with a JS variable; substitute it back.
                img = 'http://ss1.kkkkdm.com' + re.findall(r'IMG SRC=(.*?)>', page)[0].replace('"+m201304d+"', '/').replace("'", '')
                imgs.append(img)
            except:
                continue
        return imgs
class Consumer(threading.Thread):
    def __init__(self, chapter_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.chapter_queue = chapter_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.img_queue.empty():
                time.sleep(20)  # give the producers time to catch up
                if self.chapter_queue.empty() and self.img_queue.empty():
                    print('queues empty, consumer exiting')
                    return
                continue
            url, filename = self.img_queue.get(block=True)
            time.sleep(0.1)
            # Percent-encode the URL in case it contains Chinese characters.
            url0 = request.quote(url, safe=";/?:@&=+$,", encoding="utf-8")
            try:
                request.urlretrieve(url0, filename)
                print(filename + ' downloaded')
            except Exception as e:
                print(e)
                print(filename + ' download failed')
                continue
def main():
    global novel_name
    start = time.time()
    chapter_queue = Queue(1000)
    img_queue = Queue(2000)
    url = input("Enter the URL of the comic's table of contents: ")
    response = requests.get(url)
    r = response.content.decode('gbk')
    response.close()
    novel_name = re.findall(r"<td colspan='2'>(.*?)</td>", r)[0]
    if not os.path.exists(novel_name):
        os.mkdir(novel_name)
    # Every chapter link appears several times on the page, so keep every 4th match.
    chapters_urls = ['http://comic.kkkkdm.com' + i
                     for i in re.findall(r'(/comiclist/\d+/\d+/\d+\.htm)', r)][::4]
    names = [i for i in re.findall(r"<A.*?target='_blank'>(.*?)</A>", r) if len(i) > 2]
    print('Number of chapters: ' + str(len(chapters_urls)))
    for num, ur in enumerate(chapters_urls):
        chapter_queue.put((ur, names[num]))
    print('Producers parsing chapter pages')
    for x in range(20):
        Producer(chapter_queue, img_queue).start()
    print('Consumers downloading images')
    for x in range(200):
        Consumer(chapter_queue, img_queue).start()
        time.sleep(1)  # stagger consumer start-up

if __name__ == '__main__':
    print('Find the table of contents at http://comic.kkkkdm.com/')
    main()