Novel scraper: forcibly bypassing SSL verification
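The bypass itself comes down to two things: pass verify=False to every requests call so certificate verification is skipped, and call urllib3.disable_warnings() once so the InsecureRequestWarning that would otherwise be printed for each request is suppressed. A minimal sketch of just that part (the URL here is a placeholder, not the target site used below):

import requests
import urllib3

# Silence the InsecureRequestWarning raised when certificate checks are skipped
urllib3.disable_warnings()

# verify=False disables SSL certificate verification for this request
resp = requests.get('https://example.com/', verify=False)
print(resp.status_code)

The full scraper below applies the same pattern to every request it makes.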
import requests
# Suppress the InsecureRequestWarning that urllib3 emits when verify=False is used
import urllib3
urllib3.disable_warnings()
from lxml import etree
from multiprocessing.dummy import Pool  # thread pool

# Proxy settings; leave empty to connect directly
ip = {
}
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}
name = None         # novel title, filled in at runtime
chapter_cache = {}  # chapter id -> [chapter title, chapter text]
#### Cached download -- build the chapter list ####
def cache_chapter(list_tag, pool_num):
    pool = Pool(pool_num)
    chapter_url_list = []
    for dd_tag in list_tag[12:]:  # skip the first 12 <dd> entries (presumably the site's "latest chapters" block)
        chapter_url = target + dd_tag.xpath('./a/@href')[0]
        chapter_url_list.append(chapter_url)
    if pool_num == 88:
        # first pass: download every chapter concurrently into the in-memory cache
        pool.imap(cache_download, chapter_url_list)
        pool.close()
        pool.join()
    elif pool_num == 1:
        # second pass: write the cached chapters to disk in their original order
        pool.imap(cache_text, chapter_url_list)
        pool.close()
        pool.join()
#### Cached download -- fetch a single chapter (runs in worker threads) ####
def cache_download(url):
    dictname = url[-13:-5]  # chapter id sliced from the URL, used as the cache key
    chapter_req = requests.get(url, headers=headers, proxies=ip, verify=False)
    chapter_req.encoding = 'gbk'
    chapter_soup = etree.HTML(chapter_req.text)
    content_name = chapter_soup.xpath('//*[@class="bookname"]/h1/text()')[0]
    content_text = chapter_soup.xpath('//*[@id="content"]/text()')
    content_text = ''.join(content_text)
    chapter_cache[dictname] = [content_name, content_text]
    print(content_name)
#### Cached download -- merge the downloaded chapters in order ####
def cache_text(url):
    dictname = url[-13:-5]
    content_name = chapter_cache[dictname][0]
    content_text = chapter_cache[dictname][1]
    with open(name + '.txt', 'a', encoding='utf-8') as f:
        f.write(content_name + '\n')
        f.write(content_text + '\n')
#### Main ####
if __name__ == '__main__':
    print('Only supports: https://www.45zw.la/' + '\n')
    a = input('Enter the link code to download: ')
    target = 'https://www.45zw.la/txt/' + a + '/'
    req_text = requests.get(url=target, headers=headers, proxies=ip, verify=False)
    req_text.encoding = 'gbk'
    soup = etree.HTML(req_text.text)
    list_tag = soup.xpath('//*[@id="list"]/dl/dd')
    name = soup.xpath('//*[@id="info"]/h1/text()')[0]
    print('\n' + 'You are downloading: ' + name)
    # first pass: 88 worker threads fetch every chapter into the cache
    pool_num = 88
    cache_chapter(list_tag, pool_num)
    # second pass: a single thread writes the chapters to <name>.txt in order
    pool_num = 1
    cache_chapter(list_tag, pool_num)
    print('\n' + '....Download complete....')
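If silencing every urllib3 warning is broader than needed, disable_warnings() also accepts a warning class, so only the InsecureRequestWarning triggered by verify=False is suppressed; a small variation on the setup above:

import urllib3
from urllib3.exceptions import InsecureRequestWarning

# Suppress only the warning caused by verify=False; other urllib3 warnings still show
urllib3.disable_warnings(InsecureRequestWarning)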