小说爬虫强制绕过ssl验证

小说爬虫强制绕过ssl验证

import requests
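# Silence urllib3 warnings (such as InsecureRequestWarning), which would
# otherwise be printed because requests below are sent with verify=False.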
import urllib3
urllib3.disable_warnings()

from lxml import etree
from multiprocessing.dummy import Pool

# Proxy mapping passed to requests through ``proxies=``; it is empty here,
# so no proxy is configured.
ip = {}

# HTTP headers sent with every request (a desktop Chrome user-agent string).
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/75.0.3770.80 Safari/537.36'
    ),
}

# Book title; assigned by the script entry point and used by cache_text to
# build the output file name.
name = None

# Chapter cache shared by cache_download (writer) and cache_text (reader):
# key -> [chapter title, chapter text].
# NOTE: this name shadows the builtin ``dict`` for the whole module.
dict = {}

#### Cached download -- chapter list ####
def cache_chapter(list_tag, pool_num):
    """Run one phase of the download over the chapters in ``list_tag``.

    ``pool_num`` is both the worker-thread count and the phase selector:

    * ``88`` -- fetch every chapter with ``cache_download`` on 88 threads;
    * ``1``  -- call ``cache_text`` for every chapter on a single worker
      thread, so chapters are handled one at a time in list order.

    Any other value does nothing.

    A chapter whose worker raises is reported on stdout and the remaining
    chapters are still processed.

    Args:
        list_tag: Sequence of ``<dd>`` elements from the book index page.
            Each one must contain an ``<a href>``; the href is appended to
            the module-level ``target`` URL.
        pool_num: Phase selector / thread count (see above).

    Raises:
        IndexError: If one of the processed ``<dd>`` elements has no
            ``<a href>``.
    """
    if pool_num == 88:
        worker = cache_download
    elif pool_num == 1:
        worker = cache_text
    else:
        # Unknown phase: return before creating a pool, so no worker threads
        # are started and then left without close()/join().
        return

    # The first 12 entries are skipped -- presumably a "latest chapters"
    # section that repeats links from the full list (TODO confirm).
    chapter_url_list = [
        target + dd_tag.xpath('./a/@href')[0] for dd_tag in list_tag[12:]
    ]

    pool = Pool(pool_num)
    try:
        results = pool.imap(worker, chapter_url_list)
        # Drain the iterator. imap keeps a worker's exception inside the
        # result and only re-raises it when that result is consumed, so
        # ignoring the iterator would hide every failed chapter.
        for chapter_url in chapter_url_list:
            try:
                next(results)
            except Exception as exc:
                print('[failed] {} {}: {!r}'.format(
                    worker.__name__, chapter_url, exc))
    finally:
        # Always release the worker threads, even if interrupted.
        pool.close()
        pool.join()

#### Cached download -- fetch one chapter (called from the thread pool) ####
def cache_download(url):
    """Fetch one chapter page and store its title and text in the cache.

    The result is stored in the module-level ``dict`` as ``[title, text]``
    under the key ``url[-13:-5]``; ``cache_text`` reads it back with the
    same key. The chapter title is printed as a progress indicator.

    Args:
        url: Absolute chapter URL. The cache key is the 8 characters that
            precede the last 5 characters of the URL (presumably a numeric
            chapter id followed by ``.html`` -- verify against the site).

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx HTTP status.
        IndexError: If the page has no ``h1`` under an element with class
            ``bookname``.
    """
    dictname = url[-13:-5]
    # A timeout is required: requests waits indefinitely by default, and one
    # stalled connection would keep the pool's join() from ever returning.
    chapter_req = requests.get(
        url,
        headers=headers,
        proxies=ip,
        verify=False,
        timeout=(10, 30),  # (connect, read) seconds
    )
    # Fail on HTTP error responses instead of parsing them as chapter text.
    chapter_req.raise_for_status()
    # Set before .text is read so the body is decoded as GBK.
    chapter_req.encoding = 'gbk'
    chapter_soup = etree.HTML(chapter_req.text)
    content_name = chapter_soup.xpath('//*[@class="bookname"]/h1/text()')[0]
    # The chapter body is spread over several text nodes; join them as-is.
    content_text = ''.join(chapter_soup.xpath('//*[@id="content"]/text()'))
    dict[dictname] = [content_name, content_text]
    print(content_name)

#### Cached download -- append the downloaded chapters to the file in order ####
def cache_text(url):
    """Append the cached chapter for ``url`` to ``<name>.txt``.

    Looks up the ``[title, text]`` entry stored in the module-level ``dict``
    under ``url[-13:-5]`` and appends the title and then the text, each
    followed by a newline, to the UTF-8 file named after the module-level
    ``name``.

    Raises:
        KeyError: If no entry was cached for this URL.
    """
    entry = dict[url[-13:-5]]
    pieces = (entry[0], entry[1])
    # Append mode: every call adds one chapter to the end of the same file.
    with open(name + '.txt', mode='a', encoding='utf-8') as out:
        for piece in pieces:
            out.write(piece + '\n')
		
#### Entry point ####
if __name__ == '__main__':
    # Flow: read the book code, fetch the book index page, then make two
    # passes over the chapter list -- download everything into the in-memory
    # cache, then write the cached chapters to '<title>.txt' in order.
    # NOTE(review): cache_text opens the file in append mode, so running the
    # script again for the same book appends a second copy to the file.
    print('仅支持:https://www.45zw.la/'+'\n')
    # Strip surrounding whitespace so a pasted code does not break the URL.
    a = input('输入要下载的链接码:').strip()
    # ``target`` and ``name`` are module globals read by cache_chapter and
    # cache_text respectively.
    target = 'https://www.45zw.la/txt/' + a + '/'
    # Bounded wait: requests has no default timeout and would otherwise hang
    # forever on a stalled connection.
    req_text = requests.get(
        url=target,
        headers=headers,
        proxies=ip,
        verify=False,
        timeout=(10, 30),  # (connect, read) seconds
    )
    req_text.encoding = 'gbk'
    soup = etree.HTML(req_text.text)
    list_tag = soup.xpath('//*[@id="list"]/dl/dd')
    titles = soup.xpath('//*[@id="info"]/h1/text()')
    if not titles:
        # No title on the page (e.g. wrong code or an error page): stop with
        # a readable message instead of an IndexError traceback.
        raise SystemExit('未找到小说信息, 请检查链接码: ' + a)
    name = titles[0]
    print('\n'+'你正在下载的是: '+name)
    # Pass 1: 88 selects the multi-threaded download phase.
    pool_num = 88
    cache_chapter(list_tag, pool_num)
    # Pass 2: 1 selects the single-threaded, in-order write phase.
    pool_num = 1
    cache_chapter(list_tag, pool_num)
    print('\n'+'....下载完成....')
上一篇:OSTEP Chapter 4. The Abstraction: The Process


下一篇:WebGIS中一种根据网格索引判断点面关系的方法