# bs4 与 requests 结合:爬取代理 IP,并挂代理池进行目录扫描
def scan(base_url='http://www.baidu.com', dict_path='zidian.txt',
         proxy_path='ip.txt', timeout=10):
    """Probe ``base_url`` for every wordlist path, once through each proxy.

    Each non-empty line of ``dict_path`` is appended to ``base_url`` and
    requested with GET through every proxy listed in ``proxy_path``.  A URL
    is printed each time a request for it answers 200 or 403.

    Args:
        base_url: Prefix prepended to every wordlist entry.
        dict_path: Text file with one path per line.
        proxy_path: Text file with one proxy per line, as ``ip:port`` (the
            format ``getip`` writes) or as a full proxy URL.
        timeout: Seconds to wait for each request before giving up, so one
            dead proxy cannot stall the whole scan.

    Returns:
        None. Results are reported on stdout.
    """
    # NOTE(review): 'Host' is a placeholder that does not match the default
    # base_url; adjust it (or drop it) for the real target.
    header = {
        'Host': 'www.xxx.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate',
        'Content-Type': 'application/x-www-form-urlencoded'
    }

    # Read the proxy list once instead of re-opening it per wordlist entry.
    with open(proxy_path) as proxy_file:
        proxy_addrs = [line.strip() for line in proxy_file if line.strip()]

    with open(dict_path) as dict_file:
        for line in dict_file:
            # strip() removes the trailing newline (and '\r' from Windows
            # files) that would otherwise end up inside the URL.
            path = line.strip()
            if not path:
                continue
            url_dir = base_url + path

            for addr in proxy_addrs:
                proxy_url = addr if '://' in addr else 'http://' + addr
                # requests matches proxies on the lowercase URL scheme, so
                # the keys must be 'http' / 'https', not 'HTTP'.
                proxies = {'http': proxy_url, 'https': proxy_url}
                try:
                    code = requests.get(
                        url_dir,
                        headers=header,
                        proxies=proxies,
                        verify=False,
                        timeout=timeout,
                    ).status_code
                except (requests.exceptions.ConnectionError,
                        requests.exceptions.ChunkedEncodingError):
                    # NOTE(review): the message says "one minute" but the
                    # pause is one second; the text is kept unchanged.
                    print('ConnectionError -- 等待一分钟')
                    time.sleep(1)
                except Exception as exc:
                    # Best effort: one bad proxy or malformed entry must not
                    # abort the scan. Unlike a bare ``except`` this still
                    # lets Ctrl-C through, and it reports the real error
                    # type instead of a fixed "UnboundLocalError".
                    print(type(exc).__name__ + ' -- 等待一分钟')
                    time.sleep(1)
                else:
                    if code in (200, 403):
                        print(url_dir)
def getip(pages=None):
    """Scrape free proxies from kuaidaili.com and append them to ``ip.txt``.

    Pages ``1`` through ``pages`` of the listing are fetched one second
    apart; every IP/port pair found is appended to ``ip.txt`` as one
    ``ip:port`` line.

    Args:
        pages: Number of listing pages to fetch (int or numeric string).
            When omitted, the module-level ``i`` set by the ``__main__``
            block is used, which keeps the original ``getip()`` call working.

    Raises:
        ValueError: If the page count is not an integer.
    """
    if pages is None:
        pages = i  # page count entered at the prompt in the __main__ block
    page_count = int(pages)

    # The original passed the raw input string to range(), which raises
    # TypeError; "how many pages" means pages 1..N.
    for x in range(1, page_count + 1):
        url = 'https://www.kuaidaili.com/free/inha/' + str(x)
        rep = requests.get(url, timeout=10).content
        time.sleep(1)  # pause between page fetches

        soup = BeautifulSoup(rep, "lxml")
        ip = soup.select('td[data-title="IP"]')
        port = soup.select('td[data-title="PORT"]')

        # get_text() always returns a string, whereas .string is None for a
        # cell with nested markup and would break the concatenation.
        rows = [
            ips.get_text(strip=True) + ":" + ports.get_text(strip=True) + '\n'
            for ips, ports in zip(ip, port)
        ]
        if not rows:
            continue  # nothing scraped: do not create or touch ip.txt

        # One open per page instead of one per row; closed even on error.
        with open('ip.txt', 'a+') as out:
            out.writelines(rows)
if __name__ == '__main__':
    # Ask how many proxy-list pages to scrape, then scrape and scan.
    # The answer is validated and converted here so that a typo fails right
    # at the prompt instead of deep inside getip(), which reads the
    # module-level ``i``.
    raw_pages = input("需要多少页ip:").strip()
    try:
        i = int(raw_pages)
    except ValueError:
        raise SystemExit('页数必须是整数: ' + repr(raw_pages))
    getip()
    scan()